Skip to content

Commit 35d2de6

Browse files
committed
add type annotations for regex
1 parent 1c50959 commit 35d2de6

File tree

4 files changed

+168
-159
lines changed

4 files changed

+168
-159
lines changed

pyformlang/regular_expression/python_regex.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -2,16 +2,17 @@
22
A class to read Python format regex
33
"""
44

5-
import re
6-
import string
7-
import unicodedata
5+
from typing import Union
6+
from re import compile as comp, Pattern
7+
from string import printable
8+
from unicodedata import lookup
89

9-
# pylint: disable=cyclic-import
10-
from pyformlang.regular_expression import regex, MisformedRegexError
10+
from pyformlang.regular_expression import MisformedRegexError
11+
from pyformlang.regular_expression.regex import Regex
1112
from pyformlang.regular_expression.regex_reader import \
1213
WRONG_PARENTHESIS_MESSAGE
1314

14-
PRINTABLES = list(string.printable)
15+
PRINTABLES = list(printable)
1516

1617
TRANSFORMATIONS = {
1718
"|": "\\|",
@@ -55,7 +56,7 @@
5556
ESCAPED_OCTAL = ["\\0", "\\1", "\\2", "\\3", "\\4", "\\5", "\\6", "\\7"]
5657

5758

58-
class PythonRegex(regex.Regex):
59+
class PythonRegex(Regex):
5960
""" Represents a regular expression as used in Python.
6061
6162
It adds the following features to the basic regex:
@@ -98,11 +99,11 @@ class PythonRegex(regex.Regex):
9899
99100
"""
100101

101-
def __init__(self, python_regex):
102-
if not isinstance(python_regex, str):
103-
python_regex = python_regex.pattern
102+
def __init__(self, python_regex: Union[str, Pattern[str]]) -> None:
103+
if isinstance(python_regex, str):
104+
comp(python_regex) # Check if it is valid
104105
else:
105-
re.compile(python_regex) # Check if it is valid
106+
python_regex = python_regex.pattern
106107

107108
self._python_regex = python_regex
108109
self._replace_shortcuts()
@@ -114,7 +115,7 @@ def __init__(self, python_regex):
114115
self._python_regex = self._python_regex.lstrip('\b')
115116
super().__init__(self._python_regex)
116117

117-
def _separate(self):
118+
def _separate(self) -> None:
118119
regex_temp = []
119120
for symbol in self._python_regex:
120121
if self._should_escape_next_symbol(regex_temp):
@@ -193,7 +194,7 @@ def _recombine(regex_to_recombine):
193194
while regex_to_recombine[idx_end] != "}":
194195
idx_end += 1
195196
name = "".join(regex_to_recombine[idx + 2: idx_end])
196-
name = unicodedata.lookup(name)
197+
name = lookup(name)
197198
temp.append(TRANSFORMATIONS.get(name, name))
198199
idx = idx_end + 1
199200
elif regex_to_recombine[idx] == "\\u":

pyformlang/regular_expression/regex.py

Lines changed: 59 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,17 @@
11
"""
22
Representation of a regular expression
33
"""
4-
from typing import Iterable
5-
6-
from pyformlang import finite_automaton
7-
# pylint: disable=cyclic-import
8-
import pyformlang.regular_expression.regex_objects
9-
from pyformlang import cfg
10-
from pyformlang.finite_automaton import State
11-
# pylint: disable=cyclic-import
4+
5+
from typing import List, Iterable, Tuple, Any
6+
7+
from pyformlang.finite_automaton import Epsilon as FAEpsilon
8+
from pyformlang.finite_automaton import EpsilonNFA, State, Symbol
9+
from pyformlang.cfg.cfg import CFG, Production
10+
from pyformlang.cfg.utils import to_variable
1211
from pyformlang.regular_expression.regex_reader import RegexReader
13-
from pyformlang import regular_expression
12+
from pyformlang.regular_expression.python_regex import PythonRegex
13+
from pyformlang.regular_expression.regex_objects import \
14+
Epsilon as RegexEpsilon, Empty, Concatenation, Union, KleeneStar
1415

1516

1617
class Regex(RegexReader):
@@ -85,16 +86,11 @@ class Regex(RegexReader):
8586
8687
"""
8788

88-
def __init__(self, regex):
89-
self.head = None
90-
self.sons = None
89+
def __init__(self, regex: str) -> None:
9190
super().__init__(regex)
91+
self.sons: List[Regex] = []
9292
self._counter = 0
93-
self._initialize_enfa()
94-
self._enfa = None
95-
96-
def _initialize_enfa(self):
97-
self._enfa = finite_automaton.EpsilonNFA()
93+
self._enfa = EpsilonNFA()
9894

9995
def get_number_symbols(self) -> int:
10096
""" Gives the number of symbols in the regex
@@ -139,7 +135,7 @@ def get_number_operators(self) -> int:
139135
return 1 + sum(son.get_number_operators() for son in self.sons)
140136
return 0
141137

142-
def to_epsilon_nfa(self):
138+
def to_epsilon_nfa(self) -> EpsilonNFA:
143139
""" Transforms the regular expression into an epsilon NFA
144140
145141
Returns
@@ -154,28 +150,28 @@ def to_epsilon_nfa(self):
154150
>>> regex.to_epsilon_nfa()
155151
156152
"""
157-
self._initialize_enfa()
153+
self._enfa = EpsilonNFA()
158154
s_initial = self._set_and_get_initial_state_in_enfa()
159155
s_final = self._set_and_get_final_state_in_enfa()
160156
self._process_to_enfa(s_initial, s_final)
161157
return self._enfa
162158

163-
def _set_and_get_final_state_in_enfa(self):
159+
def _set_and_get_final_state_in_enfa(self) -> State:
164160
s_final = self._get_next_state_enfa()
165161
self._enfa.add_final_state(s_final)
166162
return s_final
167163

168-
def _get_next_state_enfa(self):
169-
s_final = finite_automaton.State(self._counter)
164+
def _get_next_state_enfa(self) -> State:
165+
s_final = State(self._counter)
170166
self._counter += 1
171167
return s_final
172168

173-
def _set_and_get_initial_state_in_enfa(self):
169+
def _set_and_get_initial_state_in_enfa(self) -> State:
174170
s_initial = self._get_next_state_enfa()
175171
self._enfa.add_start_state(s_initial)
176172
return s_initial
177173

178-
def _process_to_enfa(self, s_from: State, s_to: State):
174+
def _process_to_enfa(self, s_from: State, s_to: State) -> None:
179175
""" Internal function to add a regex to a given epsilon NFA
180176
181177
Parameters
@@ -190,29 +186,24 @@ def _process_to_enfa(self, s_from: State, s_to: State):
190186
else:
191187
self._process_to_enfa_when_no_son(s_from, s_to)
192188

193-
def _process_to_enfa_when_no_son(self, s_from, s_to):
194-
if isinstance(self.head,
195-
pyformlang.regular_expression.regex_objects.Epsilon):
189+
def _process_to_enfa_when_no_son(self, s_from: State, s_to: State) -> None:
190+
if isinstance(self.head, RegexEpsilon):
196191
self._add_epsilon_transition_in_enfa_between(s_from, s_to)
197-
elif not isinstance(self.head,
198-
pyformlang.regular_expression.regex_objects.Empty):
199-
symbol = finite_automaton.Symbol(self.head.value)
192+
elif not isinstance(self.head, Empty):
193+
symbol = Symbol(self.head.value)
200194
self._enfa.add_transition(s_from, symbol, s_to)
201195

202-
def _process_to_enfa_when_sons(self, s_from, s_to):
196+
def _process_to_enfa_when_sons(self, s_from: State, s_to: State) -> None:
203197
if isinstance(
204-
self.head,
205-
pyformlang.regular_expression.regex_objects.Concatenation):
198+
self.head, Concatenation):
206199
self._process_to_enfa_concatenation(s_from, s_to)
207-
elif isinstance(self.head,
208-
pyformlang.regular_expression.regex_objects.Union):
200+
elif isinstance(self.head, Union):
209201
self._process_to_enfa_union(s_from, s_to)
210202
elif isinstance(
211-
self.head,
212-
pyformlang.regular_expression.regex_objects.KleeneStar):
203+
self.head, KleeneStar):
213204
self._process_to_enfa_kleene_star(s_from, s_to)
214205

215-
def _process_to_enfa_kleene_star(self, s_from, s_to):
206+
def _process_to_enfa_kleene_star(self, s_from: State, s_to: State) -> None:
216207
# pylint: disable=protected-access
217208
state_first = self._get_next_state_enfa()
218209
state_second = self._get_next_state_enfa()
@@ -222,30 +213,40 @@ def _process_to_enfa_kleene_star(self, s_from, s_to):
222213
self._add_epsilon_transition_in_enfa_between(state_second, s_to)
223214
self._process_to_enfa_son(state_first, state_second, 0)
224215

225-
def _process_to_enfa_union(self, s_from, s_to):
216+
def _process_to_enfa_union(self, s_from: State, s_to: State) -> None:
226217
son_number = 0
227218
self._create_union_branch_in_enfa(s_from, s_to, son_number)
228219
son_number = 1
229220
self._create_union_branch_in_enfa(s_from, s_to, son_number)
230221

231-
def _create_union_branch_in_enfa(self, s_from, s_to, son_number):
222+
def _create_union_branch_in_enfa(self,
223+
s_from: State,
224+
s_to: State,
225+
son_number: int) -> None:
232226
state0 = self._get_next_state_enfa()
233227
state2 = self._get_next_state_enfa()
234228
self._add_epsilon_transition_in_enfa_between(s_from, state0)
235229
self._add_epsilon_transition_in_enfa_between(state2, s_to)
236230
self._process_to_enfa_son(state0, state2, son_number)
237231

238-
def _process_to_enfa_concatenation(self, s_from, s_to):
232+
def _process_to_enfa_concatenation(self,
233+
s_from: State,
234+
s_to: State) -> None:
239235
state0 = self._get_next_state_enfa()
240236
state1 = self._get_next_state_enfa()
241237
self._add_epsilon_transition_in_enfa_between(state0, state1)
242238
self._process_to_enfa_son(s_from, state0, 0)
243239
self._process_to_enfa_son(state1, s_to, 1)
244240

245-
def _add_epsilon_transition_in_enfa_between(self, state0, state1):
246-
self._enfa.add_transition(state0, finite_automaton.Epsilon(), state1)
241+
def _add_epsilon_transition_in_enfa_between(self,
242+
state0: State,
243+
state1: State) -> None:
244+
self._enfa.add_transition(state0, FAEpsilon(), state1)
247245

248-
def _process_to_enfa_son(self, s_from, s_to, index_son):
246+
def _process_to_enfa_son(self,
247+
s_from: State,
248+
s_to: State,
249+
index_son: int) -> None:
249250
# pylint: disable=protected-access
250251
self.sons[index_son]._counter = self._counter
251252
self.sons[index_son]._enfa = self._enfa
@@ -280,7 +281,7 @@ def get_tree_str(self, depth: int = 0) -> str:
280281
temp += son.get_tree_str(depth + 1)
281282
return temp
282283

283-
def to_cfg(self, starting_symbol="S") -> "CFG":
284+
def to_cfg(self, starting_symbol: str = "S") -> CFG:
284285
"""
285286
Turns the regex into a context-free grammar
286287
@@ -304,11 +305,12 @@ def to_cfg(self, starting_symbol="S") -> "CFG":
304305
305306
"""
306307
productions, _ = self._get_production(starting_symbol)
307-
cfg_res = cfg.CFG(start_symbol=cfg.utils.to_variable(starting_symbol),
308+
cfg_res = CFG(start_symbol=to_variable(starting_symbol),
308309
productions=set(productions))
309310
return cfg_res
310311

311-
def _get_production(self, current_symbol, count=0):
312+
def _get_production(self, current_symbol: Any, count: int = 0) \
313+
-> Tuple[List[Production], int]:
312314
next_symbols = []
313315
next_productions = []
314316
for son in self.sons:
@@ -322,7 +324,7 @@ def _get_production(self, current_symbol, count=0):
322324
next_productions += new_prods
323325
return next_productions, count
324326

325-
def __repr__(self):
327+
def __repr__(self) -> str:
326328
return self.head.get_str_repr([str(son) for son in self.sons])
327329

328330
def union(self, other: "Regex") -> "Regex":
@@ -357,11 +359,11 @@ def union(self, other: "Regex") -> "Regex":
357359
358360
"""
359361
regex = Regex("")
360-
regex.head = pyformlang.regular_expression.regex_objects.Union()
362+
regex.head = Union()
361363
regex.sons = [self, other]
362364
return regex
363365

364-
def __or__(self, other):
366+
def __or__(self, other: "Regex") -> "Regex":
365367
""" Makes the union with another regex
366368
367369
Parameters
@@ -427,12 +429,11 @@ def concatenate(self, other: "Regex") -> "Regex":
427429
True
428430
"""
429431
regex = Regex("")
430-
regex.head = \
431-
pyformlang.regular_expression.regex_objects.Concatenation()
432+
regex.head = Concatenation()
432433
regex.sons = [self, other]
433434
return regex
434435

435-
def __add__(self, other):
436+
def __add__(self, other: "Regex") -> "Regex":
436437
""" Concatenates a regular expression with an other one
437438
438439
Parameters
@@ -485,11 +486,11 @@ def kleene_star(self) -> "Regex":
485486
486487
"""
487488
regex = Regex("")
488-
regex.head = pyformlang.regular_expression.regex_objects.KleeneStar()
489+
regex.head = KleeneStar()
489490
regex.sons = [self]
490491
return regex
491492

492-
def from_string(self, regex_str: str):
493+
def from_string(self, regex_str: str) -> "Regex":
493494
""" Construct a regex from a string. For internal usage.
494495
495496
Equivalent to the constructor of Regex
@@ -515,7 +516,7 @@ def from_string(self, regex_str: str):
515516
"""
516517
return Regex(regex_str)
517518

518-
def accepts(self, word: Iterable[str]) -> bool:
519+
def accepts(self, word: Iterable[Any]) -> bool:
519520
"""
520521
Check if a word matches (completely) the regex
521522
@@ -545,7 +546,7 @@ def accepts(self, word: Iterable[str]) -> bool:
545546
return self._enfa.accepts(word)
546547

547548
@classmethod
548-
def from_python_regex(cls, regex):
549+
def from_python_regex(cls, regex: str) -> PythonRegex:
549550
"""
550551
Creates a regex from a string using the python way to write it.
551552
@@ -570,4 +571,4 @@ def from_python_regex(cls, regex):
570571
>>> Regex.from_python_regex("a+[cd]")
571572
572573
"""
573-
return regular_expression.PythonRegex(regex)
574+
return PythonRegex(regex)

0 commit comments

Comments
 (0)