about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Bobby <[email protected]> 2022-09-01 19:51:18 -0400
committer Bobby <[email protected]> 2022-09-01 19:51:18 -0400
commit 98024bb46f3f441e11f7d531837ca765309817cf (patch)
tree e538c7ae20a12fb42bf4ae53f85d3d84f861f23e /src
parent 9740e3756b7dc91808295341a02274350fb6c05d (diff)
download edify-98024bb46f3f441e11f7d531837ca765309817cf.tar.xz
edify-98024bb46f3f441e11f7d531837ca765309817cf.zip
Regex Builder class
Diffstat (limited to 'src')
-rw-r--r-- src/edify/builder/abc.py 6
-rw-r--r-- src/edify/builder/builder.py 186
-rw-r--r-- src/edify/builder/constants.py 10
-rw-r--r-- src/edify/builder/escaped.py 9
-rw-r--r-- src/edify/builder/helpers/core.py 24
-rw-r--r-- src/edify/builder/helpers/t.py 2
-rw-r--r-- src/edify/builder/quantifier.py 58
7 files changed, 191 insertions, 104 deletions
diff --git a/src/edify/builder/abc.py b/src/edify/builder/abc.py
deleted file mode 100644
index 4d6b088..0000000
--- a/src/edify/builder/abc.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from abc import ABC
-
-
-class Builder(ABC):
- def __str__(self):
- return self.build()
diff --git a/src/edify/builder/builder.py b/src/edify/builder/builder.py
index 6213bb6..0914920 100644
--- a/src/edify/builder/builder.py
+++ b/src/edify/builder/builder.py
@@ -23,9 +23,10 @@ from .helpers.core import apply_subexpression_defaults
from .helpers.core import assertion
from .helpers.core import create_stack_frame
from .helpers.core import deep_copy
-from .helpers.core import escape_special
+from .helpers.core import escape_special, fuse_elements
from .helpers.regex_vars import named_group_regex
from .helpers.t import t
+from .helpers.quantifiers import quantifier_table
class RegexBuilder:
@@ -39,46 +40,52 @@ class RegexBuilder:
'has_defined_start': False,
'has_defined_end': False,
'flags': {
- 'g': False,
- 'y': False,
- 'm': False,
- 'i': False,
- 'u': False,
- 's': False,
+ 'A': False,
+ 'DEBUG': False,
+ 'L': False,
+ 'I': False,
+ 'M': False,
+ 'S': False,
+ 'V': False,
},
'stack': create_stack_frame(t['root']),
'named_groups': [],
'total_capture_groups': 0,
}
- def allow_multiple_matches(self):
+ def ascii_only(self):
next = clone(self)
- next.state['stack']['flags']['g'] = True
+ next.state['stack']['flags']['A'] = True
return next
- def sticky(self):
+ def debug(self):
next = clone(self)
- next.state['stack']['flags']['y'] = True
+ next.state['stack']['flags']['DEBUG'] = True
return next
- def line_by_line(self):
+ def locale(self):
next = clone(self)
- next.state['stack']['flags']['m'] = True
+ next.state['stack']['flags']['L'] = True
return next
- def case_insensitive(self):
+ def ignore_case(self):
next = clone(self)
- next.state['stack']['flags']['i'] = True
+ next.state['stack']['flags']['I'] = True
return next
- def unicode(self):
+ def multi_line(self):
next = clone(self)
- next.state['stack']['flags']['u'] = True
+ next.state['stack']['flags']['M'] = True
return next
- def single_line(self):
+ def dot_all(self):
next = clone(self)
- next.state['stack']['flags']['s'] = True
+ next.state['stack']['flags']['S'] = True
+ return next
+
+ def verbose(self):
+ next = clone(self)
+ next.state['stack']['flags']['V'] = True
return next
def get_current_frame(self):
@@ -365,7 +372,7 @@ class RegexBuilder:
if next_el['contains_child']:
next_el['value'] = self.merge_subexpression(next_el['value'], options, parent, increment_capture_groups)
elif next_el['contains_children']:
- next_el['value'] = next_el['value'].map(lambda e: self.merge_subexpression(e, options, parent, increment_capture_groups))
+ next_el['value'] = list(map(lambda e: self.merge_subexpression(e, options, parent, increment_capture_groups), next_el['value']))
if next_el['type'] == 'start_of_input':
if options['ignore_start_and_end']:
return t['noop']
@@ -383,4 +390,143 @@ class RegexBuilder:
assertion(isinstance(expr, RegexBuilder), must_be_instance("Expression", expr, "RegexBuilder"))
assertion(len(expr['state']['stack']) == 1, can_not_call_se(expr.get_current_frame()['type']['type']))
options = apply_subexpression_defaults(opts)
+ expr_next = clone(expr)
+ next = clone(self)
+ additional_capture_groups = 0
+ expr_frame = expr_next.get_current_frame()
+
+ def increment_capture_groups():
+ nonlocal additional_capture_groups
+ additional_capture_groups += 1
+ expr_frame['elements'] = list(map(
+ lambda e: self.merge_subexpression(e, options, expr_next, increment_capture_groups), expr_frame['elements']))
+ next.state['total_capture_groups'] += additional_capture_groups
+ if not options['ignore_flags']:
+ for flag_name, enabled in expr_next.state['flags'].items():
+ next.state['flags'][flag_name] = enabled or next.state['flags'][flag_name]
+ current_frame = next.get_current_frame()
+ current_frame['elements'].append(next.apply_quantifier(t['subexpression'](expr_frame['elements'])))
+ return next
+ def evaluate(self, el):
+ if el['type'] == 'noop':
+ return ''
+ if el['type'] == 'any_char':
+ return '.'
+ if el['type'] == 'whitespace_char':
+ return '\s'
+ if el['type'] == 'non_whitespace_char':
+ return '\S'
+ if el['type'] == 'digit':
+ return '\d'
+ if el['type'] == 'non_digit':
+ return '\D'
+ if el['type'] == 'word':
+ return '\w'
+ if el['type'] == 'non_word':
+ return '\W'
+ if el['type'] == 'word_boundary':
+ return '\\b'
+ if el['type'] == 'non_word_boundary':
+ return '\B'
+ if el['type'] == 'start_of_input':
+ return '^'
+ if el['type'] == 'end_of_input':
+ return '$'
+ if el['type'] == 'newline':
+ return '\\n'
+ if el['type'] == 'carriage_return':
+ return '\\r'
+ if el['type'] == 'tab':
+ return '\\t'
+ if el['type'] == 'null_byte':
+ return '\\0'
+ if el['type'] == 'string':
+ return el['value']
+ if el['type'] == 'char':
+ return el['value']
+ if el['type'] == 'range':
+ return '[{}-{}]'.format(el['value'][0], el['value'][1])
+ if el['type'] == 'anything_but_range':
+ return '[^{}-{}]'.format(el['value'][0], el['value'][1])
+ if el['type'] == 'any_of_chars':
+ return '[' + ''.join(el['value']) + ']'
+ if el['type'] == 'anything_but_chars':
+ return '[^' + ''.join(el['value']) + ']'
+ if el['type'] == 'named_back_reference':
+ return '\\k<{}>'.format(el['name'])
+ if el['type'] == 'back_reference':
+ return '\\{}'.format(el['index'])
+ if el['type'] == 'subexpression':
+ return ''.join(map(lambda e: self.evaluate(e), el['value']))
+ cg1 = ['optional', 'zero_or_more', 'zero_or_more_lazy', 'one_or_more', 'one_or_more_lazy']
+ if el['type'] in cg1:
+ inner = self.evaluate(el['value'])
+ with_group = "(?:{})".format(inner) if el['value']['quantifiers_require_group'] else inner
+ symbol = quantifier_table[el['type']]
+ return '{}{}'.format(with_group, symbol)
+ cg2 = ['between', 'between_lazy', 'at_least', 'exactly']
+ if el['type'] in cg2:
+ inner = self.evaluate(el['value'])
+ with_group = "(?:{})".format(inner) if el['value']['quantifiers_require_group'] else inner
+ return '{}{}'.format(with_group, quantifier_table[el['type']](el['times']))
+ if el['type'] == 'anything_but_string':
+ chars = ''.join(map(lambda c: '[^{}]'.format(c), el['value']))
+ return '(?:{})'.format(chars)
+ if el['type'] == 'assert_ahead':
+ evaluated = ''.join(map(lambda e: self.evaluate(e), el['value']))
+ return '(?={})'.format(evaluated)
+ if el['type'] == 'assert_behind':
+ evaluated = ''.join(map(lambda e: self.evaluate(e), el['value']))
+ return '(?<={})'.format(evaluated)
+ if el['type'] == 'assert_not_ahead':
+ evaluated = ''.join(map(lambda e: self.evaluate(e), el['value']))
+ return '(?!{})'.format(evaluated)
+ if el['type'] == 'assert_not_behind':
+ evaluated = ''.join(map(lambda e: self.evaluate(e), el['value']))
+ return '(?<!{})'.format(evaluated)
+ if el['type'] == 'any_of':
+ [fused, rest] = fuse_elements(el['value'])
+ if len(rest) == 0:
+ return '[{}]'.format(fused)
+ evaluated_rest = list(map(lambda e: self.evaluate(e), rest))
+ separator = '|' if len(evaluated_rest) > 0 and len(fused) > 0 else ''
+ return '(?:{}{}{})'.format('|'.join(evaluated_rest), separator, '[{}]'.format(fused) if fused else '')
+ if el['type'] == 'capture':
+ evaluated = ''.join(map(lambda e: self.evaluate(e), el['value']))
+ return evaluated
+ if el['type'] == 'named_capture':
+ evaluated = ''.join(map(lambda e: self.evaluate(e), el['value']))
+ return '(?P<{}>{})'.format(el['name'], evaluated)
+ if el['type'] == 'group':
+ evaluated = ''.join(map(lambda e: self.evaluate(e), el['value']))
+ return '(?:{})'.format(evaluated)
+
+ raise Exception('Can not process unsupported element type: {}'.format(el['type']))
+
+
+ def get_regex_patterns_and_flags(self):
+ assertion(len(self.state['stack']) == 1, can_not_call_se(self.get_current_frame()['type']['type']))
+ pattern = "".join(list(map(lambda e: self.evaluate(e), self.get_current_element_array())))
+ flags = ""
+ for flag_name, enabled in self.state['flags'].items():
+ if enabled:
+ flags += flag_name
+ pattern = "(?:)" if pattern == "" else pattern
+ flags = "".join(sorted(flags))
+ return pattern, flags
+
+ def to_regex_string(self):
+ patterns, flags = self.get_regex_patterns_and_flags()
+ return '/{}/{}'.format(str(patterns), str(flags))
+
+ def to_regex(self):
+ patterns, flags = self.get_regex_patterns_and_flags()
+ flag = None
+ if flags != '':
+ for flag_name in flags:
+ if flag is None:
+ flag = getattr(re, flag_name)
+ else:
+ flag |= getattr(re, flag_name)
+ return re.compile(patterns, flag)
diff --git a/src/edify/builder/constants.py b/src/edify/builder/constants.py
deleted file mode 100644
index c1e7f68..0000000
--- a/src/edify/builder/constants.py
+++ /dev/null
@@ -1,10 +0,0 @@
-ANY = "."
-WHITESPACE = "\\s"
-NON_WHITESPACE = "\\S"
-DIGIT = "\\d"
-NON_DIGIT = "\\D"
-WORD = "\\w"
-NON_WORD = "\\W"
-NEWLINE = "\\n"
-TAB = "\\t"
-NULL = "\\0"
diff --git a/src/edify/builder/escaped.py b/src/edify/builder/escaped.py
deleted file mode 100644
index b081706..0000000
--- a/src/edify/builder/escaped.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from .abc import Builder
-
-
-class Escaped(Builder):
- def __init__(self, target):
- self.target = target
-
- def build(self):
- return f"\\{self.target}"
diff --git a/src/edify/builder/helpers/core.py b/src/edify/builder/helpers/core.py
index 0788e11..1dea2a0 100644
--- a/src/edify/builder/helpers/core.py
+++ b/src/edify/builder/helpers/core.py
@@ -50,3 +50,27 @@ def apply_subexpression_defaults(expr):
assertion(type(out['ignore_flags']) == bool, 'ignore_flags must be a boolean')
assertion(type(out['ignore_start_and_end']) == bool, 'ignore_start_and_end must be a boolean')
return out
+
+
+def is_fusable(element):
+ return element['type'] == 'range' or element['type'] == 'char' or element['type'] == 'any_of_chars'
+
+
+def partition(pred, a):
+ result = [[], []]
+ for cur in a:
+ if pred(cur):
+ result[0].append(cur)
+ else:
+ result[1].append(cur)
+ return result
+
+
+def fuse_elements(elements):
+ [fusables, rest] = partition(is_fusable, elements)
+ def map_el(el):
+ if el['type'] == 'char' or el['type'] == 'any_of_chars':
+ return el['value']
+ return '{}-{}'.format(el['value'][0], el['value'][1])
+ fused = ''.join(map(map_el, fusables))
+ return [fused, rest]
diff --git a/src/edify/builder/helpers/t.py b/src/edify/builder/helpers/t.py
index eb08763..ab3304c 100644
--- a/src/edify/builder/helpers/t.py
+++ b/src/edify/builder/helpers/t.py
@@ -29,7 +29,7 @@ t = {
'named_back_reference': lambda name: deferred_type('named_back_reference', {'name': name}),
'back_reference': lambda index: deferred_type('back_reference', {'index': index}),
'capture': deferred_type('capture', {'contains_children': True}),
- 'sub_expression': as_type('sub_expression', {'contains_children': True, 'quantifiers_require_group': True}),
+ 'subexpression': as_type('subexpression', {'contains_children': True, 'quantifiers_require_group': True}),
'named_capture': lambda name: deferred_type('named_capture', {'name': name, 'contains_children': True}),
'group': deferred_type('group', {'contains_children': True}),
'any_of': deferred_type('any_of', {'contains_children': True}),
diff --git a/src/edify/builder/quantifier.py b/src/edify/builder/quantifier.py
deleted file mode 100644
index 9cfc031..0000000
--- a/src/edify/builder/quantifier.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from .abc import Builder
-
-
-class Quantifier(Builder):
- def __init__(self, target):
- self.target = target
-
-
-class Optional(Quantifier):
- def build(self):
- return f"{self.target}?"
-
-
-class ZeroOrMore(Quantifier):
- def build(self):
- return f"{self.target}*"
-
-
-class OneOrMore(Quantifier):
- def build(self):
- return f"{self.target}+"
-
-
-class Exact(Quantifier):
- def __init__(self, target, count):
- super().__init__(target)
- self.count = count
-
- def build(self):
- return f"{self.target}{{{self.count}}}"
-
-
-class Range(Quantifier):
- def __init__(self, target, min, max):
- super().__init__(target)
- self.min = min
- self.max = max
-
- def build(self):
- return f"{self.target}{{{self.min},{self.max}}}"
-
-
-class AtLeast(Quantifier):
- def __init__(self, target, min):
- super().__init__(target)
- self.min = min
-
- def build(self):
- return f"{self.target}{{{self.min},}}"
-
-
-class AtMost(Quantifier):
- def __init__(self, target, max):
- super().__init__(target)
- self.max = max
-
- def build(self):
- return f"{self.target}{{,{self.max}}}"