diff options
| author | Bobby <[email protected]> | 2022-09-01 19:51:18 -0400 |
|---|---|---|
| committer | Bobby <[email protected]> | 2022-09-01 19:51:18 -0400 |
| commit | 98024bb46f3f441e11f7d531837ca765309817cf (patch) | |
| tree | e538c7ae20a12fb42bf4ae53f85d3d84f861f23e /src | |
| parent | 9740e3756b7dc91808295341a02274350fb6c05d (diff) | |
| download | edify-98024bb46f3f441e11f7d531837ca765309817cf.tar.xz edify-98024bb46f3f441e11f7d531837ca765309817cf.zip | |
Regex Builder class
Diffstat (limited to 'src')
| -rw-r--r-- | src/edify/builder/abc.py | 6 | ||||
| -rw-r--r-- | src/edify/builder/builder.py | 186 | ||||
| -rw-r--r-- | src/edify/builder/constants.py | 10 | ||||
| -rw-r--r-- | src/edify/builder/escaped.py | 9 | ||||
| -rw-r--r-- | src/edify/builder/helpers/core.py | 24 | ||||
| -rw-r--r-- | src/edify/builder/helpers/t.py | 2 | ||||
| -rw-r--r-- | src/edify/builder/quantifier.py | 58 |
7 files changed, 191 insertions, 104 deletions
diff --git a/src/edify/builder/abc.py b/src/edify/builder/abc.py deleted file mode 100644 index 4d6b088..0000000 --- a/src/edify/builder/abc.py +++ /dev/null @@ -1,6 +0,0 @@ -from abc import ABC - - -class Builder(ABC): - def __str__(self): - return self.build() diff --git a/src/edify/builder/builder.py b/src/edify/builder/builder.py index 6213bb6..0914920 100644 --- a/src/edify/builder/builder.py +++ b/src/edify/builder/builder.py @@ -23,9 +23,10 @@ from .helpers.core import apply_subexpression_defaults from .helpers.core import assertion from .helpers.core import create_stack_frame from .helpers.core import deep_copy -from .helpers.core import escape_special +from .helpers.core import escape_special, fuse_elements from .helpers.regex_vars import named_group_regex from .helpers.t import t +from .helpers.quantifiers import quantifier_table class RegexBuilder: @@ -39,46 +40,52 @@ class RegexBuilder: 'has_defined_start': False, 'has_defined_end': False, 'flags': { - 'g': False, - 'y': False, - 'm': False, - 'i': False, - 'u': False, - 's': False, + 'A': False, + 'DEBUG': False, + 'L': False, + 'I': False, + 'M': False, + 'S': False, + 'V': False, }, 'stack': create_stack_frame(t['root']), 'named_groups': [], 'total_capture_groups': 0, } - def allow_multiple_matches(self): + def ascii_only(self): next = clone(self) - next.state['stack']['flags']['g'] = True + next.state['stack']['flags']['A'] = True return next - def sticky(self): + def debug(self): next = clone(self) - next.state['stack']['flags']['y'] = True + next.state['stack']['flags']['DEBUG'] = True return next - def line_by_line(self): + def locale(self): next = clone(self) - next.state['stack']['flags']['m'] = True + next.state['stack']['flags']['L'] = True return next - def case_insensitive(self): + def ignore_case(self): next = clone(self) - next.state['stack']['flags']['i'] = True + next.state['stack']['flags']['I'] = True return next - def unicode(self): + def multi_line(self): next = clone(self) - next.state['stack']['flags']['u'] = True + next.state['stack']['flags']['M'] = True return next - def single_line(self): + def dot_all(self): next = clone(self) - next.state['stack']['flags']['s'] = True + next.state['stack']['flags']['S'] = True + return next + + def verbose(self): + next = clone(self) + next.state['stack']['flags']['V'] = True return next def get_current_frame(self): @@ -365,7 +372,7 @@ class RegexBuilder: if next_el['contains_child']: next_el['value'] = self.merge_subexpression(next_el['value'], options, parent, increment_capture_groups) elif next_el['contains_children']: - next_el['value'] = next_el['value'].map(lambda e: self.merge_subexpression(e, options, parent, increment_capture_groups)) + next_el['value'] = list(map(lambda e: self.merge_subexpression(e, options, parent, increment_capture_groups), next_el['value'])) if next_el['type'] == 'start_of_input': if options['ignore_start_and_end']: return t['noop'] @@ -383,4 +390,143 @@ class RegexBuilder: assertion(isinstance(expr, RegexBuilder), must_be_instance("Expression", expr, "RegexBuilder")) assertion(len(expr['state']['stack']) == 1, can_not_call_se(expr.get_current_frame()['type']['type'])) options = apply_subexpression_defaults(opts) + expr_next = clone(expr) + next = clone(self) + additional_capture_groups = 0 + expr_frame = expr_next.get_current_frame() + + def increment_capture_groups(): + nonlocal additional_capture_groups + additional_capture_groups += 1 + expr_frame['elements'] = list(map( + lambda e: self.merge_subexpression(e, options, expr_next, increment_capture_groups), expr_frame['elements'])) + next.state['total_capture_groups'] += additional_capture_groups + if not options['ignore_flags']: + for flag_name, enabled in expr_next.state['flags'].items(): + next.state['flags'][flag_name] = enabled or next.state['flags'][flag_name] + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(t['subexpression'](expr_frame['elements']))) + return next + def evaluate(self, el): + if el['type'] == 'noop': + return '' + if el['type'] == 'any_char': + return '.' + if el['type'] == 'whitespace_char': + return '\s' + if el['type'] == 'non_whitespace_char': + return '\S' + if el['type'] == 'digit': + return '\d' + if el['type'] == 'non_digit': + return '\D' + if el['type'] == 'word': + return '\w' + if el['type'] == 'non_word': + return '\W' + if el['type'] == 'word_boundary': + return '\\b' + if el['type'] == 'non_word_boundary': + return '\B' + if el['type'] == 'start_of_input': + return '^' + if el['type'] == 'end_of_input': + return '$' + if el['type'] == 'newline': + return '\\n' + if el['type'] == 'carriage_return': + return '\\r' + if el['type'] == 'tab': + return '\\t' + if el['type'] == 'null_byte': + return '\\0' + if el['type'] == 'string': + return el['value'] + if el['type'] == 'char': + return el['value'] + if el['type'] == 'range': + return '[{}-{}]'.format(el['value'][0], el['value'][1]) + if el['type'] == 'anything_but_range': + return '[^{}-{}]'.format(el['value'][0], el['value'][1]) + if el['type'] == 'any_of_chars': + return '[' + ''.join(el['value']) + ']' + if el['type'] == 'anything_but_chars': + return '[^' + ''.join(el['value']) + ']' + if el['type'] == 'named_back_reference': + return '\\k<{}>'.format(el['name']) + if el['type'] == 'back_reference': + return '\\{}'.format(el['index']) + if el['type'] == 'subexpression': + return ''.join(map(lambda e: self.evaluate(e), el['value'])) + cg1 = ['optional', 'zero_or_more', 'zero_or_more_lazy', 'one_or_more', 'one_or_more_lazy'] + if el['type'] in cg1: + inner = self.evaluate(el['value']) + with_group = "(?:{})".format(inner) if el['value']['quantifiers_require_group'] else inner + symbol = quantifier_table[el['type']] + return '{}{}'.format(with_group, symbol) + cg2 = ['between', 'between_lazy', 'at_least', 'exactly'] + if el['type'] in cg2: + inner = self.evaluate(el['value']) + with_group = "(?:{})".format(inner) if el['value']['quantifiers_require_group'] else inner + return '{}{}'.format(with_group, quantifier_table[el['type']](el['times'])) + if el['type'] == 'anything_but_string': + chars = ''.join(map(lambda c: '[^{}]'.format(c), el['value'])) + return '(?:{})'.format(chars) + if el['type'] == 'assert_ahead': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?={})'.format(evaluated) + if el['type'] == 'assert_behind': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?<={})'.format(evaluated) + if el['type'] == 'assert_not_ahead': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?!{})'.format(evaluated) + if el['type'] == 'assert_not_behind': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?<!{})'.format(evaluated) + if el['type'] == 'any_of': + [fused, rest] = fuse_elements(el['value']) + if len(rest) == 0: + return '[{}]'.format(fused) + evaluated_rest = list(map(lambda e: self.evaluate(e), rest)) + separator = '|' if len(evaluated_rest) > 0 and len(fused) > 0 else '' + return '(?:{}{}{})'.format('|'.join(evaluated_rest), separator, '[{}]'.format(fused) if fused else '') + if el['type'] == 'capture': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return evaluated + if el['type'] == 'named_capture': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?P<{}>{})'.format(el['name'], evaluated) + if el['type'] == 'group': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?:{})'.format(evaluated) + + raise Exception('Can not process unsupported element type: {}'.format(el['type'])) + + + def get_regex_patterns_and_flags(self): + assertion(len(self.state['stack']) == 1, can_not_call_se(self.get_current_frame()['type']['type'])) + pattern = "".join(list(map(lambda e: self.evaluate(e), self.get_current_element_array()))) + flags = "" + for flag_name, enabled in self.state['flags'].items(): + if enabled: + flags += flag_name + pattern = "(?:)" if pattern == "" else pattern + flags = "".join(sorted(flags)) + return pattern, flags + + def to_regex_string(self): + patterns, flags = self.get_regex_patterns_and_flags() + return '/{}/{}'.format(str(patterns), str(flags)) + + def to_regex(self): + patterns, flags = self.get_regex_patterns_and_flags() + flag = None + if flags != '': + for flag_name in flags: + if flag is None: + flag = getattr(re, flag_name) + else: + flag |= getattr(re, flag_name) + return re.compile(patterns, flag) diff --git a/src/edify/builder/constants.py b/src/edify/builder/constants.py deleted file mode 100644 index c1e7f68..0000000 --- a/src/edify/builder/constants.py +++ /dev/null @@ -1,10 +0,0 @@ -ANY = "." -WHITESPACE = "\\s" -NON_WHITESPACE = "\\S" -DIGIT = "\\d" -NON_DIGIT = "\\D" -WORD = "\\w" -NON_WORD = "\\W" -NEWLINE = "\\n" -TAB = "\\t" -NULL = "\\0" diff --git a/src/edify/builder/escaped.py b/src/edify/builder/escaped.py deleted file mode 100644 index b081706..0000000 --- a/src/edify/builder/escaped.py +++ /dev/null @@ -1,9 +0,0 @@ -from .abc import Builder - - -class Escaped(Builder): - def __init__(self, target): - self.target = target - - def build(self): - return f"\\{self.target}" diff --git a/src/edify/builder/helpers/core.py b/src/edify/builder/helpers/core.py index 0788e11..1dea2a0 100644 --- a/src/edify/builder/helpers/core.py +++ b/src/edify/builder/helpers/core.py @@ -50,3 +50,27 @@ def apply_subexpression_defaults(expr): assertion(type(out['ignore_flags']) == bool, 'ignore_flags must be a boolean') assertion(type(out['ignore_start_and_end']) == bool, 'ignore_start_and_end must be a boolean') return out + + +def is_fusable(element): + return element['type'] == 'range' or element['type'] == 'char' or element['type'] == 'any_of_chars' + + +def partition(pred, a): + result = [[], []] + for cur in a: + if pred(cur): + result[0].append(cur) + else: + result[1].append(cur) + return result + + +def fuse_elements(elements): + [fusables, rest] = partition(is_fusable, elements) + def map_el(el): + if el['type'] == 'char' or el['type'] == 'any_of_chars': + return el['value'] + return '{}-{}'.format(el['value'][0], el['value'][1]) + fused = ''.join(map(map_el, fusables)) + return [fused, rest] diff --git a/src/edify/builder/helpers/t.py b/src/edify/builder/helpers/t.py index eb08763..ab3304c 100644 --- a/src/edify/builder/helpers/t.py +++ b/src/edify/builder/helpers/t.py @@ -29,7 +29,7 @@ t = { 'named_back_reference': lambda name: deferred_type('named_back_reference', {'name': name}), 'back_reference': lambda index: deferred_type('back_reference', {'index': index}), 'capture': deferred_type('capture', {'contains_children': True}), - 'sub_expression': as_type('sub_expression', {'contains_children': True, 'quantifiers_require_group': True}), + 'subexpression': as_type('subexpression', {'contains_children': True, 'quantifiers_require_group': True}), 'named_capture': lambda name: deferred_type('named_capture', {'name': name, 'contains_children': True}), 'group': deferred_type('group', {'contains_children': True}), 'any_of': deferred_type('any_of', {'contains_children': True}), diff --git a/src/edify/builder/quantifier.py b/src/edify/builder/quantifier.py deleted file mode 100644 index 9cfc031..0000000 --- a/src/edify/builder/quantifier.py +++ /dev/null @@ -1,58 +0,0 @@ -from .abc import Builder - - -class Quantifier(Builder): - def __init__(self, target): - self.target = target - - -class Optional(Quantifier): - def build(self): - return f"{self.target}?" - - -class ZeroOrMore(Quantifier): - def build(self): - return f"{self.target}*" - - -class OneOrMore(Quantifier): - def build(self): - return f"{self.target}+" - - -class Exact(Quantifier): - def __init__(self, target, count): - super().__init__(target) - self.count = count - - def build(self): - return f"{self.target}{{{self.count}}}" - - -class Range(Quantifier): - def __init__(self, target, min, max): - super().__init__(target) - self.min = min - self.max = max - - def build(self): - return f"{self.target}{{{self.min},{self.max}}}" - - -class AtLeast(Quantifier): - def __init__(self, target, min): - super().__init__(target) - self.min = min - - def build(self): - return f"{self.target}{{{self.min},}}" - - -class AtMost(Quantifier): - def __init__(self, target, max): - super().__init__(target) - self.max = max - - def build(self): - return f"{self.target}{{,{self.max}}}" |
