From 9740e3756b7dc91808295341a02274350fb6c05d Mon Sep 17 00:00:00 2001 From: Bobby <30593201+luciferreeves@users.noreply.github.com> Date: Thu, 1 Sep 2022 17:49:26 -0400 Subject: Creating a deep copy at a different pointer location for each RegexBuilder class subroutine call --- src/edify/builder/builder.py | 207 ++++++++++++++++++++++++++++++-------- src/edify/builder/errors.py | 21 ++++ src/edify/builder/helpers/core.py | 19 ++++ 3 files changed, 206 insertions(+), 41 deletions(-) (limited to 'src') diff --git a/src/edify/builder/builder.py b/src/edify/builder/builder.py index a38877a..6213bb6 100644 --- a/src/edify/builder/builder.py +++ b/src/edify/builder/builder.py @@ -1,20 +1,28 @@ +from copy import deepcopy as clone import re +from .errors import can_not_call_se from .errors import can_not_end_while_building_root_exp from .errors import cannot_create_duplicate_named_group from .errors import cannot_define_start_after_end from .errors import end_input_already_defined +from .errors import ignore_se from .errors import invalid_total_capture_groups_index from .errors import must_be_a_string +from .errors import must_be_instance from .errors import must_be_integer_greater_than_zero from .errors import must_be_one_character from .errors import must_be_positive_integer +from .errors import must_be_single_character +from .errors import must_have_a_smaller_value from .errors import name_not_valid from .errors import named_group_does_not_exist from .errors import start_input_already_defined from .errors import unable_to_quantify +from .helpers.core import apply_subexpression_defaults from .helpers.core import assertion from .helpers.core import create_stack_frame +from .helpers.core import deep_copy from .helpers.core import escape_special from .helpers.regex_vars import named_group_regex from .helpers.t import t @@ -44,22 +52,34 @@ class RegexBuilder: } def allow_multiple_matches(self): - self.state['stack']['flags']['g'] = True + next = clone(self) + next.state['stack']['flags']['g'] = True + return next def sticky(self): - self.state['stack']['flags']['y'] = True + next = clone(self) + next.state['stack']['flags']['y'] = True + return next def line_by_line(self): - self.state['stack']['flags']['m'] = True + next = clone(self) + next.state['stack']['flags']['m'] = True + return next def case_insensitive(self): - self.state['stack']['flags']['i'] = True + next = clone(self) + next.state['stack']['flags']['i'] = True + return next def unicode(self): - self.state['stack']['flags']['u'] = True + next = clone(self) + next.state['stack']['flags']['u'] = True + return next def single_line(self): - self.state['stack']['flags']['s'] = True + next = clone(self) + next.state['stack']['flags']['s'] = True + return next def get_current_frame(self): return self.state['stack'] @@ -68,9 +88,9 @@ class RegexBuilder: return self.get_current_frame()['elements'] def match_element(self, type_fn): - current_element_array = self.get_current_element_array() - current_element_array.append(self.apply_quantifier(type_fn)) - return self + next = clone(self) + next.get_current_element_array().append(next.apply_quantifier(type_fn)) + return next def apply_quantifier(self, element): current_frame = self.get_current_frame() @@ -81,9 +101,10 @@ class RegexBuilder: return element def frame_creating_element(self, type_fn): + next = clone(self) new_frame = create_stack_frame(type_fn) - self.state['stack'] = new_frame - return self + next.state['stack'].append(new_frame) + return next def tracked_named_group(self, name): assertion(type(name) is str, must_be_a_string("Name", type(name))) @@ -93,24 +114,27 @@ class RegexBuilder: self.state['named_groups'].append(name) def capture(self): + next = clone(self) new_frame = create_stack_frame(t['capture']) - self.state['stack'] = new_frame - self.state['total_capture_groups'] += 1 - return self + next.state['stack'].append(new_frame) + next.state['total_capture_groups'] += 1 + return next def named_capture(self, name): + next = clone(self) new_frame = create_stack_frame(t['named_capture'](name)) - self.tracked_named_group(name) - self.state['stack'] = new_frame - self.state['total_capture_groups'] += 1 - return self + next.tracked_named_group(name) + next.state['stack'].append(new_frame) + next.state['total_capture_groups'] += 1 + return next def quantifier_element(self, type_fn): - current_frame = self.get_current_frame() + next = clone(self) + current_frame = next.get_current_frame() if current_frame['quantifier'] is not None: raise Exception(unable_to_quantify(type_fn, current_frame['quantifier']['type'])) current_frame['quantifier'] = t[type_fn] - return self + return next def any_char(self): return self.match_element(t['any_char']) @@ -206,56 +230,157 @@ class RegexBuilder: def at_least(self, count): assertion(type(count) is int and count > 0, must_be_positive_integer('count')) - current_frame = self.get_current_frame() + next = clone(self) + current_frame = next.get_current_frame() if current_frame['quantifier'] is not None: raise Exception(unable_to_quantify("at_least", current_frame['quantifier']['type'])) current_frame['quantifier'] = t['at_least'](count) - return self + return next def between(self, x, y): assertion(type(x) is int and x >= 0, must_be_integer_greater_than_zero('x')) assertion(type(y) is int and y > 0, must_be_positive_integer('y')) assertion(x < y, 'X must be less than Y.') - current_frame = self.get_current_frame() + next = clone(self) + current_frame = next.get_current_frame() if current_frame['quantifier'] is not None: raise Exception(unable_to_quantify("between", current_frame['quantifier']['type'])) current_frame['quantifier'] = t['between'](x, y) - return self + return next def between_lazy(self, x, y): assertion(type(x) is int and x >= 0, must_be_integer_greater_than_zero('x')) assertion(type(y) is int and y > 0, must_be_positive_integer('y')) assertion(x < y, 'X must be less than Y.') - current_frame = self.get_current_frame() + next = clone(self) + current_frame = next.get_current_frame() if current_frame['quantifier'] is not None: raise Exception(unable_to_quantify("between_lazy", current_frame['quantifier']['type'])) current_frame['quantifier'] = t['between_lazy'](x, y) - return self + return next def start_of_input(self): assertion(self.state['has_defined_start'] is False, start_input_already_defined()) assertion(self.state['has_defined_end'] is False, cannot_define_start_after_end()) - self.state['has_defined_start'] = True - current_element_array = self.get_current_element_array() - current_element_array.append(t['start_of_input']) - return self + next = clone(self) + next.state['has_defined_start'] = True + next.get_current_element_array().append(t['start_of_input']) + return next def end_of_input(self): assertion(self.state['has_defined_end'] is False, end_input_already_defined()) - self.state['has_defined_end'] = True - current_element_array = self.get_current_element_array() - current_element_array.append(t['end_of_input']) - return self + next = clone(self) + next.state['has_defined_end'] = True + next.get_current_element_array().append(t['end_of_input']) + return next def any_of_chars(self, chars): + next = clone(self) element_value = t['any_of_chars'](escape_special(chars)) - current_frame = self.get_current_frame() - current_frame['elements'].append(self.apply_quantifier(element_value)) - return self + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next def end(self): assertion(len(self.state['stack']) > 1, can_not_end_while_building_root_exp()) - old_frame = self.state['stack'].pop() - current_frame = self.get_current_frame() - current_frame['elements'].append(self.apply_quantifier(old_frame['type']['value'](old_frame['elements']))) - return self + next = clone(self) + old_frame = next.state['stack'].pop() + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(old_frame['type']['value'](old_frame['elements']))) + return next + + def anything_but_string(self, string): + assertion(type(string) is str, must_be_a_string('Value', string)) + assertion(len(string) > 0, must_be_one_character('Value')) + next = clone(self) + element_value = t['anything_but_string'](escape_special(string)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def anything_but_chars(self, chars): + assertion(type(chars) is str, must_be_a_string('Value', chars)) + assertion(len(chars) > 0, must_be_one_character('Value')) + next = clone(self) + element_value = t['anything_but_chars'](escape_special(chars)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def anything_but_range(self, a, b): + str_a = str(a) + str_b = str(b) + assertion(len(str_a) == 1, must_be_single_character('a', str_a)) + assertion(len(str_b) == 1, must_be_single_character('b', str_b)) + assertion(ord(str_a) < ord(str_b), must_have_a_smaller_value(str_a, str_b)) + next = clone(self) + element_value = t['anything_but_range']([a, b]) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def string(self, s): + assertion(type(s) is str, must_be_a_string('Value', s)) + assertion(len(s) > 0, must_be_one_character('Value')) + next = clone(self) + element_value = t['string'](escape_special(s)) if len(s) > 1 else t['char'](escape_special(s)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def char(self, c): + assertion(type(c) is str, must_be_a_string('Value', c)) + assertion(len(c) == 1, must_be_single_character('Value', c)) + next = clone(self) + element_value = t['char'](escape_special(c)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def range(self, a, b): + str_a = str(a) + str_b = str(b) + assertion(len(str_a) == 1, must_be_single_character('a', str_a)) + assertion(len(str_b) == 1, must_be_single_character('b', str_b)) + assertion(ord(str_a) < ord(str_b), must_have_a_smaller_value(str_a, str_b)) + next = clone(self) + element_value = t['range']([a, b]) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def merge_subexpression(self, el, options, parent, increment_capture_groups): + next_el = deep_copy(el) + + if next_el['type'] == 'back_reference': + next_el['index'] += parent['state']['total_capture_groups'] + if next_el['type'] == 'capture': + increment_capture_groups() + if next_el['type'] == 'named_capture': + group_name = '{}{}'.format(options['namespace'], next_el['name']) if options['namespace'] else next_el['name'] + parent['tracked_named_group'] = group_name + next_el['name'] = group_name + if next_el['type'] == 'named_back_reference': + next_el['name'] = '{}{}'.format(options['namespace'], next_el['name']) if options['namespace'] else next_el['name'] + if next_el['contains_child']: + next_el['value'] = self.merge_subexpression(next_el['value'], options, parent, increment_capture_groups) + elif next_el['contains_children']: + next_el['value'] = next_el['value'].map(lambda e: self.merge_subexpression(e, options, parent, increment_capture_groups)) + if next_el['type'] == 'start_of_input': + if options['ignore_start_and_end']: + return t['noop'] + assertion(parent['state']['has_defined_start'] is False, str(start_input_already_defined()) + str(ignore_se())) + assertion(parent['state']['has_defined_end'] is False, str(end_input_already_defined()) + str(ignore_se())) + parent['state']['has_defined_start'] = True + if next_el['type'] == 'end_of_input': + if options['ignore_start_and_end']: + return t['noop'] + assertion(parent['state']['has_defined_end'] is False, str(end_input_already_defined()) + str(ignore_se())) + parent['state']['has_defined_end'] = True + return next_el + + def subexpression(self, expr, opts={}): + assertion(isinstance(expr, RegexBuilder), must_be_instance("Expression", expr, "RegexBuilder")) + assertion(len(expr['state']['stack']) == 1, can_not_call_se(expr.get_current_frame()['type']['type'])) + options = apply_subexpression_defaults(opts) + diff --git a/src/edify/builder/errors.py b/src/edify/builder/errors.py index c9fe99a..741d047 100644 --- a/src/edify/builder/errors.py +++ b/src/edify/builder/errors.py @@ -48,3 +48,24 @@ def end_input_already_defined(): def can_not_end_while_building_root_exp(): return 'Can not end while building the root expression.' + + +def must_be_single_character(value, variable_name): + return '{} must be a single character. (got {})'.format(value, type(variable_name)) + + +def must_have_a_smaller_value(a, b): + return '{} must have a smaller character value than {}. (a = {}, b = {})'.format(a, b, ord(a), ord(b)) + + +def ignore_se(): + return 'You can ignore a subexpressions startOfInput/endOfInput markers with the ignoreStartAndEnd option' + + +def must_be_instance(value, variable_name, class_name): + return '{} must be an instance of {}. (got {})'.format(value, class_name, type(variable_name)) + + +def can_not_call_se(cft): + return "Can not call subexpression a not yet fully specified regex object. \ + \n (Try adding a .end() call to match the {} on the subexpression)".format(cft) diff --git a/src/edify/builder/helpers/core.py b/src/edify/builder/helpers/core.py index 15cf9ef..0788e11 100644 --- a/src/edify/builder/helpers/core.py +++ b/src/edify/builder/helpers/core.py @@ -31,3 +31,22 @@ def assertion(condition, message): def escape_special(s): return re.escape(s) + + +def deep_copy(o): + if isinstance(o, list): + return [deep_copy(e) for e in o] + if isinstance(o, dict): + return {k: deep_copy(v) for k, v in o.items()} + return o + + +def apply_subexpression_defaults(expr): + out = {**expr} + out['namespace'] = "" if 'namespace' not in out else out['namespace'] + out['ignore_flags'] = True if 'ignore_flags' not in out else out['ignore_flags'] + out['ignore_start_and_end'] = True if 'ignore_start_and_end' not in out else out['ignore_start_and_end'] + assertion(type(out['namespace']) == str, 'namespace must be a string') + assertion(type(out['ignore_flags']) == bool, 'ignore_flags must be a boolean') + assertion(type(out['ignore_start_and_end']) == bool, 'ignore_start_and_end must be a boolean') + return out -- cgit v1.2.3