diff options
| author | Bobby <[email protected]> | 2022-09-03 13:01:40 -0400 |
|---|---|---|
| committer | GitHub <[email protected]> | 2022-09-03 13:01:40 -0400 |
| commit | 456fe5da8bbd38a878790977be0425c5a96aba82 (patch) | |
| tree | 3b615b16a08fa8f3e7499062cf53b9e96f1da0b8 | |
| parent | c907f47a7bdae7fe6ca2c35e82c4177c2c0e685f (diff) | |
| parent | 9d9d4b2d49111e2e5541eef6eda4cb88c52f4342 (diff) | |
| download | edify-456fe5da8bbd38a878790977be0425c5a96aba82.tar.xz edify-456fe5da8bbd38a878790977be0425c5a96aba82.zip | |
Added `RegexBuilder` Class and Documentation (#3)
# Changes
- Added local test runner script `test.local.sh`
- Added support for `edify.library` and `email` and `phone` validation functions
- Added `RegexBuilder` class
- Added Necessary Tests for Code Coverage
- Added Support for flags
48 files changed, 2626 insertions, 106 deletions
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 0000000..921fe4c --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,72 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ "main", "dev" ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ "main", "dev" ] + schedule: + - cron: '0 0 * * *' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. 
+ + # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 88ad0fc..fa53eb5 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -19,24 +19,6 @@ jobs: toxpython: 'python3.9' tox_env: 'docs' os: 'ubuntu-latest' - - name: 'py36 (ubuntu)' - python: '3.6' - toxpython: 'python3.6' - python_arch: 'x64' - tox_env: 'py36' - os: 'ubuntu-latest' - - name: 'py36 (windows)' - python: '3.6' - toxpython: 'python3.6' - python_arch: 'x64' - tox_env: 'py36' - os: 'windows-latest' - - name: 'py36 (macos)' - python: '3.6' - toxpython: 'python3.6' - python_arch: 'x64' - tox_env: 'py36' - os: 'macos-latest' - name: 'py37 (ubuntu)' python: '3.7' toxpython: 'python3.7' diff --git a/MANIFEST.in b/MANIFEST.in index d0dac9c..a7518d1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -18,5 +18,7 @@ include CHANGELOG.rst include CONTRIBUTING.rst include LICENSE include 
README.rst +include *.sh +recursive-include images *.png global-exclude *.py[cod] __pycache__/* *.so *.dylib @@ -1,33 +1,22 @@ ======== -Overview +Edify ======== -.. start-badges +.. Cover Image +.. image:: https://raw.githubusercontent.com/luciferreeves/edify/dev/images/cover.png + :alt: Cover Image -.. list-table:: - :stub-columns: 1 +| - * - docs - - |docs| - * - tests - - | |github-actions| |requires| - | |codecov| - * - package - - | |version| |wheel| |supported-versions| |supported-implementations| - | |commits-since| -.. |docs| image:: https://readthedocs.org/projects/edify/badge/?style=flat +.. image:: https://readthedocs.org/projects/edify/badge/?style=flat&version=stable :target: https://edify.readthedocs.io/ :alt: Documentation Status -.. |github-actions| image:: https://github.com/luciferreeves/edify/actions/workflows/github-actions.yml/badge.svg +.. image:: https://github.com/luciferreeves/edify/actions/workflows/github-actions.yml/badge.svg?branch=main :alt: GitHub Actions Build Status :target: https://github.com/luciferreeves/edify/actions -.. .. |requires| image:: https://requires.io/github/luciferreeves/edify/requirements.svg?branch=main -.. :alt: Requirements Status -.. :target: https://requires.io/github/luciferreeves/edify/requirements/?branch=main - -.. |codecov| image:: https://codecov.io/gh/luciferreeves/edify/branch/main/graphs/badge.svg?branch=main +.. image:: https://codecov.io/gh/luciferreeves/edify/branch/main/graphs/badge.svg?branch=main :alt: Coverage Status :target: https://codecov.io/github/luciferreeves/edify @@ -47,17 +36,19 @@ Overview .. :alt: Supported implementations .. :target: https://pypi.org/project/edify -.. |commits-since| image:: https://img.shields.io/github/commits-since/luciferreeves/edify/v0.1.0.svg - :alt: Commits since latest release - :target: https://github.com/luciferreeves/edify/compare/v0.1.0...main +.. .. |commits-since| image:: https://img.shields.io/github/commits-since/luciferreeves/edify/v0.1.0.svg +.. 
:alt: Commits since latest release +.. :target: https://github.com/luciferreeves/edify/compare/v0.1.0...main .. end-badges -Regular Expressions Made Simple +| + +Edify (/ˈɛdɪfaɪ/, "ed-uh-fahy") is a Python library that allows you to easily create regular expressions for matching text in a programmatically-friendly way. It is designed to be used in conjunction with the ``re`` module. -* Free software: Apache Software License 2.0 +It also allows you to verify a string quickly by providing commonly used regex patterns in its extensive set of built-in patterns. To tap into a pattern, simply import the pattern function from the ``edify.library`` module. Installation ============ @@ -71,33 +62,81 @@ You can also install the in-development version with:: pip install https://github.com/luciferreeves/edify/archive/main.zip -Documentation +Why Edify? +=========== + +Regex is a powerful tool, but its syntax is not very intuitive and can be difficult to build, understand, and use. It gets even more difficult when you have to deal with backtracking, look-ahead, and other features that make regex difficult. + +That's where Edify becomes extremely useful. It allows you to create regular expressions in a programmatic way by invoking the ``RegexBuilder`` class, based on the SuperExpressive_ library. The API uses the `fluent builder pattern <https://en.wikipedia.org/wiki/Fluent_interface>`_, and is completely immutable. It is built to be discoverable and predictable. + +- Properties and methods describe what they do in plain English. +- Order matters! Quantifiers are specified before the thing they change, just like in English (e.g. ``RegexBuilder().exactly(5).digit()``). +- If you make a mistake, you'll know how to fix it. Edify will guide you towards a fix if your expression is invalid. +- ``subexpressions`` can be used to create meaningful, reusable components. 
+ +Edify turns those complex and unwieldy regexes that appear in code reviews into something that can be read, understood, and **properly reviewed** by your peers - and maintained by anyone! + + +.. _SuperExpressive: https://github.com/francisrstokes/super-expressive + +Quick Start ============= +To get started make sure you have python 3.7 or later installed and then, install Edify from ``pip``:: -https://edify.readthedocs.io/ + pip install edify +Then go on to import the ``RegexBuilder`` class from the ``edify`` module. -Development -=========== +Using Pre-Built Patterns +------------------------ -To run all the tests run:: +The following example recognises and captures any email like ``[email protected]``. - tox +.. code-block:: python -Note, to combine the coverage data from all the tox environments run: -.. list-table:: - :widths: 10 90 - :stub-columns: 1 + from edify.library import email - - - Windows - - :: + email_addr = "[email protected]" + assert email(email_addr) == True - set PYTEST_ADDOPTS=--cov-append - tox - - - Other - - :: +Building Regex Example +---------------------- + +The following example recognises and captures the value of a 16-bit hexadecimal number like ``0xC0D3``. + +.. code-block:: python + + + from edify import RegexBuilder + + expr = ( + RegexBuilder() + .start_of_input() + .optional().string("0x") + .capture() + .exactly(4).any_of() + .range("A", "F") + .range("a", "f") + .range("0", "9") + .end() + .end() + .end_of_input() + .to_regex() + ) + + """ + Produces the following regular expression: + re.compile(^(?:0x)?([A-Fa-f0-9]{4})$) + """ + + assert expr.match("0xC0D3") + + +Documentation +============= + +Further API documentation is available on `edify.rftd.io <https://edify.readthedocs.io>`_. - PYTEST_ADDOPTS=--cov-append tox diff --git a/docs/index.rst b/docs/index.rst index ad842d5..831bf7d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,12 +3,10 @@ Contents ======== .. 
toctree:: - :maxdepth: 2 + :maxdepth: 3 readme - installation - usage - reference/index + regex-builder/index contributing authors changelog diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index 1fb9d10..0000000 --- a/docs/installation.rst +++ /dev/null @@ -1,7 +0,0 @@ -============ -Installation -============ - -At the command line:: - - pip install edify diff --git a/docs/reference/edify.rst b/docs/reference/edify.rst deleted file mode 100644 index 6374fd4..0000000 --- a/docs/reference/edify.rst +++ /dev/null @@ -1,9 +0,0 @@ -edify -===== - -.. testsetup:: - - from edify import * - -.. automodule:: edify - :members: diff --git a/docs/reference/index.rst b/docs/reference/index.rst deleted file mode 100644 index df7e04e..0000000 --- a/docs/reference/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Reference -========= - -.. toctree:: - :glob: - - edify* diff --git a/docs/regex-builder/builder/index.rst b/docs/regex-builder/builder/index.rst new file mode 100644 index 0000000..ebe2e3b --- /dev/null +++ b/docs/regex-builder/builder/index.rst @@ -0,0 +1,937 @@ +RegexBuilder +============ + +RegexBuilder is a class that helps you build regular expressions. It is based on the `SuperExpressive <https://github.com/francisrstokes/super-expressive>`_ library. The API uses the `fluent builder pattern <https://en.wikipedia.org/wiki/Fluent_interface>`_, and is completely immutable. It is built to be discoverable and predictable. + +- Properties and methods describe what they do in plain English. +- Order matters! Quantifiers are specified before the thing they change, just like in English (e.g. ``RegexBuilder().exactly(5).digit()``.) +- If you make a mistake, you'll know how to fix it. Edify will guide you towards a fix if your expression is invalid. +- ``subexpressions`` can be used to create meaningful, reusable components. + +.any_char() +----------- + +``.any_char()`` matches any single character. + +.. 
code-block:: python + + from edify import RegexBuilder + + # returns re.compile('.') + expr = RegexBuilder().any_char().to_regex() + assert expr.match('a') # Matches + assert expr.match('hello') # Matches + + +.whitespace_char() +------------------ + +``.whitespace_char()`` matches any whitespace character, including the special whitespace characters: ``\r\n\t\f\v``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\s') + expr = RegexBuilder().whitespace_char().to_regex() + assert expr.match(' ') # Matches + assert expr.match('\n') # Matches + assert expr.match('\t') # Matches + assert expr.match('\r') # Matches + assert expr.match('\f') # Matches + assert expr.match('\v') # Matches + assert not expr.match('a') # Doesn't match + assert not expr.match('hello') # Doesn't match + + +.non_whitespace_char() +---------------------- + +``.non_whitespace_char()`` matches any non-whitespace character, excluding also the special whitespace characters: ``\r\n\t\f\v``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\S') + expr = RegexBuilder().non_whitespace_char().to_regex() + assert expr.match('a') # Matches + assert expr.match('hello') # Matches + assert not expr.match(' ') # Doesn't match + assert not expr.match('\n') # Doesn't match + assert not expr.match('\t') # Doesn't match + assert not expr.match('\r') # Doesn't match + assert not expr.match('\f') # Doesn't match + assert not expr.match('\v') # Doesn't match + assert not expr.match('\u00a0') # Doesn't match + assert not expr.match('\u2000') # Doesn't match + + +.digit() +-------- + +``.digit()`` matches any digit from ``0-9``. + +.. 
code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d') + expr = RegexBuilder().digit().to_regex() + assert expr.match('1') # Matches + assert expr.match('9') # Matches + assert not expr.match('a') # Doesn't match + assert not expr.match('\u00a0') # Doesn't match + + +.non_digit() +------------- + +``.non_digit()`` matches any non-digit. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\D') + expr = RegexBuilder().non_digit().to_regex() + assert expr.match('a') # Matches + assert expr.match('\u00a0') # Matches + assert not expr.match('1') # Doesn't match + assert not expr.match('9') # Doesn't match + +.. _word: + +.word() +------- + + +``.word()`` matches any alpha-numeric ``(a-z, A-Z, 0-9)`` characters, as well as ``_``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\w') + expr = RegexBuilder().word().to_regex() + assert expr.match('a') # Matches + assert expr.match('1') # Matches + assert expr.match('_') # Matches + assert expr.match('hello') # Matches + + +.non_word() +----------- + +``.non_word()`` matches any non-alpha-numeric ``(a-z, A-Z, 0-9)`` characters, excluding ``_`` as well. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\W') + expr = RegexBuilder().non_word().to_regex() + assert not expr.match('a') # Doesn't match + assert not expr.match('1') # Doesn't match + assert expr.match('\u00a0') # Matches + assert expr.match('\u2000') # Matches + assert not expr.match('_') # Doesn't match + assert not expr.match('hello') # Doesn't match + + +.word_boundary() +----------------- + +``.word_boundary()`` matches (without consuming any characters) immediately between a character matched by :ref:`word` and a character not matched by :ref:`word` (in either order). + +.. 
code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d\b') + expr = RegexBuilder().digit().word_boundary().to_regex() + + +.non_word_boundary() +-------------------- + +``.non_word_boundary()`` matches (without consuming any characters) at the position between two characters matched by :ref:`word`. + +.. code-block:: python + + + from edify import RegexBuilder + + # returns re.compile('\d\B') + expr = RegexBuilder().digit().non_word_boundary().to_regex() + +.new_line() +----------- + +``.new_line()`` matches the newline ``\n`` character. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\n') + expr = RegexBuilder().new_line().to_regex() + assert expr.match('\n') # Matches + assert not expr.match('a') # Doesn't match + assert not expr.match('hello') # Doesn't match + +.carriage_return() +------------------- + +``.carriage_return()`` matches the carriage return ``\r`` character. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\r') + expr = RegexBuilder().carriage_return().to_regex() + assert expr.match('\r') # Matches + assert not expr.match('a') # Doesn't match + assert not expr.match('hello') # Doesn't match + + +.tab() +------ + +``.tab()`` matches the tab ``\t`` character. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\t') + expr = RegexBuilder().tab().to_regex() + assert expr.match('\t') # Matches + assert not expr.match('a') # Doesn't match + assert not expr.match('hello') # Doesn't match + + +.null_byte() +------------ + +``.null_byte()`` matches the null byte ``\0`` character. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\0') + expr = RegexBuilder().null_byte().to_regex() + assert expr.match('\0') # Matches + assert not expr.match('a') # Doesn't match + assert not expr.match('hello') # Doesn't match + +.. 
_any_of: + +.any_of() +--------- + +``.any_of()`` matches a choice between specified elements. Needs to be finalised with :ref:`end`. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('(?:hello|[a-f0-9])') + expr = ( + RegexBuilder() + .any_of() + .range('a', 'f') + .range('0', '9') + .string('hello') + .end() + .to_regex() + ) + assert expr.match('a') # Matches + assert expr.match('f') # Matches + assert expr.match('9') # Matches + assert expr.match('hello') # Matches + assert not expr.match('g') # Doesn't match + assert not expr.match('good world') # Doesn't match + +.. _capture: + +.capture() +----------- + +``.capture()`` creates a capture group for the proceeding elements. Needs to be finalised with :ref:`end`. Can be later referenced with :ref:`backreference`. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('([a-f][0-9]hello)') + expr = ( + RegexBuilder() + .capture() + .range('a', 'f') + .range('0', '9') + .string('hello') + .end() + .to_regex() + ) + assert expr.match('a9hello') # Matches + assert expr.match('f0hello') # Matches + assert not expr.match('g9hello') # Doesn't match + +.. _named_capture: + +.named_capture(name) +-------------------- + +``.named_capture()`` creates a named capture group for the proceeding elements. Needs to be finalised with :ref:`end`. Can be later referenced with :ref:`named_back_reference` or :ref:`backreference`. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('(?P<interestingStuff>[a-f][0-9]hello)') + expr = ( + RegexBuilder() + .named_capture('interestingStuff') + .range('a', 'f') + .range('0', '9') + .string('hello') + .end() + .to_regex() + ) + assert expr.match('a9hello') # Matches + assert expr.match('f0hello') # Matches + assert not expr.match('g9hello') # Doesn't match + +.. 
_named_back_reference: + +.named_back_reference(name) +--------------------------- + +``.named_back_reference()`` matches exactly what was previously matched by a :ref:`named_capture`. + +.. warning:: + + Python's built-in ``re`` module does not support the ``\k<name>`` style of named back reference shown below. If you try to call the ``to_regex()`` method on a named back reference, it will raise an exception. For that reason, ``to_regex_string()`` is provided instead. It returns a string that can be used to create a regular expression. You can try using the regular expression directly with another library like `regex <https://pypi.python.org/pypi/regex>`_. + +.. code-block:: python + + from edify import RegexBuilder + + # returns /(?<interestingStuff>[a-f][0-9]hello)something else\k<interestingStuff>/ + expr = ( + RegexBuilder() + .named_capture('interestingStuff') + .range('a', 'f') + .range('0', '9') + .string('hello') + .end() + .string('something else') + .named_back_reference('interestingStuff') + .to_regex_string() + ) + +.. _backreference: + +.back_reference(index) +---------------------- + +``.back_reference()`` matches exactly what was previously matched by a :ref:`capture` or :ref:`named_capture` using a positional index. Note regex indexes start at 1, so the first capture group has index 1. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('([a-f][0-9]hello)\\1') + expr = ( + RegexBuilder() + .capture() + .range('a', 'f') + .range('0', '9') + .string('hello') + .end() + .back_reference(1) + .to_regex() + ) + assert expr.match('a9helloa9hello') # Matches + assert not expr.match('a9helloa9hell') # Doesn't match + +.. _group: + +.group() +-------- + +``.group()`` creates a non-capturing group for the proceeding elements. Needs to be finalised with :ref:`end`. + +.. 
code-block:: python + + from edify import RegexBuilder + + # returns re.compile('(?:[a-f][0-9]hello)?') + expr = ( + RegexBuilder() + .optional().group() + .range('a', 'f') + .range('0', '9') + .string('hello') + .end() + .to_regex() + ) + assert expr.match('a9hello') # Matches + assert expr.match('') # Matches + assert expr.match('g9hello') # Matches the empty string, because the group is optional + +.. _end: + +.end() +------ + +``.end()`` signifies the end of a ``RegexBuilder`` grouping, such as :ref:`capture`, :ref:`group` or :ref:`any_of` element. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('((?:hello|[a-f0-9]))') + expr = ( + RegexBuilder() + .capture() + .any_of() + .range('a', 'f') + .range('0', '9') + .string('hello') + .end() + .end() + .to_regex() + ) + +.. _assert_ahead: + +.assert_ahead() +--------------- + +``.assert_ahead()`` asserts that the proceeding elements are found without consuming them. Needs to be finalised with :ref:`end`. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('(?=[a-f])[a-z]') + expr = ( + RegexBuilder() + .assert_ahead() + .range('a', 'f') + .end() + .range('a', 'z') + .to_regex() + ) + assert expr.match('a') # Matches + assert expr.match('f') # Matches + assert not expr.match('g') # Doesn't match + +.. _assert_not_ahead: + +.assert_not_ahead() +------------------- + +``.assert_not_ahead()`` asserts that the proceeding elements are not found without consuming them. Needs to be finalised with :ref:`end`. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('(?![a-f])[g-z]') + expr = ( + RegexBuilder() + .assert_not_ahead() + .range('a', 'f') + .end() + .range('g', 'z') + .to_regex() + ) + assert expr.match('g') # Matches + assert expr.match('z') # Matches + assert not expr.match('a') # Doesn't match + +.. 
_assert_behind: + +.assert_behind() +---------------- + +``.assert_behind()`` asserts that the elements contained within are found immediately before this point in the string. Needs to be finalised with :ref:`end`. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('(?<=hello )world') + expr = ( + RegexBuilder() + .assert_behind() + .string('hello ') + .end() + .string('world') + .to_regex() + ) + +.. _assert_not_behind: + +.assert_not_behind() +-------------------- + +``.assert_not_behind()`` asserts that the elements contained within are not found immediately before this point in the string. Needs to be finalised with :ref:`end`. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('(?<!hello )world') + expr = ( + RegexBuilder() + .assert_not_behind() + .string('hello ') + .end() + .string('world') + .to_regex() + ) + +.. _optional: + +.optional() +----------- + +``.optional()`` asserts that the proceeding element may or may not be matched. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d?') + expr = ( + RegexBuilder() + .optional() + .digit() + .to_regex() + ) + +.. _zero_or_more: + +.zero_or_more() +--------------- + +``.zero_or_more()`` asserts that the proceeding element may not be matched, or may be matched multiple times. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d*') + expr = ( + RegexBuilder() + .zero_or_more() + .digit() + .to_regex() + ) + +.. _zero_or_more_lazy: + +.zero_or_more_lazy() +-------------------- + +``.zero_or_more_lazy()`` asserts that the proceeding element may not be matched, or may be matched multiple times, but as few times as possible. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d*?') + expr = ( + RegexBuilder() + .zero_or_more_lazy() + .digit() + .to_regex() + ) + +.. 
_one_or_more: + +.one_or_more() +-------------- + +``.one_or_more()`` asserts that the proceeding element may be matched once or more times. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d+') + expr = ( + RegexBuilder() + .one_or_more() + .digit() + .to_regex() + ) + +.. _one_or_more_lazy: + +.one_or_more_lazy() +------------------- + +``.one_or_more_lazy()`` asserts that the proceeding element may be matched once or more times, but as few times as possible. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d+?') + expr = ( + RegexBuilder() + .one_or_more_lazy() + .digit() + .to_regex() + ) + +.. _exactly: + +.exactly(n) +----------- + +``.exactly(n)`` asserts that the proceeding element will be matched exactly ``n`` times. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d{3}') + expr = ( + RegexBuilder() + .exactly(3) + .digit() + .to_regex() + ) + +.. _at_least: + +.at_least(n) +------------ + +``.at_least(n)`` asserts that the proceeding element will be matched at least ``n`` times. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d{3,}') + expr = ( + RegexBuilder() + .at_least(3) + .digit() + .to_regex() + ) + +.. _between: + +.between(n, m) +-------------- + +``.between(n, m)`` asserts that the proceeding element will be matched between ``n`` and ``m`` times. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d{3,5}') + expr = ( + RegexBuilder() + .between(3, 5) + .digit() + .to_regex() + ) + +.. _between_lazy: + +.between_lazy(n, m) +-------------------- + +``.between_lazy(n, m)`` asserts that the proceeding element will be matched between ``n`` and ``m`` times, but as few times as possible. + +.. 
code-block:: python + + from edify import RegexBuilder + + # returns re.compile('\d{3,5}?') + expr = ( + RegexBuilder() + .between_lazy(3, 5) + .digit() + .to_regex() + ) + +.. _start_of_input: + +.start_of_input() +----------------- + +``.start_of_input()`` asserts the start of input, or the start of a line when ``M`` flag is used. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('^hello') + expr = ( + RegexBuilder() + .start_of_input() + .string('hello') + .to_regex() + ) + +.. _end_of_input: + +.end_of_input() +--------------- + +``.end_of_input()`` asserts the end of input, or the end of a line when ``M`` flag is used. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('hello$') + expr = ( + RegexBuilder() + .string('hello') + .end_of_input() + .to_regex() + ) + +.. _any_of_chars: + +.any_of_chars(chars) +-------------------- + +``.any_of_chars(chars)`` matches any of the characters in the provided string ``chars``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('[abc]') + expr = ( + RegexBuilder() + .any_of_chars('abc') + .to_regex() + ) + +.. _anything_but_chars: + +.anything_but_chars(chars) +-------------------------- + +``.anything_but_chars(chars)`` matches any character except those in the provided string ``chars``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('[^abc]') + expr = ( + RegexBuilder() + .anything_but_chars('abc') + .to_regex() + ) + +.. _anything_but_string: + +.anything_but_string(string) +---------------------------- + +``.anything_but_string(string)`` matches any string the same length as ``string``, except the characters sequentially defined in ``string``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('(?:[^a][^e][^i][^o][^u])') + expr = ( + RegexBuilder() + .anything_but_string('aeiou') + .to_regex() + ) + +.. 
_anything_but_range: + +.anything_but_range(start, end) +-------------------------------- + +``.anything_but_range(start, end)`` matches any character except those that would be captured by the :ref:`range` specified by ``start`` and ``end``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('[^a-z]') + expr = ( + RegexBuilder() + .anything_but_range('a', 'z') + .to_regex() + ) + +.. _string: + +.string(s) +--------------- + +``.string(s)`` matches the exact string ``s``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('hello') + expr = ( + RegexBuilder() + .string('hello') + .to_regex() + ) + +.. _char: + +.char(c) +-------- + +``.char(c)`` matches the exact character ``c``. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('h') + expr = ( + RegexBuilder() + .char('h') + .to_regex() + ) + +.. _range: + +.range(start, end) +------------------ + +``.range(start, end)`` matches any character that falls between ``start`` and ``end``. Ordering is defined by a character's ASCII or Unicode value. + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('[a-z]') + expr = ( + RegexBuilder() + .range('a', 'z') + .to_regex() + ) + +.. _subexpression: + +.subexpression(expr, opts) +-------------------------- + +``.subexpression()`` matches another ``RegexBuilder`` instance inline. Can be used to create libraries, or to modularise your code. By default, flags and start/end of input markers are ignored, but can be explicitly turned on in the options object. + +``opts`` is an optional dictionary that can be used to control how the subexpression is treated. It has the following properties: + + ``namespace`` + A string namespace to use on all named capture groups in the subexpression, to avoid naming collisions with your own named groups. Defaults to ``' '``. 
+ + ``ignore_flags`` + If set to ``True``, any flags this subexpression specifies should be disregarded. Defaults to ``True``. + + ``ignore_start_and_end`` + If set to ``True``, any start_of_input/end_of_input assertions this subexpression specifies should be disregarded. Defaults to ``True``. + +A sample ``opts`` dictionary might look like this:: + + opts = { + 'namespace': 'my_namespace', + 'ignore_flags': False, + 'ignore_start_and_end': False + } + +You can use the ``.subexpression()`` method like this: + + +.. code-block:: python + + from edify import RegexBuilder + + # returns re.compile('[a-z]+.{3,}\d{5}') + five_digits = RegexBuilder().exactly(5).digit() + expr = ( + RegexBuilder() + .one_or_more().range('a', 'z') + .at_least(3).any_char() + .subexpression(five_digits) + .to_regex() + ) + +.. _to_regex_string: + +.to_regex_string() +------------------ + +``.to_regex_string()`` returns a string representation of the regular expression that this ``RegexBuilder`` instance represents. + +.. code-block:: python + + from edify import RegexBuilder + + # returns '/^(?:0x)?([A-Fa-f0-9]{4})$/IM' + expr = ( + RegexBuilder() + .ignore_case() + .multiline() + .start_of_input() + .optional().string('0x') + .capture() + .exactly(4).any_of() + .range('A', 'F') + .range('a', 'f') + .range('0', '9') + .end() + .end() + .end_of_input() + .to_regex_string() + ) + +.. _to_regex: + +.to_regex() +----------- + +``.to_regex()`` returns a compiled regular expression object that this ``RegexBuilder`` instance represents. The compiled regular expression is a pattern object as returned by ``re.compile``, so any of its methods like ``.search()``, ``.match()``, ``.findall()``, etc. can be used on it. + +.. 
code-block:: python + + from edify import RegexBuilder + + # returns re.compile('^(?:0x)?([A-Fa-f0-9]{4})$', re.MULTILINE | re.IGNORECASE) + expr = ( + RegexBuilder() + .ignore_case() + .multi_line() + .start_of_input() + .optional().string('0x') + .capture() + .exactly(4).any_of() + .range('A', 'F') + .range('a', 'f') + .range('0', '9') + .end() + .end() + .end_of_input() + .to_regex() + ) + + # returns re.Match object + expr.match('0x1234') diff --git a/docs/regex-builder/flags/index.rst b/docs/regex-builder/flags/index.rst new file mode 100644 index 0000000..3afdd3f --- /dev/null +++ b/docs/regex-builder/flags/index.rst @@ -0,0 +1,96 @@ +Flags +===== + +Flags in Edify are the same as the flags in the ``re`` module. Edify supports the following flags: + - ``A``: ASCII (standard ASCII) character + - ``D``: DEBUG, returns ``re.DEBUG`` + - ``I``: Ignore Case + - ``M``: Multi Line + - ``S``: Dot All + - ``X``: Verbose + +To learn more about the flags, please refer to the ``re`` module documentation. If you wish to use the ``/g`` or any other unsupported flag, you can use the ``re.search`` or ``re.match`` methods, according to your needs. If you need to support extra flags, you can try looking at the `regex <https://pypi.python.org/pypi/regex>`_ package. To get started, import the ``RegexBuilder`` class:: + + from edify import RegexBuilder + + +ASCII Only Matching +-------------------- + +Make ``\w``, ``\W``, ``\b``, ``\B``, ``\d``, ``\D``, ``\s``, and ``\S`` perform ASCII-only matching instead of full Unicode matching. This is only meaningful for Unicode patterns, and is ignored for byte patterns. Corresponds to the inline flag ``(?a)``. + +Example +^^^^^^^ + +.. code-block:: python + + # returns re.compile('hello', re.ASCII) + expr = RegexBuilder().ascii_only().string('hello').to_regex() + +Display Debug Information +------------------------- +Display debug information about the compiled expression. No corresponding inline flag. + +Example +^^^^^^^ + +.. 
code-block:: python + + # returns re.compile('hello', re.DEBUG) + expr = RegexBuilder().debug().string('hello').to_regex() + + +Ignore Case +------------ +Perform case-insensitive matching; expressions like ``[A-Z]`` will also match lowercase letters. Full Unicode matching (such as ``Ü`` matching ``ü``) also works unless the ``re.ASCII`` flag is used to disable non-ASCII matches. Corresponds to the inline flag ``(?i)``. + +Example +^^^^^^^ + +.. code-block:: python + + # returns re.compile('hello', re.IGNORECASE) + expr = RegexBuilder().ignore_case().string('hello').to_regex() + + +Multi Line +---------- +When specified, the pattern character ``'^'`` matches at the beginning of the string and at the beginning of each line (immediately following each newline); and the pattern character ``'$'`` matches at the end of the string and at the end of each line (immediately preceding each newline). By default, ``'^'`` matches only at the beginning of the string, and ``'$'`` only at the end of the string and immediately before the newline (if any) at the end of the string. Corresponds to the inline flag ``(?m)``. + + +Example +^^^^^^^ + +.. code-block:: python + + # returns re.compile('hello', re.MULTILINE) + expr = RegexBuilder().multi_line().string('hello').to_regex() + + +Dot All +------- + +Make the ``'.'`` special character match any character at all, including a newline; without this flag, ``'.'`` will match anything *except* a newline. Corresponds to the inline flag ``(?s)``. + + +Example +^^^^^^^ + +.. code-block:: python + + # returns re.compile('hello', re.DOTALL) + expr = RegexBuilder().dot_all().string('hello').to_regex() + + +Verbose +------- +This works the same as the ``re.VERBOSE`` flag, which allows you to write regular expressions that look nicer and are more readable by allowing you to visually separate logical sections of the pattern and add comments. 
However, this flag is basically rendered useless with Edify, but it is still available for use to keep the API consistent with the ``re`` module. Corresponds to the inline flag ``(?x)``. + + +Example +^^^^^^^ + +.. code-block:: python + + # returns re.compile('hello', re.VERBOSE) + expr = RegexBuilder().verbose().string('hello').to_regex() diff --git a/docs/regex-builder/index.rst b/docs/regex-builder/index.rst new file mode 100644 index 0000000..a52dac7 --- /dev/null +++ b/docs/regex-builder/index.rst @@ -0,0 +1,9 @@ +RegexBuilder API Reference +========================== + +.. toctree:: + :glob: + :maxdepth: 2 + + flags/index + builder/index diff --git a/docs/usage.rst b/docs/usage.rst deleted file mode 100644 index 7cce1cb..0000000 --- a/docs/usage.rst +++ /dev/null @@ -1,7 +0,0 @@ -===== -Usage -===== - -To use Edify in a project:: - - import edify diff --git a/images/cover.png b/images/cover.png Binary files differnew file mode 100644 index 0000000..24c06d9 --- /dev/null +++ b/images/cover.png diff --git a/images/logo_1500px.png b/images/logo_1500px.png Binary files differnew file mode 100644 index 0000000..9c7c75b --- /dev/null +++ b/images/logo_1500px.png diff --git a/images/logo_500px.png b/images/logo_500px.png Binary files differnew file mode 100644 index 0000000..6f77726 --- /dev/null +++ b/images/logo_500px.png diff --git a/images/logo_no_text_1500px.png b/images/logo_no_text_1500px.png Binary files differnew file mode 100644 index 0000000..22804b5 --- /dev/null +++ b/images/logo_no_text_1500px.png diff --git a/images/logo_no_text_500px.png b/images/logo_no_text_500px.png Binary files differnew file mode 100644 index 0000000..bbcfc41 --- /dev/null +++ b/images/logo_no_text_500px.png diff --git a/images/logo_no_text_black_1500px.png b/images/logo_no_text_black_1500px.png Binary files differnew file mode 100644 index 0000000..18ab859 --- /dev/null +++ b/images/logo_no_text_black_1500px.png diff --git a/images/logo_no_text_black_500px.png 
b/images/logo_no_text_black_500px.png Binary files differnew file mode 100644 index 0000000..9bf361c --- /dev/null +++ b/images/logo_no_text_black_500px.png diff --git a/images/logo_no_text_t_1500px.png b/images/logo_no_text_t_1500px.png Binary files differnew file mode 100644 index 0000000..0f476a9 --- /dev/null +++ b/images/logo_no_text_t_1500px.png diff --git a/images/logo_no_text_t_500px.png b/images/logo_no_text_t_500px.png Binary files differnew file mode 100644 index 0000000..5c507e7 --- /dev/null +++ b/images/logo_no_text_t_500px.png diff --git a/images/logo_no_text_t_black_1500px.png b/images/logo_no_text_t_black_1500px.png Binary files differnew file mode 100644 index 0000000..3371816 --- /dev/null +++ b/images/logo_no_text_t_black_1500px.png diff --git a/images/logo_no_text_t_black_500px.png b/images/logo_no_text_t_black_500px.png Binary files differnew file mode 100644 index 0000000..194cd13 --- /dev/null +++ b/images/logo_no_text_t_black_500px.png diff --git a/images/logo_t_1500px.png b/images/logo_t_1500px.png Binary files differnew file mode 100644 index 0000000..7e7ebe1 --- /dev/null +++ b/images/logo_t_1500px.png diff --git a/images/logo_t_500px.png b/images/logo_t_500px.png Binary files differnew file mode 100644 index 0000000..7a10cb0 --- /dev/null +++ b/images/logo_t_500px.png diff --git a/images/logo_t_black_1500px.png b/images/logo_t_black_1500px.png Binary files differnew file mode 100644 index 0000000..abf3dd2 --- /dev/null +++ b/images/logo_t_black_1500px.png diff --git a/images/logo_t_black_500px.png b/images/logo_t_black_500px.png Binary files differnew file mode 100644 index 0000000..649b3de --- /dev/null +++ b/images/logo_t_black_500px.png diff --git a/images/logo_wob_1500px.png b/images/logo_wob_1500px.png Binary files differnew file mode 100644 index 0000000..acf4848 --- /dev/null +++ b/images/logo_wob_1500px.png diff --git a/images/logo_wob_500px.png b/images/logo_wob_500px.png Binary files differnew file mode 100644 index 
0000000..76c83c2 --- /dev/null +++ b/images/logo_wob_500px.png diff --git a/pyproject.toml b/pyproject.toml index a51acf9..fc9d53d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,5 +6,5 @@ requires = [ [tool.black] line-length = 140 -target-version = ['py36'] +target-version = ['py37'] skip-string-normalization = true @@ -67,7 +67,7 @@ setup( keywords=[ # eg: 'keyword1', 'keyword2', 'keyword3', ], - python_requires='>=3.6', + python_requires='>=3.7', install_requires=[ # eg: 'aspectlib==1.1.1', 'six>=1.7', ], diff --git a/src/edify/__init__.py b/src/edify/__init__.py index b794fd4..e710eb6 100644 --- a/src/edify/__init__.py +++ b/src/edify/__init__.py @@ -1 +1,4 @@ +# flake8: noqa + __version__ = '0.1.0' +from .builder.builder import RegexBuilder diff --git a/src/edify/builder/builder.py b/src/edify/builder/builder.py new file mode 100644 index 0000000..666468b --- /dev/null +++ b/src/edify/builder/builder.py @@ -0,0 +1,532 @@ +import re +from copy import deepcopy as clone + +from .errors import can_not_call_se +from .errors import can_not_end_while_building_root_exp +from .errors import cannot_create_duplicate_named_group +from .errors import cannot_define_start_after_end +from .errors import end_input_already_defined +from .errors import ignore_se +from .errors import invalid_total_capture_groups_index +from .errors import must_be_a_string +from .errors import must_be_instance +from .errors import must_be_integer_greater_than_zero +from .errors import must_be_one_character +from .errors import must_be_positive_integer +from .errors import must_be_single_character +from .errors import must_have_a_smaller_value +from .errors import name_not_valid +from .errors import named_group_does_not_exist +from .errors import start_input_already_defined +# from .helpers.core import deep_copy +# from .errors import unable_to_quantify +from .helpers.core import apply_subexpression_defaults +from .helpers.core import assertion +from .helpers.core import create_stack_frame 
+from .helpers.core import escape_special +from .helpers.core import fuse_elements +from .helpers.quantifiers import quantifier_table +from .helpers.regex_vars import named_group_regex +from .helpers.t import t + + +class RegexBuilder: + """Regular Expression Builder Class.""" + + state = {} + + def __init__(self): + self.state = { + 'has_defined_start': False, + 'has_defined_end': False, + 'flags': { + 'A': False, + 'D': False, + 'I': False, + 'M': False, + 'S': False, + 'X': False, + }, + 'stack': [create_stack_frame(t['root'])], + 'named_groups': [], + 'total_capture_groups': 0, + } + + def ascii_only(self): + next = clone(self) + next.state['flags']['A'] = True + return next + + def debug(self): + next = clone(self) + next.state['flags']['D'] = True + return next + + def ignore_case(self): + next = clone(self) + next.state['flags']['I'] = True + return next + + def multi_line(self): + next = clone(self) + next.state['flags']['M'] = True + return next + + def dot_all(self): + next = clone(self) + next.state['flags']['S'] = True + return next + + def verbose(self): + next = clone(self) + next.state['flags']['X'] = True + return next + + def get_current_frame(self): + return self.state['stack'][len(self.state['stack']) - 1] + + def get_current_element_array(self): + return self.get_current_frame()['elements'] + + def match_element(self, type_fn): + next = clone(self) + next.get_current_element_array().append(next.apply_quantifier(type_fn)) + return next + + def apply_quantifier(self, element): + current_frame = self.get_current_frame() + if current_frame['quantifier'] is not None: + wrapped = current_frame['quantifier']['value'](element) + current_frame['quantifier'] = None + return wrapped + return element + + def frame_creating_element(self, type_fn): + next = clone(self) + new_frame = create_stack_frame(type_fn) + next.state['stack'].append(new_frame) + return next + + def tracked_named_group(self, name): + assertion(type(name) is str, must_be_a_string("Name", 
type(name))) + assertion(len(name) > 0, must_be_one_character("Name")) + assertion(name not in self.state['named_groups'], cannot_create_duplicate_named_group(name)) + assertion(re.compile(named_group_regex, re.I).match(name), name_not_valid(name)) + self.state['named_groups'].append(name) + + def capture(self): + next = clone(self) + new_frame = create_stack_frame(t['capture']) + next.state['stack'].append(new_frame) + next.state['total_capture_groups'] += 1 + return next + + def named_capture(self, name): + next = clone(self) + new_frame = create_stack_frame(t['named_capture'](name)) + next.tracked_named_group(name) + next.state['stack'].append(new_frame) + next.state['total_capture_groups'] += 1 + return next + + def quantifier_element(self, type_fn): + next = clone(self) + current_frame = next.get_current_frame() + # if current_frame['quantifier'] is not None: + # raise Exception(unable_to_quantify(type_fn, current_frame['quantifier']['type'])) + current_frame['quantifier'] = t[type_fn] + return next + + def any_char(self): + return self.match_element(t['any_char']) + + def whitespace_char(self): + return self.match_element(t['whitespace_char']) + + def non_whitespace_char(self): + return self.match_element(t['non_whitespace_char']) + + def digit(self): + return self.match_element(t['digit']) + + def non_digit(self): + return self.match_element(t['non_digit']) + + def word(self): + return self.match_element(t['word']) + + def non_word(self): + return self.match_element(t['non_word']) + + def word_boundary(self): + return self.match_element(t['word_boundary']) + + def non_word_boundary(self): + return self.match_element(t['non_word_boundary']) + + def new_line(self): + return self.match_element(t['new_line']) + + def carriage_return(self): + return self.match_element(t['carriage_return']) + + def tab(self): + return self.match_element(t['tab']) + + def null_byte(self): + return self.match_element(t['null_byte']) + + def named_back_reference(self, name): + 
assertion(name in self.state['named_groups'], named_group_does_not_exist(name)) + return self.match_element(t['named_back_reference'](name)) + + def back_reference(self, index: int): + assertion(type(index) is int, 'Index must be an integer.') + assertion( + index > 0 and index <= self.state['total_capture_groups'], + invalid_total_capture_groups_index(index, self.state['total_capture_groups']), + ) + return self.match_element(t['back_reference'](index)) + + def any_of(self): + return self.frame_creating_element(t['any_of']) + + def group(self): + return self.frame_creating_element(t['group']) + + def assert_ahead(self): + return self.frame_creating_element(t['assert_ahead']) + + def assert_not_ahead(self): + return self.frame_creating_element(t['assert_not_ahead']) + + def assert_behind(self): + return self.frame_creating_element(t['assert_behind']) + + def assert_not_behind(self): + return self.frame_creating_element(t['assert_not_behind']) + + def optional(self): + return self.quantifier_element('optional') + + def zero_or_more(self): + return self.quantifier_element('zero_or_more') + + def zero_or_more_lazy(self): + return self.quantifier_element('zero_or_more_lazy') + + def one_or_more(self): + return self.quantifier_element('one_or_more') + + def one_or_more_lazy(self): + return self.quantifier_element('one_or_more_lazy') + + def exactly(self, count): + assertion(type(count) is int and count > 0, must_be_positive_integer('count')) + current_frame = self.get_current_frame() + # if current_frame['quantifier'] is not None: + # raise Exception(unable_to_quantify("exactly", current_frame['quantifier']['type'])) + current_frame['quantifier'] = t['exactly'](count) + return self + + def at_least(self, count): + assertion(type(count) is int and count > 0, must_be_positive_integer('count')) + next = clone(self) + current_frame = next.get_current_frame() + # if current_frame['quantifier'] is not None: + # raise Exception(unable_to_quantify("at_least", 
current_frame['quantifier']['type'])) + current_frame['quantifier'] = t['at_least'](count) + return next + + def between(self, x, y): + assertion(type(x) is int and x >= 0, must_be_integer_greater_than_zero('x')) + assertion(type(y) is int and y > 0, must_be_positive_integer('y')) + assertion(x < y, 'X must be less than Y.') + next = clone(self) + current_frame = next.get_current_frame() + # if current_frame['quantifier'] is not None: + # raise Exception(unable_to_quantify("between", current_frame['quantifier']['type'])) + current_frame['quantifier'] = t['between'](x, y) + return next + + def between_lazy(self, x, y): + assertion(type(x) is int and x >= 0, must_be_integer_greater_than_zero('x')) + assertion(type(y) is int and y > 0, must_be_positive_integer('y')) + assertion(x < y, 'X must be less than Y.') + next = clone(self) + current_frame = next.get_current_frame() + # if current_frame['quantifier'] is not None: + # raise Exception(unable_to_quantify("between_lazy", current_frame['quantifier']['type'])) + current_frame['quantifier'] = t['between_lazy'](x, y) + return next + + def start_of_input(self): + assertion(self.state['has_defined_start'] is False, start_input_already_defined()) + assertion(self.state['has_defined_end'] is False, cannot_define_start_after_end()) + next = clone(self) + next.state['has_defined_start'] = True + next.get_current_element_array().append(t['start_of_input']) + return next + + def end_of_input(self): + assertion(self.state['has_defined_end'] is False, end_input_already_defined()) + next = clone(self) + next.state['has_defined_end'] = True + next.get_current_element_array().append(t['end_of_input']) + return next + + def any_of_chars(self, chars): + next = clone(self) + element_value = t['any_of_chars'](escape_special(chars)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def end(self): + assertion(len(self.state['stack']) > 1, 
can_not_end_while_building_root_exp()) + next = clone(self) + old_frame = next.state['stack'].pop() + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(old_frame['type']['value'](old_frame['elements']))) + return next + + def anything_but_string(self, string): + assertion(type(string) is str, must_be_a_string('Value', string)) + assertion(len(string) > 0, must_be_one_character('Value')) + next = clone(self) + element_value = t['anything_but_string'](escape_special(string)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def anything_but_chars(self, chars): + assertion(type(chars) is str, must_be_a_string('Value', chars)) + assertion(len(chars) > 0, must_be_one_character('Value')) + next = clone(self) + element_value = t['anything_but_chars'](escape_special(chars)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def anything_but_range(self, a, b): + str_a = str(a) + str_b = str(b) + assertion(len(str_a) == 1, must_be_single_character('a', str_a)) + assertion(len(str_b) == 1, must_be_single_character('b', str_b)) + assertion(ord(str_a) < ord(str_b), must_have_a_smaller_value(str_a, str_b)) + next = clone(self) + element_value = t['anything_but_range']([a, b]) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def string(self, s): + assertion(type(s) is str, must_be_a_string('Value', s)) + assertion(len(s) > 0, must_be_one_character('Value')) + next = clone(self) + element_value = t['string'](escape_special(s)) if len(s) > 1 else t['char'](escape_special(s)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def char(self, c): + assertion(type(c) is str, must_be_a_string('Value', c)) + 
assertion(len(c) == 1, must_be_single_character('Value', c)) + next = clone(self) + element_value = t['char'](escape_special(c)) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def range(self, a, b): + str_a = str(a) + str_b = str(b) + assertion(len(str_a) == 1, must_be_single_character('a', str_a)) + assertion(len(str_b) == 1, must_be_single_character('b', str_b)) + assertion(ord(str_a) < ord(str_b), must_have_a_smaller_value(str_a, str_b)) + next = clone(self) + element_value = t['range']([a, b]) + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(element_value)) + return next + + def merge_subexpression(self, el, options, parent, increment_capture_groups): + next_el = clone(el) + + if next_el['type'] == 'back_reference': + next_el['index'] += parent.state['total_capture_groups'] + if next_el['type'] == 'capture': + increment_capture_groups() + if next_el['type'] == 'named_capture': + group_name = '{}{}'.format(options['namespace'], next_el['name']) if options['namespace'] else next_el['name'] + # parent.tracked_named_group(group_name) + next_el['name'] = group_name + if next_el['type'] == 'named_back_reference': + next_el['name'] = '{}{}'.format(options['namespace'], next_el['name']) if options['namespace'] else next_el['name'] + if 'contains_child' in next_el: + next_el['value'] = self.merge_subexpression(next_el['value'], options, parent, increment_capture_groups) + elif 'contains_children' in next_el: + next_el['value'] = list(map(lambda e: self.merge_subexpression(e, options, parent, increment_capture_groups), next_el['value'])) + if next_el['type'] == 'start_of_input': + if options['ignore_start_and_end']: + return t['noop'] + assertion(parent.state['has_defined_start'] is False, str(start_input_already_defined()) + " " + str(ignore_se())) + # assertion(parent.state['has_defined_end'] is False, 
str(end_input_already_defined()) + " " + str(ignore_se())) + # parent.state['has_defined_start'] = True + if next_el['type'] == 'end_of_input': + if options['ignore_start_and_end']: + return t['noop'] + assertion(parent.state['has_defined_end'] is False, str(end_input_already_defined()) + str(ignore_se())) + # parent.state['has_defined_end'] = True + return next_el + + def subexpression(self, expr, opts={}): + assertion(isinstance(expr, RegexBuilder), must_be_instance("Expression", expr, "RegexBuilder")) + assertion(len(expr.state['stack']) == 1, can_not_call_se(expr.get_current_frame()['type']['type'])) + options = apply_subexpression_defaults(opts) + expr_next = clone(expr) + next = clone(self) + additional_capture_groups = 0 + expr_frame = expr_next.get_current_frame() + + def increment_capture_groups(): + nonlocal additional_capture_groups + additional_capture_groups += 1 + + expr_frame['elements'] = list( + map(lambda e: self.merge_subexpression(e, options, expr_next, increment_capture_groups), expr_frame['elements']) + ) + next.state['total_capture_groups'] += additional_capture_groups + if not options['ignore_flags']: + for flag_name, enabled in expr_next.state['flags'].items(): + next.state['flags'][flag_name] = enabled or next.state['flags'][flag_name] + current_frame = next.get_current_frame() + current_frame['elements'].append(next.apply_quantifier(t['subexpression'](expr_frame['elements']))) + return next + + def evaluate(self, el): + if el['type'] == 'noop': + return '' + if el['type'] == 'any_char': + return '.' 
+ if el['type'] == 'whitespace_char': + return '\\s' + if el['type'] == 'non_whitespace_char': + return '\\S' + if el['type'] == 'digit': + return '\\d' + if el['type'] == 'non_digit': + return '\\D' + if el['type'] == 'word': + return '\\w' + if el['type'] == 'non_word': + return '\\W' + if el['type'] == 'word_boundary': + return '\\b' + if el['type'] == 'non_word_boundary': + return '\\B' + if el['type'] == 'start_of_input': + return '^' + if el['type'] == 'end_of_input': + return '$' + if el['type'] == 'new_line': + return '\\n' + if el['type'] == 'carriage_return': + return '\\r' + if el['type'] == 'tab': + return '\\t' + if el['type'] == 'null_byte': + return '\\0' + if el['type'] == 'string': + return el['value'] + if el['type'] == 'char': + return el['value'] + if el['type'] == 'range': + return '[{}-{}]'.format(el['value'][0], el['value'][1]) + if el['type'] == 'anything_but_range': + return '[^{}-{}]'.format(el['value'][0], el['value'][1]) + if el['type'] == 'any_of_chars': + return '[' + ''.join(el['value']) + ']' + if el['type'] == 'anything_but_chars': + return '[^' + ''.join(el['value']) + ']' + if el['type'] == 'named_back_reference': + return '\\k<{}>'.format(el['name']) + if el['type'] == 'back_reference': + return '\\{}'.format(el['index']) + if el['type'] == 'subexpression': + return ''.join(map(lambda e: self.evaluate(e), el['value'])) + cg1 = ['optional', 'zero_or_more', 'zero_or_more_lazy', 'one_or_more', 'one_or_more_lazy'] + if el['type'] in cg1: + inner = self.evaluate(el['value']) + with_group = "(?:{})".format(inner) if 'quantifiers_require_group' in el['value'] else inner + symbol = quantifier_table[el['type']] + return '{}{}'.format(with_group, symbol) + cg2 = ['between', 'between_lazy', 'at_least', 'exactly'] + if el['type'] in cg2: + inner = self.evaluate(el['value']) + with_group = "(?:{})".format(inner) if 'quantifiers_require_group' in el['value'] else inner + return '{}{}'.format(with_group, 
quantifier_table[el['type']](el['times'])) + if el['type'] == 'anything_but_string': + chars = ''.join(map(lambda c: '[^{}]'.format(c), el['value'])) + return '(?:{})'.format(chars) + if el['type'] == 'assert_ahead': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?={})'.format(evaluated) + if el['type'] == 'assert_behind': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?<={})'.format(evaluated) + if el['type'] == 'assert_not_ahead': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?!{})'.format(evaluated) + if el['type'] == 'assert_not_behind': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?<!{})'.format(evaluated) + if el['type'] == 'any_of': + [fused, rest] = fuse_elements(el['value']) + if len(rest) == 0: + return '[{}]'.format(fused) + evaluated_rest = list(map(lambda e: self.evaluate(e), rest)) + separator = '|' if len(evaluated_rest) > 0 and len(fused) > 0 else '' + return '(?:{}{}{})'.format('|'.join(evaluated_rest), separator, '[{}]'.format(fused) if fused else '') + if el['type'] == 'capture': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '({})'.format(evaluated) + if el['type'] == 'named_capture': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?P<{}>{})'.format(el['name'], evaluated) + if el['type'] == 'group': + evaluated = ''.join(map(lambda e: self.evaluate(e), el['value'])) + return '(?:{})'.format(evaluated) + + raise Exception('Can not process unsupported element type: {}'.format(el['type'])) # pragma: no cover + + def get_regex_patterns_and_flags(self): + assertion(len(self.state['stack']) == 1, can_not_call_se(self.get_current_frame()['type']['type'])) + pattern = "".join(list(map(lambda e: self.evaluate(e), self.get_current_element_array()))) + flags = "" + for flag_name, enabled in self.state['flags'].items(): + if enabled: + flags += flag_name + pattern = 
"(?:)" if pattern == "" else pattern + flags = "".join(sorted(flags)) + return pattern, flags + + def to_regex_string(self): + patterns, flags = self.get_regex_patterns_and_flags() + return '/{}/{}'.format(str(patterns.replace('\\ ', ' ')), str(flags)) + + def to_regex(self): + patterns, flags = self.get_regex_patterns_and_flags() + patterns = r"{}".format(patterns.replace("\\ ", ' ')) + flag = 0 + if flags != '': + for flag_name in flags: + if flag_name == 'D': + flag |= getattr(re, 'DEBUG') + else: + flag |= getattr(re, flag_name) + + try: + return re.compile(patterns, flags=flag) + except Exception as e: + raise Exception('Can not compile regex: {}'.format(e)) diff --git a/src/edify/builder/errors.py b/src/edify/builder/errors.py new file mode 100644 index 0000000..17a2bef --- /dev/null +++ b/src/edify/builder/errors.py @@ -0,0 +1,73 @@ +def must_be_a_string(value, variable_name): + return '{} must be a string. (got {})'.format(value, type(variable_name)) + + +def must_be_one_character(variable_name): + return '{} must be one character long.'.format(variable_name) + + +def cannot_create_duplicate_named_group(name): + return 'Can not create duplicate named group "{}".'.format(name) + + +def name_not_valid(name): + return 'Name {} is not valid. (only alphanumeric characters and underscores are allowed)'.format(name) + + +def named_group_does_not_exist(name): + return 'Named group "{}" does not exist (create one with .named_capture()).'.format(name) + + +def invalid_total_capture_groups_index(index, total_capture_groups): + return 'Invalid index #{}. 
There are only {} capture groups.'.format(index, total_capture_groups) + + +def must_be_positive_integer(variable_name): + return '{} must be a positive integer.'.format(variable_name) + + +def must_be_integer_greater_than_zero(variable_name): + return '{} must be an integer greater than zero.'.format(variable_name) + + +# def unable_to_quantify(quantifier, type): +# return 'Can not quantify regular expression with {}, because it has already been quantified with {}.'.format(quantifier, type) + + +def start_input_already_defined(): + return 'Regex already has a start of input.' + + +def cannot_define_start_after_end(): + return 'Can not define a start of input after defining an end of input.' + + +def end_input_already_defined(): + return 'Regex already has an end of input.' + + +def can_not_end_while_building_root_exp(): + return 'Can not end while building the root expression.' + + +def must_be_single_character(value, variable_name): + return '{} must be a single character. (got {})'.format(value, type(variable_name)) + + +def must_have_a_smaller_value(a, b): + return '{} must have a smaller character value than {}. (a = {}, b = {})'.format(a, b, ord(a), ord(b)) + + +def ignore_se(): + return 'You can ignore a subexpressions start_of_input/end_of_input markers with the ignore_start_and_end option' + + +def must_be_instance(value, variable_name, class_name): + return '{} must be an instance of {}. (got {})'.format(value, class_name, type(variable_name)) + + +def can_not_call_se(cft): + return "Can not call subexpression a not yet fully specified regex object. 
\ + \n (Try adding a .end() call to match the {} on the subexpression)".format( + cft + ) diff --git a/src/edify/builder/helpers/core.py b/src/edify/builder/helpers/core.py new file mode 100644 index 0000000..f8cabe3 --- /dev/null +++ b/src/edify/builder/helpers/core.py @@ -0,0 +1,79 @@ +import re + + +def as_type(type, opts={}): + def type_fn(value=None): + return { + 'type': type, + 'value': value, + **opts, + } + + return type_fn + + +def deferred_type(type, opts={}): + type_fn = as_type(type, opts) + return type_fn(type_fn) + + +def create_stack_frame(type): + return { + 'type': type, + 'quantifier': None, + 'elements': [], + } + + +def assertion(condition, message): + if not condition: + raise Exception(message) + + +def escape_special(s): + return re.escape(s) + + +# def deep_copy(o): +# if isinstance(o, list): +# return [deep_copy(e) for e in o] +# if isinstance(o, dict): +# return {k: deep_copy(v) for k, v in o.items()} +# return o + + +def apply_subexpression_defaults(expr): + out = {**expr} + out['namespace'] = "" if 'namespace' not in out else out['namespace'] + out['ignore_flags'] = True if 'ignore_flags' not in out else out['ignore_flags'] + out['ignore_start_and_end'] = True if 'ignore_start_and_end' not in out else out['ignore_start_and_end'] + assertion(type(out['namespace']) == str, 'namespace must be a string') + assertion(type(out['ignore_flags']) == bool, 'ignore_flags must be a boolean') + assertion(type(out['ignore_start_and_end']) == bool, 'ignore_start_and_end must be a boolean') + return out + + +def is_fusable(element): + return element['type'] == 'range' or element['type'] == 'char' or element['type'] == 'any_of_chars' + + +def partition(pred, a): + result = [[], []] + for cur in a: + if pred(cur): + result[0].append(cur) + else: + result[1].append(cur) + return result + + +def fuse_elements(elements): + [fusables, rest] = partition(is_fusable, elements) + + def map_el(el): + if el['type'] == 'char' or el['type'] == 'any_of_chars': + 
return el['value'] + return '{}-{}'.format(el['value'][0], el['value'][1]) + + fused = ''.join(map(map_el, fusables)) + return [fused, rest] diff --git a/src/edify/builder/helpers/quantifiers.py b/src/edify/builder/helpers/quantifiers.py new file mode 100644 index 0000000..1810658 --- /dev/null +++ b/src/edify/builder/helpers/quantifiers.py @@ -0,0 +1,11 @@ +quantifier_table = { + 'one_or_more': '+', + 'one_or_more_lazy': '+?', + 'zero_or_more': '*', + 'zero_or_more_lazy': '*?', + 'optional': '?', + 'exactly': lambda times: '{{{}}}'.format(times), + 'at_least': lambda times: '{{{},}}'.format(times), + 'between': lambda times: '{{{},{}}}'.format(times[0], times[1]), + 'between_lazy': lambda times: '{{{},{}}}?'.format(times[0], times[1]), +} diff --git a/src/edify/builder/helpers/regex_vars.py b/src/edify/builder/helpers/regex_vars.py new file mode 100644 index 0000000..ebf8f28 --- /dev/null +++ b/src/edify/builder/helpers/regex_vars.py @@ -0,0 +1 @@ +named_group_regex = r"^[a-z]+\w*$" diff --git a/src/edify/builder/helpers/t.py b/src/edify/builder/helpers/t.py new file mode 100644 index 0000000..ab3304c --- /dev/null +++ b/src/edify/builder/helpers/t.py @@ -0,0 +1,49 @@ +from .core import as_type +from .core import deferred_type + +t = { + 'root': as_type('root')(), + 'noop': as_type('noop')(), + 'start_of_input': as_type('start_of_input')(), + 'end_of_input': as_type('end_of_input')(), + 'any_char': as_type('any_char')(), + 'whitespace_char': as_type('whitespace_char')(), + 'non_whitespace_char': as_type('non_whitespace_char')(), + 'digit': as_type('digit')(), + 'non_digit': as_type('non_digit')(), + 'word': as_type('word')(), + 'non_word': as_type('non_word')(), + 'word_boundary': as_type('word_boundary')(), + 'non_word_boundary': as_type('non_word_boundary')(), + 'new_line': as_type('new_line')(), + 'carriage_return': as_type('carriage_return')(), + 'tab': as_type('tab')(), + 'null_byte': as_type('null_byte')(), + 'any_of_chars': as_type('any_of_chars'), + 
'anything_but_string': as_type('anything_but_string'), + 'anything_but_chars': as_type('anything_but_chars'), + 'anything_but_range': as_type('anything_but_range'), + 'char': as_type('char'), + 'range': as_type('range'), + 'string': as_type('string', {'quantifiers_require_group': True}), + 'named_back_reference': lambda name: deferred_type('named_back_reference', {'name': name}), + 'back_reference': lambda index: deferred_type('back_reference', {'index': index}), + 'capture': deferred_type('capture', {'contains_children': True}), + 'subexpression': as_type('subexpression', {'contains_children': True, 'quantifiers_require_group': True}), + 'named_capture': lambda name: deferred_type('named_capture', {'name': name, 'contains_children': True}), + 'group': deferred_type('group', {'contains_children': True}), + 'any_of': deferred_type('any_of', {'contains_children': True}), + 'assert_ahead': deferred_type('assert_ahead', {'contains_children': True}), + 'assert_not_ahead': deferred_type('assert_not_ahead', {'contains_children': True}), + 'assert_behind': deferred_type('assert_behind', {'contains_children': True}), + 'assert_not_behind': deferred_type('assert_not_behind', {'contains_children': True}), + 'exactly': lambda times: deferred_type('exactly', {'times': times, 'contains_child': True}), + 'at_least': lambda times: deferred_type('at_least', {'times': times, 'contains_child': True}), + 'between': lambda x, y: deferred_type('between', {'times': [x, y], 'contains_child': True}), + 'between_lazy': lambda x, y: deferred_type('between_lazy', {'times': [x, y], 'contains_child': True}), + 'zero_or_more': deferred_type('zero_or_more', {'contains_child': True}), + 'zero_or_more_lazy': deferred_type('zero_or_more_lazy', {'contains_child': True}), + 'one_or_more': deferred_type('one_or_more', {'contains_child': True}), + 'one_or_more_lazy': deferred_type('one_or_more_lazy', {'contains_child': True}), + 'optional': deferred_type('optional', {'contains_child': True}), +} diff 
--git a/src/edify/library/__init__.py b/src/edify/library/__init__.py new file mode 100644 index 0000000..5411d36 --- /dev/null +++ b/src/edify/library/__init__.py @@ -0,0 +1,5 @@ +# flake8: noqa + +# Import everything from the library. +from .mail import email +from .phone import phone_number diff --git a/src/edify/library/mail.py b/src/edify/library/mail.py new file mode 100644 index 0000000..ecb0cf2 --- /dev/null +++ b/src/edify/library/mail.py @@ -0,0 +1,18 @@ +import re + +pattern = r"^[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$" # noqa + + +def email(email: str) -> bool: + """Checks if a string is a valid email address. + + Args: + email (str): The string to check. + Returns: + bool: True if the string is a valid email address, False otherwise. + """ + + if re.match(pattern, email): + return True + else: + return False diff --git a/src/edify/library/phone.py b/src/edify/library/phone.py new file mode 100644 index 0000000..90ab535 --- /dev/null +++ b/src/edify/library/phone.py @@ -0,0 +1,18 @@ +import re + +pattern = "^\\+?\\d{1,4}?[-.\\s]?\\(?\\d{1,3}?\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,9}$" + + +def phone_number(phone: str) -> bool: + """Checks if a string is a valid phone number. + + Args: + phone (str): The string to check. + Returns: + bool: True if the string is a valid phone number, False otherwise. + """ + + if re.match(pattern, phone): + return True + else: + return False diff --git a/src/edify/main.py b/src/edify/main.py deleted file mode 100644 index ab326ce..0000000 --- a/src/edify/main.py +++ /dev/null @@ -1,4 +0,0 @@ -# Edify Package - -def main(): - return 0 diff --git a/tests.local.sh b/tests.local.sh new file mode 100755 index 0000000..031d6e4 --- /dev/null +++ b/tests.local.sh @@ -0,0 +1,37 @@ +# Clean tox environment +tox -e clean + +# Sort imports +isort . 
+ +# Run Tests +tox -e check -v + +# Run Docs +tox -e docs -v + +# Get the current installed python version +PYTHON_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[:3])))') + +# Subset the python version to the major.minor version +PYTHON_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) + +if [ "$PYTHON_VERSION" = "3.7" ]; then + # Build using python 3.7 + tox -e py37 -v +elif [ "$PYTHON_VERSION" = "3.8" ]; then + # Build using python 3.8 + tox -e py38 -v +elif [ "$PYTHON_VERSION" = "3.9" ]; then + # Build using python 3.9 + tox -e py39 -v +elif [ "$PYTHON_VERSION" = "3.10" ]; then + # Build using python 3.10 + tox -e py310 -v +else + # Show error message + echo "Python version $PYTHON_VERSION is not supported" +fi + +# Run Coverage +tox -e report -v diff --git a/tests/test_builder.py b/tests/test_builder.py new file mode 100644 index 0000000..76a04b5 --- /dev/null +++ b/tests/test_builder.py @@ -0,0 +1,518 @@ +import re + +from edify import RegexBuilder + +simple_se = RegexBuilder().string('hello').any_char().string('world') +flags_se = RegexBuilder().multi_line().ignore_case().string('hello').any_char().string('world') +start_end_se = RegexBuilder().start_of_input().string('hello').any_char().string('world').end_of_input() +nc_se = RegexBuilder().named_capture('module').exactly(2).any_char().end().named_back_reference('module') +indexed_back_reference_se = RegexBuilder().capture().exactly(2).any_char().end().back_reference(1) +nested_se = RegexBuilder().exactly(2).any_char() +first_layer_se = ( + RegexBuilder().string('outer begin').named_capture('inner_subexpression').optional().subexpression(nested_se).end().string('outer end') +) + + +def regex_equality(regex, rb_expression): + regex_str = str(regex) + rb_expression_str = rb_expression.to_regex_string() + assert regex_str == str(rb_expression_str) + + +def regex_compilation(regex, rb_expression, f=0): + rb_expression_c = rb_expression.to_regex() + assert re.compile(regex, flags=f) == 
rb_expression_c + + +def test_empty_regex(): + expr = RegexBuilder() + regex_equality('/(?:)/', expr) + regex_compilation('(?:)', expr) + + +def test_flag_a(): + expr = RegexBuilder().ascii_only() + regex_equality('/(?:)/A', expr) + regex_compilation('(?:)', expr, re.A) + + +def test_flag_d(): + expr = RegexBuilder().debug() + regex_equality('/(?:)/D', expr) + regex_compilation('(?:)', expr, re.DEBUG) + + +def test_flag_i(): + expr = RegexBuilder().ignore_case() + regex_equality('/(?:)/I', expr) + regex_compilation('(?:)', expr, re.I) + + +def test_flag_m(): + expr = RegexBuilder().multi_line() + regex_equality('/(?:)/M', expr) + regex_compilation('(?:)', expr, re.M) + + +def test_flag_s(): + expr = RegexBuilder().dot_all() + regex_equality('/(?:)/S', expr) + regex_compilation('(?:)', expr, re.S) + + +def test_flag_x(): + expr = RegexBuilder().verbose() + regex_equality('/(?:)/X', expr) + regex_compilation('(?:)', expr, re.X) + + +def test_any_char(): + expr = RegexBuilder().any_char() + regex_equality('/./', expr) + regex_compilation('.', expr) + + +def test_whitespace_char(): + expr = RegexBuilder().whitespace_char() + regex_equality('/\\s/', expr) + regex_compilation('\\s', expr) + + +def test_non_whitespace_char(): + expr = RegexBuilder().non_whitespace_char() + regex_equality('/\\S/', expr) + regex_compilation('\\S', expr) + + +def test_digit(): + expr = RegexBuilder().digit() + regex_equality('/\\d/', expr) + regex_compilation('\\d', expr) + + +def test_non_digit(): + expr = RegexBuilder().non_digit() + regex_equality('/\\D/', expr) + regex_compilation('\\D', expr) + + +def test_word(): + expr = RegexBuilder().word() + regex_equality('/\\w/', expr) + regex_compilation('\\w', expr) + + +def test_non_word(): + expr = RegexBuilder().non_word() + regex_equality('/\\W/', expr) + regex_compilation('\\W', expr) + + +def test_word_boundary(): + expr = RegexBuilder().word_boundary() + regex_equality('/\\b/', expr) + regex_compilation('\\b', expr) + + +def 
test_non_word_boundary(): + expr = RegexBuilder().non_word_boundary() + regex_equality('/\\B/', expr) + regex_compilation('\\B', expr) + + +def test_new_line(): + expr = RegexBuilder().new_line() + regex_equality('/\\n/', expr) + regex_compilation('\\n', expr) + + +def test_carriage_return(): + expr = RegexBuilder().carriage_return() + regex_equality('/\\r/', expr) + regex_compilation('\\r', expr) + + +def test_tab(): + expr = RegexBuilder().tab() + regex_equality('/\\t/', expr) + regex_compilation('\\t', expr) + + +def test_null_byte(): + expr = RegexBuilder().null_byte() + regex_equality('/\\0/', expr) + regex_compilation('\\0', expr) + + +def test_any_of_basic(): + expr = RegexBuilder().any_of().string('hello').digit().word().char('.').char('#').end() + regex_equality('/(?:hello|\\d|\\w|[\\.\\#])/', expr) + regex_compilation('(?:hello|\\d|\\w|[\\.\\#])', expr) + + +def test_any_of_range_fusion(): + expr = RegexBuilder().any_of().range('a', 'z').range('A', 'Z').range('0', '9').char('.').char('#').end() + regex_equality('/[a-zA-Z0-9\\.\\#]/', expr) + regex_compilation('[a-zA-Z0-9\\.\\#]', expr) + + +def test_any_of_range_fusion_with_other_choices(): + expr = RegexBuilder().any_of().range('a', 'z').range('A', 'Z').range('0', '9').char('.').char('#').string('hello').end() + regex_equality('/(?:hello|[a-zA-Z0-9\\.\\#])/', expr) + regex_compilation('(?:hello|[a-zA-Z0-9\\.\\#])', expr) + + +def test_capture(): + expr = RegexBuilder().capture().string('hello ').word().char('!').end() + regex_equality('/(hello \\w!)/', expr) + regex_compilation('(hello \\w!)', expr) + + +def test_named_capture(): + expr = RegexBuilder().named_capture('this_is_the_name').string('hello ').word().char('!').end() + regex_equality('/(?P<this_is_the_name>hello \\w!)/', expr) + regex_compilation('(?P<this_is_the_name>hello \\w!)', expr) + + +def test_bad_name_error(): + try: + (RegexBuilder().named_capture('hello world').string('hello ').word().char('!').end()) + except Exception as e: + assert 
isinstance(e, Exception) + + +def test_same_name_error(): + try: + ( + RegexBuilder() + .namedCapture('hello') + .string('hello ') + .word() + .char('!') + .end() + .namedCapture('hello') + .string('hello ') + .word() + .char('!') + .end() + ) + except Exception as e: + assert isinstance(e, Exception) + + +def test_named_back_reference(): + expr = RegexBuilder().named_capture('this_is_the_name').string('hello ').word().char('!').end().named_back_reference('this_is_the_name') + regex_equality('/(?P<this_is_the_name>hello \\w!)\\k<this_is_the_name>/', expr) + # Python does not support named back references, so we raise an error + try: + expr.to_regex() + except Exception as e: + assert isinstance(e, Exception) + + +def test_named_back_reference_no_cg_exists(): + try: + RegexBuilder().named_back_reference('not_here') + except Exception as e: + assert isinstance(e, Exception) + + +def test_back_reference(): + expr = RegexBuilder().capture().string('hello ').word().char('!').end().back_reference(1) + regex_equality('/(hello \\w!)\\1/', expr) + regex_compilation('(hello \\w!)\\1', expr) + + +def test_back_reference_no_cg_exists(): + try: + RegexBuilder().back_reference(1) + except Exception as e: + assert isinstance(e, Exception) + + +def test_group(): + expr = RegexBuilder().group().string('hello ').word().char('!').end() + regex_equality('/(?:hello \\w!)/', expr) + regex_compilation('(?:hello \\w!)', expr) + + +def test_error_when_called_with_no_stack(): + try: + RegexBuilder().end() + except Exception as e: + assert isinstance(e, Exception) + + +def test_assert_ahead(): + expr = RegexBuilder().assert_ahead().range('a', 'f').end().range('a', 'z') + regex_equality('/(?=[a-f])[a-z]/', expr) + regex_compilation('(?=[a-f])[a-z]', expr) + + +def test_assert_behind(): + expr = RegexBuilder().assert_behind().string('hello ').end().range('a', 'z') + regex_equality('/(?<=hello )[a-z]/', expr) + regex_compilation('(?<=hello )[a-z]', expr) + + +def test_assert_not_ahead(): + expr 
= RegexBuilder().assert_not_ahead().range('a', 'f').end().range('0', '9') + regex_equality('/(?![a-f])[0-9]/', expr) + regex_compilation('(?![a-f])[0-9]', expr) + + +def test_assert_not_behind(): + expr = RegexBuilder().assert_not_behind().string('hello ').end().range('a', 'z') + regex_equality('/(?<!hello )[a-z]/', expr) + regex_compilation('(?<!hello )[a-z]', expr) + + +def test_optional(): + expr = RegexBuilder().optional().word() + regex_equality('/\\w?/', expr) + regex_compilation('\\w?', expr) + + +def test_zero_or_more(): + expr = RegexBuilder().zero_or_more().word() + regex_equality('/\\w*/', expr) + regex_compilation('\\w*', expr) + + +def test_zero_or_more_lazy(): + expr = RegexBuilder().zero_or_more_lazy().word() + regex_equality('/\\w*?/', expr) + regex_compilation('\\w*?', expr) + + +def test_one_or_more(): + expr = RegexBuilder().one_or_more().word() + regex_equality('/\\w+/', expr) + regex_compilation('\\w+', expr) + + +def test_one_or_more_lazy(): + expr = RegexBuilder().one_or_more_lazy().word() + regex_equality('/\\w+?/', expr) + regex_compilation('\\w+?', expr) + + +def test_exactly(): + expr = RegexBuilder().exactly(3).word() + regex_equality('/\\w{3}/', expr) + regex_compilation('\\w{3}', expr) + + +def test_at_least(): + expr = RegexBuilder().at_least(3).word() + regex_equality('/\\w{3,}/', expr) + regex_compilation('\\w{3,}', expr) + + +def test_between(): + expr = RegexBuilder().between(3, 5).word() + regex_equality('/\\w{3,5}/', expr) + regex_compilation('\\w{3,5}', expr) + + +def test_between_lazy(): + expr = RegexBuilder().between_lazy(3, 5).word() + regex_equality('/\\w{3,5}?/', expr) + regex_compilation('\\w{3,5}?', expr) + + +def test_start_of_input(): + expr = RegexBuilder().start_of_input() + regex_equality('/^/', expr) + regex_compilation('^', expr) + + +def test_end_of_input(): + expr = RegexBuilder().end_of_input() + regex_equality('/$/', expr) + regex_compilation('$', expr) + + +def test_any_of_chars(): + expr = 
RegexBuilder().any_of_chars('aeiou.-') + regex_equality('/[aeiou\\.\\-]/', expr) + regex_compilation('[aeiou\\.\\-]', expr) + + +def test_anything_but_chars(): + expr = RegexBuilder().anything_but_chars('aeiou.-') + regex_equality('/[^aeiou\\.\\-]/', expr) + regex_compilation('[^aeiou\\.\\-]', expr) + + +def test_anything_but_string(): + expr = RegexBuilder().anything_but_string('aeiou.') + regex_equality('/(?:[^a][^e][^i][^o][^u][^\\][^.])/', expr) + regex_compilation('(?:[^a][^e][^i][^o][^u][^\\][^.])', expr) + + +def test_anything_but_range(): + expr = RegexBuilder().anything_but_range('a', 'z') + regex_equality('/[^a-z]/', expr) + regex_compilation('[^a-z]', expr) + expr = RegexBuilder().anything_but_range('0', '9') + regex_equality('/[^0-9]/', expr) + regex_compilation('[^0-9]', expr) + + +def test_string(): + expr = RegexBuilder().string('hello') + regex_equality('/hello/', expr) + regex_compilation('hello', expr) + + +def test_string_escapes_special_chars_with_strings_of_len_1(): + expr = RegexBuilder().string('^').string('hello') + regex_equality('/\\^hello/', expr) + regex_compilation('\\^hello', expr) + + +def test_char(): + expr = RegexBuilder().char('a') + regex_equality('/a/', expr) + regex_compilation('a', expr) + + +def test_char_more_than_one_error(): + try: + RegexBuilder().char('hello') + except Exception as e: + assert isinstance(e, Exception) + + +def test_range(): + expr = RegexBuilder().range('a', 'z') + regex_equality('/[a-z]/', expr) + regex_compilation('[a-z]', expr) + + +def test_must_be_instance_error(): + try: + RegexBuilder().subexpression('nope') + except Exception as e: + assert isinstance(e, Exception) + + +def test_simple_se(): + expr = RegexBuilder().start_of_input().at_least(3).digit().subexpression(simple_se).range('0', '9').end_of_input() + regex_equality('/^\\d{3,}hello.world[0-9]$/', expr) + regex_compilation('^\\d{3,}hello.world[0-9]$', expr) + + +def test_simple_quantified_se(): + expr = 
RegexBuilder().start_of_input().at_least(3).digit().one_or_more().subexpression(simple_se).range('0', '9').end_of_input() + regex_equality('/^\\d{3,}(?:hello.world)+[0-9]$/', expr) + regex_compilation('^\\d{3,}(?:hello.world)+[0-9]$', expr) + + +def test_flags_se(): + expr = ( + RegexBuilder() + .dot_all() + .start_of_input() + .at_least(3) + .digit() + .subexpression(flags_se, {'ignore_flags': False}) + .range('0', '9') + .end_of_input() + ) + regex_equality('/^\\d{3,}hello.world[0-9]$/IMS', expr) + regex_compilation('^\\d{3,}hello.world[0-9]$', expr, f=re.M | re.I | re.S) + + +def test_flags_se_ignore_flags(): + expr = RegexBuilder().dot_all().start_of_input().at_least(3).digit().subexpression(flags_se).range('0', '9').end_of_input() + regex_equality('/^\\d{3,}hello.world[0-9]$/S', expr) + regex_compilation('^\\d{3,}hello.world[0-9]$', expr, f=re.S) + + +def test_ignore_start_and_end(): + expr = RegexBuilder().at_least(3).digit().subexpression(start_end_se).range('0', '9') + regex_equality('/\\d{3,}hello.world[0-9]/', expr) + regex_compilation('\\d{3,}hello.world[0-9]', expr) + + +def test_dont_ignore_start_and_end(): + try: + (RegexBuilder().at_least(3).digit().subexpression(start_end_se, {'ignore_start_and_end': False}).range('0', '9')) + except Exception as e: + assert isinstance(e, Exception) + + +def test_dont_ignore_start_and_end2(): + try: + se = RegexBuilder().start_of_input().string('hello').any_char().string('world') + (RegexBuilder().at_least(3).digit().subexpression(se, {'ignore_start_and_end': False}).range('0', '9')) + except Exception as e: + assert isinstance(e, Exception) + + +def test_dont_ignore_start_and_end3(): + try: + se = RegexBuilder().string('hello').any_char().string('world').end_of_input() + (RegexBuilder().at_least(3).digit().subexpression(se, {'ignore_start_and_end': False}).range('0', '9')) + except Exception as e: + assert isinstance(e, Exception) + + +def test_start_defined_in_me_and_se(): + try: + 
(RegexBuilder().start_of_input().at_least(3).digit().subexpression(start_end_se, {'ignore_start_and_end': False}).range('0', '9')) + except Exception as e: + assert isinstance(e, Exception) + + +def test_end_defined_in_me_and_se(): + try: + (RegexBuilder().at_least(3).digit().subexpression(start_end_se, {'ignore_start_and_end': False}).range('0', '9').end_of_input()) + except Exception as e: + assert isinstance(e, Exception) + + +def test_no_namespacing(): + expr = RegexBuilder().at_least(3).digit().subexpression(nc_se).range('0', '9') + regex_equality('/\\d{3,}(?P<module>.{2})\\k<module>[0-9]/', expr) + try: + expr.to_regex() + except Exception as e: + assert isinstance(e, Exception) + + +def test_namespacing(): + expr = RegexBuilder().at_least(3).digit().subexpression(nc_se, {'namespace': 'yolo'}).range('0', '9') + regex_equality('/\\d{3,}(?P<yolomodule>.{2})\\k<yolomodule>[0-9]/', expr) + try: + expr.to_regex() + except Exception as e: + assert isinstance(e, Exception) + + +def test_group_name_collision_error(): + try: + (RegexBuilder().namedCapture('module').at_least(3).digit().end().subexpression(nc_se).range('0', '9')) + except Exception as e: + assert isinstance(e, Exception) + + +def test_group_name_collision_error_after_namespacing(): + try: + (RegexBuilder().namedCapture('module').at_least(3).digit().end().subexpression(nc_se, {'namespace': 'yolo'}).range('0', '9')) + except Exception as e: + assert isinstance(e, Exception) + + +def test_indexed_back_referencing(): + expr = RegexBuilder().capture().at_least(3).digit().end().subexpression(indexed_back_reference_se).back_reference(1).range('0', '9') + regex_equality('/(\\d{3,})(.{2})\\2\\1[0-9]/', expr) + regex_compilation('(\\d{3,})(.{2})\\2\\1[0-9]', expr) + + +def test_deeply_nested_se(): + expr = RegexBuilder().capture().at_least(3).digit().end().subexpression(first_layer_se).back_reference(1).range('0', '9') + regex_equality('/(\\d{3,})outer begin(?P<inner_subexpression>(?:.{2})?)outer end\\1[0-9]/', 
expr) + regex_compilation('(\\d{3,})outer begin(?P<inner_subexpression>(?:.{2})?)outer end\\1[0-9]', expr) diff --git a/tests/test_edify.py b/tests/test_edify.py deleted file mode 100644 index bb81dbc..0000000 --- a/tests/test_edify.py +++ /dev/null @@ -1,5 +0,0 @@ -from edify.main import main - - -def test_main(): - assert main() == 0 diff --git a/tests/test_email.py b/tests/test_email.py new file mode 100644 index 0000000..2c90c52 --- /dev/null +++ b/tests/test_email.py @@ -0,0 +1,56 @@ +from edify.library import email + + +def test(): + emails = [ + "[email protected]", + "[email protected]", + "[email protected]", + "[email protected]", + "[email protected]", + "[email protected]", + "[email protected]", + "[email protected]", + "[email protected]", + "[email protected]", + "[email protected].", + "plainaddress", + "#@%^%#$@#$@#.com", + "@example.com", + "Joe Smith <[email protected]>", + "email.example.com", + "email@[email protected]", + "[email protected]", + "[email protected]", + "[email protected]", + "あいうえお@example.com", + "[email protected]", + ] + expectations = [ + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False + ] + for i in range(len(emails)): + assert email(emails[i]) == expectations[i] diff --git a/tests/test_phone.py b/tests/test_phone.py new file mode 100644 index 0000000..81168ab --- /dev/null +++ b/tests/test_phone.py @@ -0,0 +1,25 @@ +from edify.library import phone_number + + +def test(): + phones = { + "1234567890": True, + "123 456 7890": True, + "123-456-7890": True, + "123.456.7890": True, + "123 456 7890": True, + "+1 (123) 456-7890": True, + "+1 (123) 456 7890": True, + "+1 (123) 456-7890": True, + "+102 (123) 456-7890": True, + "+91 (123) 456-7890": True, + "90122121": True, + "12345678901": True, + "+1 (124) 232": True, + "+1 (123) 45-890": True, + "+1 (1) 456-7890": True, + "9012": False, 
+ "+1 (615) 243-": False + } + for phone, expectation in phones.items(): + assert phone_number(phone) == expectation @@ -14,7 +14,7 @@ envlist = clean, check, docs, - {py36,py37,py38,py39,py310,pypy37,pypy38}, + {py37,py38,py39,py310,pypy37,pypy38}, report ignore_basepython_conflict = true @@ -22,7 +22,6 @@ ignore_basepython_conflict = true basepython = pypy37: {env:TOXPYTHON:pypy3.7} pypy38: {env:TOXPYTHON:pypy3.8} - py36: {env:TOXPYTHON:python3.6} py37: {env:TOXPYTHON:python3.7} py38: {env:TOXPYTHON:python3.8} py39: {env:TOXPYTHON:python3.9} |
