diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml new file mode 100644 index 0000000..dff64be --- /dev/null +++ b/.github/workflows/build-tests.yml @@ -0,0 +1,14 @@ +name: Build Tests + +on: + pull_request: + branches: [dev, master, main] + workflow_dispatch: + +jobs: + build: + uses: OpenVoiceOS/gh-automations/.github/workflows/build-tests.yml@dev + with: + python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]' + install_extras: 'test' + test_path: 'test' diff --git a/.github/workflows/build_tests.yml b/.github/workflows/build_tests.yml deleted file mode 100644 index f204bb7..0000000 --- a/.github/workflows/build_tests.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Run Build Tests -on: - push: - workflow_dispatch: - -jobs: - build_tests: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - ref: ${{ github.head_ref }} - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Install Build Tools - run: | - python -m pip install build wheel - - name: Install System Dependencies - run: | - sudo apt-get update - sudo apt install python3-dev swig libssl-dev libfann-dev portaudio19-dev libpulse-dev - - name: Build Source Packages - run: | - python setup.py sdist - - name: Build Distribution Packages - run: | - python setup.py bdist_wheel - - name: Install tflite_runtime workaround tflit bug - run: | - pip3 install numpy - pip3 install --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime - - name: Install core repo - run: | - pip install .[audio-backend,mark1,stt,tts,skills_minimal,skills,gui,bus,all] diff --git a/.github/workflows/conventional-label.yaml b/.github/workflows/conventional-label.yml similarity index 77% rename from .github/workflows/conventional-label.yaml rename to .github/workflows/conventional-label.yml index 0a449cb..9894c1b 100644 --- a/.github/workflows/conventional-label.yaml +++ b/.github/workflows/conventional-label.yml @@ -7,4 +7,4 @@ 
jobs: label: runs-on: ubuntu-latest steps: - - uses: bcoe/conventional-release-labels@v1 \ No newline at end of file + - uses: bcoe/conventional-release-labels@v1 diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml new file mode 100644 index 0000000..4528f7e --- /dev/null +++ b/.github/workflows/coverage.yml @@ -0,0 +1,16 @@ +name: Code Coverage + +on: + pull_request: + branches: [dev] + workflow_dispatch: + +jobs: + coverage: + uses: OpenVoiceOS/gh-automations/.github/workflows/coverage.yml@dev + with: + python_version: '3.11' + coverage_source: 'padacioso' + test_path: 'test/' + install_extras: '.[test]' + min_coverage: 0 diff --git a/.github/workflows/install_tests.yml b/.github/workflows/install_tests.yml deleted file mode 100644 index 4aaabea..0000000 --- a/.github/workflows/install_tests.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Run Install Tests -on: - push: - branches: - - master - - dev - workflow_dispatch: - -jobs: - install: - strategy: - max-parallel: 2 - matrix: - python-version: [ 3.7, 3.8, 3.9, "3.10" ] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - name: Install Build Tools - run: | - python -m pip install build wheel - - name: Install System Dependencies - run: | - sudo apt-get update - sudo apt install python3-dev swig libssl-dev - - name: Build Distribution Packages - run: | - python setup.py bdist_wheel - - name: Install package - run: | - pip install .[all] \ No newline at end of file diff --git a/.github/workflows/license_check.yml b/.github/workflows/license_check.yml new file mode 100644 index 0000000..214edaa --- /dev/null +++ b/.github/workflows/license_check.yml @@ -0,0 +1,10 @@ +name: License Check + +on: + pull_request: + branches: [dev] + workflow_dispatch: + +jobs: + license_check: + uses: OpenVoiceOS/gh-automations/.github/workflows/license-check.yml@dev diff --git 
a/.github/workflows/license_tests.yml b/.github/workflows/license_tests.yml deleted file mode 100644 index 7d0c4f6..0000000 --- a/.github/workflows/license_tests.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: Run License Tests -on: - push: - workflow_dispatch: - pull_request: - branches: - - master -jobs: - license_tests: - uses: neongeckocom/.github/.github/workflows/license_tests.yml@master diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..0cb9564 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,13 @@ +name: Lint + +on: + pull_request: + branches: [dev, master, main] + workflow_dispatch: + +jobs: + lint: + uses: OpenVoiceOS/gh-automations/.github/workflows/lint.yml@dev + with: + ruff: true + pre_commit: false # set true if .pre-commit-config.yaml exists diff --git a/.github/workflows/opm-check.yml b/.github/workflows/opm-check.yml new file mode 100644 index 0000000..842e21d --- /dev/null +++ b/.github/workflows/opm-check.yml @@ -0,0 +1,19 @@ +name: OPM Plugin Check + +on: + pull_request: + branches: [dev, master, main] + workflow_dispatch: + +jobs: + opm_check: + uses: OpenVoiceOS/gh-automations/.github/workflows/opm-check.yml@dev + with: + python_version: '3.11' + install_extras: 'extras' + plugin_type: 'auto' + entry_point: '"ovos-padacioso-pipeline-plugin"' + opm_require_found: true + opm_validate_interface: true + opm_test_import: true + opm_perf_threshold_ms: 500 diff --git a/.github/workflows/pip_audit.yml b/.github/workflows/pip_audit.yml new file mode 100644 index 0000000..131320d --- /dev/null +++ b/.github/workflows/pip_audit.yml @@ -0,0 +1,10 @@ +name: PIP Audit + +on: + pull_request: + branches: [dev] + workflow_dispatch: + +jobs: + pip_audit: + uses: OpenVoiceOS/gh-automations/.github/workflows/pip-audit.yml@dev diff --git a/.github/workflows/publish_stable.yml b/.github/workflows/publish_stable.yml index 4e6128f..f9aee05 100644 --- a/.github/workflows/publish_stable.yml +++ 
b/.github/workflows/publish_stable.yml @@ -1,58 +1,23 @@ -name: Stable Release +name: Publish Stable Release + on: - push: - branches: [master] workflow_dispatch: + push: + branches: [master, main] + +permissions: + contents: write # required for version bump commit and release tag jobs: publish_stable: - uses: TigreGotico/gh-automations/.github/workflows/publish-stable.yml@master - secrets: inherit + if: github.actor != 'github-actions[bot]' + uses: OpenVoiceOS/gh-automations/.github/workflows/publish-stable.yml@dev + secrets: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + MATRIX_TOKEN: ${{ secrets.MATRIX_TOKEN }} with: - branch: 'master' version_file: 'padacioso/version.py' - setup_py: 'setup.py' + publish_pypi: true publish_release: true - - publish_pypi: - needs: publish_stable - if: success() # Ensure this job only runs if the previous job succeeds - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - ref: dev - fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Install Build Tools - run: | - python -m pip install build wheel - - name: version - run: echo "::set-output name=version::$(python setup.py --version)" - id: version - - name: Build Distribution Packages - run: | - python setup.py sdist bdist_wheel - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{secrets.PYPI_TOKEN}} - - - sync_dev: - needs: publish_stable - if: success() # Ensure this job only runs if the previous job succeeds - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. 
- ref: master - - name: Push master -> dev - uses: ad-m/github-push-action@master - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - branch: dev \ No newline at end of file + sync_dev: true + notify_matrix: true diff --git a/.github/workflows/release-preview.yml b/.github/workflows/release-preview.yml new file mode 100644 index 0000000..fdcb46e --- /dev/null +++ b/.github/workflows/release-preview.yml @@ -0,0 +1,13 @@ +name: Release Preview + +on: + pull_request: + branches: [dev] + workflow_dispatch: + +jobs: + release_preview: + uses: OpenVoiceOS/gh-automations/.github/workflows/release-preview.yml@dev + with: + package_name: 'padacioso' + version_file: 'padacioso/version.py' diff --git a/.github/workflows/release_workflow.yml b/.github/workflows/release_workflow.yml index 0ff4764..82ec027 100644 --- a/.github/workflows/release_workflow.yml +++ b/.github/workflows/release_workflow.yml @@ -1,108 +1,28 @@ name: Release Alpha and Propose Stable on: + workflow_dispatch: pull_request: types: [closed] branches: [dev] +permissions: + contents: write + pull-requests: write + jobs: publish_alpha: - if: github.event.pull_request.merged == true - uses: TigreGotico/gh-automations/.github/workflows/publish-alpha.yml@master - secrets: inherit + if: github.event.pull_request.merged == true || github.event_name == 'workflow_dispatch' + uses: OpenVoiceOS/gh-automations/.github/workflows/publish-alpha.yml@dev + secrets: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + MATRIX_TOKEN: ${{ secrets.MATRIX_TOKEN }} with: branch: 'dev' version_file: 'padacioso/version.py' - setup_py: 'setup.py' update_changelog: true publish_prerelease: true - changelog_max_issues: 100 - - notify: - if: github.event.pull_request.merged == true - needs: publish_alpha - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Send message to Matrix bots channel - id: matrix-chat-message - uses: fadenb/matrix-chat-message@v0.0.6 - with: - homeserver: 'matrix.org' - token: ${{ secrets.MATRIX_TOKEN 
}} - channel: '!WjxEKjjINpyBRPFgxl:krbel.duckdns.org' - message: | - new ${{ github.event.repository.name }} PR merged! https://github.com/${{ github.repository }}/pull/${{ github.event.number }} - - publish_pypi: - needs: publish_alpha - if: success() # Ensure this job only runs if the previous job succeeds - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - ref: dev - fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Install Build Tools - run: | - python -m pip install build wheel - - name: version - run: echo "::set-output name=version::$(python setup.py --version)" - id: version - - name: Build Distribution Packages - run: | - python setup.py sdist bdist_wheel - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{secrets.PYPI_TOKEN}} - - - propose_release: - needs: publish_alpha - if: success() # Ensure this job only runs if the previous job succeeds - runs-on: ubuntu-latest - steps: - - name: Checkout dev branch - uses: actions/checkout@v3 - with: - ref: dev - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: '3.10' - - - name: Get version from setup.py - id: get_version - run: | - VERSION=$(python setup.py --version) - echo "VERSION=$VERSION" >> $GITHUB_ENV - - - name: Create and push new branch - run: | - git checkout -b release-${{ env.VERSION }} - git push origin release-${{ env.VERSION }} - - - name: Open Pull Request from dev to master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # Variables - BRANCH_NAME="release-${{ env.VERSION }}" - BASE_BRANCH="master" - HEAD_BRANCH="release-${{ env.VERSION }}" - PR_TITLE="Release ${{ env.VERSION }}" - PR_BODY="Human review requested!" 
- - # Create a PR using GitHub API - curl -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: token $GITHUB_TOKEN" \ - -d "{\"title\":\"$PR_TITLE\",\"body\":\"$PR_BODY\",\"head\":\"$HEAD_BRANCH\",\"base\":\"$BASE_BRANCH\"}" \ - https://api.github.com/repos/${{ github.repository }}/pulls - + propose_release: true + changelog_max_issues: 50 + publish_pypi: true + notify_matrix: true diff --git a/.github/workflows/repo-health.yml b/.github/workflows/repo-health.yml new file mode 100644 index 0000000..b538624 --- /dev/null +++ b/.github/workflows/repo-health.yml @@ -0,0 +1,12 @@ +name: Repo Health + +on: + pull_request: + branches: [dev, master, main] + workflow_dispatch: + +jobs: + repo_health: + uses: OpenVoiceOS/gh-automations/.github/workflows/repo-health.yml@dev + with: + version_file: 'padacioso/version.py' diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml deleted file mode 100644 index 10a0954..0000000 --- a/.github/workflows/unit_tests.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: Run UnitTests -on: - pull_request: - branches: - - dev - push: - workflow_dispatch: - -jobs: - py_build_tests: - uses: neongeckocom/.github/.github/workflows/python_build_tests.yml@master - unit_tests: - strategy: - max-parallel: 2 - matrix: - python-version: [ 3.7, 3.8, 3.9, '3.10' ] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install System Dependencies - run: | - python -m pip install build wheel - - name: Install repo - run: | - pip install . 
- - name: Install test dependencies - run: | - pip install pytest pytest-timeout pytest-cov - - name: Run unittests - run: | - pytest --cov=padacioso --cov-report xml test/test_padacioso.py - # NOTE: additional pytest invocations should also add the --cov-append flag - # or they will overwrite previous invocations' coverage reports - # (for an example, see OVOS Skill Manager's workflow) - - name: Upload coverage - env: - CODECOV_TOKEN: ${{secrets.CODECOV_TOKEN}} - uses: codecov/codecov-action@v3 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..091f922 --- /dev/null +++ b/.gitignore @@ -0,0 +1,216 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. 
+# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ae4280..e313c50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,45 @@ # Changelog -## [1.0.0a1](https://github.com/OpenVoiceOS/padacioso/tree/1.0.0a1) (2024-10-16) +## [1.0.2a1](https://github.com/OpenVoiceOS/padacioso/tree/1.0.2a1) (2026-04-21) -[Full Changelog](https://github.com/OpenVoiceOS/padacioso/compare/0.2.4...1.0.0a1) +[Full Changelog](https://github.com/OpenVoiceOS/padacioso/compare/1.0.1a4...1.0.2a1) -**Breaking changes:** +**Merged pull requests:** -- feat!:pipeline factory [\#29](https://github.com/OpenVoiceOS/padacioso/pull/29) ([JarbasAl](https://github.com/JarbasAl)) +- fix: normalize whitespace and apostrophes for training data and inference queries [\#44](https://github.com/OpenVoiceOS/padacioso/pull/44) ([JarbasAl](https://github.com/JarbasAl)) + +## [1.0.1a4](https://github.com/OpenVoiceOS/padacioso/tree/1.0.1a4) (2025-12-19) + +[Full Changelog](https://github.com/OpenVoiceOS/padacioso/compare/1.0.1a3...1.0.1a4) + +**Merged pull requests:** + +- chore\(deps\): update dependency python to 3.14 [\#37](https://github.com/OpenVoiceOS/padacioso/pull/37) ([renovate[bot]](https://github.com/apps/renovate)) + +## [1.0.1a3](https://github.com/OpenVoiceOS/padacioso/tree/1.0.1a3) (2025-12-18) + +[Full Changelog](https://github.com/OpenVoiceOS/padacioso/compare/1.0.1a2...1.0.1a3) + +**Merged pull 
requests:** + +- chore: Configure Renovate [\#36](https://github.com/OpenVoiceOS/padacioso/pull/36) ([renovate[bot]](https://github.com/apps/renovate)) + +## [1.0.1a2](https://github.com/OpenVoiceOS/padacioso/tree/1.0.1a2) (2025-11-10) + +[Full Changelog](https://github.com/OpenVoiceOS/padacioso/compare/1.0.1a1...1.0.1a2) + +**Merged pull requests:** + +- Update ovos-plugin-manager requirement from \<2.0.0,\>=0.5.0 to \>=0.5.0,\<3.0.0 [\#34](https://github.com/OpenVoiceOS/padacioso/pull/34) ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: padacioso speed [\#33](https://github.com/OpenVoiceOS/padacioso/pull/33) ([mikejgray](https://github.com/mikejgray)) + +## [1.0.1a1](https://github.com/OpenVoiceOS/padacioso/tree/1.0.1a1) (2025-06-16) + +[Full Changelog](https://github.com/OpenVoiceOS/padacioso/compare/1.0.0...1.0.1a1) + +**Merged pull requests:** + +- Update ovos-plugin-manager requirement from \<1.0.0,\>=0.5.0 to \>=0.5.0,\<2.0.0 [\#31](https://github.com/OpenVoiceOS/padacioso/pull/31) ([dependabot[bot]](https://github.com/apps/dependabot)) diff --git a/extras.txt b/extras.txt old mode 100755 new mode 100644 index 857e1ca..2c44600 --- a/extras.txt +++ b/extras.txt @@ -1,3 +1,3 @@ -ovos-plugin-manager>=0.5.0,<1.0.0 +ovos-plugin-manager>=0.5.0,<3.0.0 ovos-utils>=0.3.5,<1.0.0 langcodes \ No newline at end of file diff --git a/padacioso/__init__.py b/padacioso/__init__.py index 82dbd6a..86fcf00 100644 --- a/padacioso/__init__.py +++ b/padacioso/__init__.py @@ -1,9 +1,8 @@ -import concurrent.futures from typing import List, Iterator, Optional import simplematch -from padacioso.bracket_expansion import expand_parentheses, normalize_example +from padacioso.bracket_expansion import expand_parentheses, normalize_example, normalize_utterance, _space_entities try: from ovos_utils.log import LOG @@ -11,11 +10,10 @@ except ImportError: import logging - LOG = logging.getLogger('padacioso') + LOG = logging.getLogger("padacioso") from difflib import 
SequenceMatcher - def fuzzy_match(x, against): """Perform a 'fuzzy' comparison between two strings. Returns: @@ -38,6 +36,10 @@ def __init__(self, fuzz=False, n_workers=4): self.excluded_keywords = {} self.excluded_contexts = {} + # Cache for optimization - pre-built list for fast iteration + self._intent_list = [] # Pre-built list of (intent_name, regexes) + self._cache_dirty = True # Flag to rebuild cache on next query + if "word" not in simplematch.types: LOG.debug(f"Registering `word` type") _init_sm_word_type() @@ -67,19 +69,18 @@ def add_intent(self, name: str, lines: List[str]): @param lines: list of intent regexes """ if name in self.intent_samples: - raise RuntimeError(f"Attempted to re-register existing intent: " - f"{name}") + raise RuntimeError(f"Attempted to re-register existing intent: {name}") expanded = [] for l in lines: - expanded += expand_parentheses(normalize_example(l)) + for e in expand_parentheses(normalize_example(l)): + expanded.append(normalize_utterance(_space_entities(e))) regexes = list(set(expanded)) regexes.sort(key=len, reverse=True) self.intent_samples[name] = regexes for r in regexes: - self._cased_matchers[r] = \ - simplematch.Matcher(r, case_sensitive=True) - self._uncased_matchers[r] = \ - simplematch.Matcher(r, case_sensitive=False) + self._cased_matchers[r] = simplematch.Matcher(r, case_sensitive=True) + self._uncased_matchers[r] = simplematch.Matcher(r, case_sensitive=False) + self._cache_dirty = True # Mark cache as needing rebuild def remove_intent(self, name: str): """ @@ -93,6 +94,7 @@ def remove_intent(self, name: str): self._cased_matchers.pop(rx) if rx in self._uncased_matchers: self._uncased_matchers.pop(rx) + self._cache_dirty = True # Mark cache as needing rebuild def add_entity(self, name: str, lines: List[str]): """ @@ -101,13 +103,13 @@ def add_entity(self, name: str, lines: List[str]): @param lines: list of entity examples """ if name in self.entity_samples: - raise RuntimeError(f"Attempted to re-register 
existing entity: " - f"{name}") + raise RuntimeError(f"Attempted to re-register existing entity: {name}") name = name.lower() expanded = [] for l in lines: expanded += expand_parentheses(l) self.entity_samples[name] = expanded + self._cache_dirty = True # Mark cache as needing rebuild def remove_entity(self, name: str): """ @@ -118,6 +120,15 @@ def remove_entity(self, name: str): if name in self.entity_samples: del self.entity_samples[name] + def _rebuild_cache(self): + """ + Rebuild cached intent metadata for fast filtering. + Called lazily on first query after registration to avoid O(n²) during bulk registration. + """ + # Pre-build the intent list to avoid reconstructing it every query + self._intent_list = list(self.intent_samples.items()) + self._cache_dirty = False + def _filter(self, query: str): # filter intents based on context/excluded keywords excluded_intents = [] @@ -127,14 +138,12 @@ def _filter(self, query: str): for intent_name, contexts in self.required_contexts.items(): if intent_name not in self.available_contexts: excluded_intents.append(intent_name) - elif any(context not in self.available_contexts[intent_name] - for context in contexts): + elif any(context not in self.available_contexts[intent_name] for context in contexts): excluded_intents.append(intent_name) for intent_name, contexts in self.excluded_contexts.items(): if intent_name not in self.available_contexts: continue - if any(context in self.available_contexts[intent_name] - for context in contexts): + if any(context in self.available_contexts[intent_name] for context in contexts): excluded_intents.append(intent_name) return excluded_intents @@ -146,8 +155,7 @@ def _match(self, query, intent_name, regexes): penalty = 0.15 if r not in self._cased_matchers: LOG.warning(f"{r} not initialized") - self._cased_matchers[r] = \ - simplematch.Matcher(r, case_sensitive=True) + self._cased_matchers[r] = simplematch.Matcher(r, case_sensitive=True) entities = self._cased_matchers[r].match(query) 
if entities is not None: for k, v in entities.items(): @@ -157,14 +165,11 @@ def _match(self, query, intent_name, regexes): elif str(v) not in self.entity_samples[k]: # penalize parsed entity value not in samples penalty += 0.1 - return {"entities": entities or {}, - "conf": 1 - penalty, - "name": intent_name} + return {"entities": entities or {}, "conf": 1 - penalty, "name": intent_name} if r not in self._uncased_matchers: LOG.warning(f"{r} not initialized") - self._uncased_matchers[r] = \ - simplematch.Matcher(r, case_sensitive=False) + self._uncased_matchers[r] = simplematch.Matcher(r, case_sensitive=False) entities = self._uncased_matchers[r].match(query) if entities is not None: # penalize case mismatch @@ -176,9 +181,7 @@ def _match(self, query, intent_name, regexes): elif str(v) not in self.entity_samples[k]: # penalize parsed entity value not in samples penalty += 0.1 - return {"entities": entities or {}, - "conf": 1 - penalty, - "name": intent_name} + return {"entities": entities or {}, "conf": 1 - penalty, "name": intent_name} if self.fuzz: for r in regexes: @@ -205,8 +208,7 @@ def _fuzzy_score(self, query, s, penalty=0.25): score = (fuzzy_score + base_score) / 2 if entities is not None: - return {"entities": entities or {}, - "conf": (fuzzy_score + base_score) / 2} + return {"entities": entities or {}, "conf": score} def calc_intents(self, query: str) -> Iterator[dict]: """ @@ -214,19 +216,27 @@ def calc_intents(self, query: str) -> Iterator[dict]: @param query: input to evaluate for an intent match @return: yields dict intent matches """ - # filter intents based on context/excluded keywords + query = normalize_utterance(query) + + # Lazy cache rebuild - only rebuild once after bulk registration + # This avoids O(n²) scaling during registration (rebuild on every add) + if self._cache_dirty: + self._rebuild_cache() + + # Filter based on runtime context/keywords (query and session dependent) excluded_intents = self._filter(query) - # do the work in 
parallel instead of sequentially - with concurrent.futures.ProcessPoolExecutor(max_workers=self.workers) as executor: - future_to_source = { - executor.submit(self._match, query, intent_name, regexes): intent_name - for intent_name, regexes in self.intent_samples.items() if intent_name not in excluded_intents - } - for future in concurrent.futures.as_completed(future_to_source): - res = future.result() - if res is not None: - yield res + # Sequential processing - threading overhead > actual work for regex matching + for intent_name, regexes in self._intent_list: + if intent_name in excluded_intents: + continue + res = self._match(query, intent_name, regexes) + if res is not None: + yield res + # Early exit optimization: perfect match found + # TODO: Some validation that we don't have duplicates, and warning if we do + if res.get("conf", 0) == 1.0: + return def calc_intent(self, query: str) -> Optional[dict]: """ @@ -234,7 +244,7 @@ def calc_intent(self, query: str) -> Optional[dict]: @param query: input to evaluate for an intent @return: dict matched intent (or None) """ - match = {'name': None, 'entities': {}} + match = {"name": None, "entities": {}} intents = [i for i in self.calc_intents(query) if i is not None and i.get("name")] if len(intents) == 0: LOG.info("No match") @@ -249,9 +259,9 @@ def calc_intent(self, query: str) -> Optional[dict]: match = ties[0] - for entity in set(match['entities'].keys()): - entities = match['entities'].pop(entity) - match['entities'][entity.lower()] = entities + for entity in set(match["entities"].keys()): + entities = match["entities"].pop(entity) + match["entities"][entity.lower()] = entities LOG.debug(match) return match @@ -260,6 +270,7 @@ def exclude_keywords(self, intent_name, samples): self.excluded_keywords[intent_name] = samples else: self.excluded_keywords[intent_name] += samples + self._cache_dirty = True # Mark cache as needing rebuild def set_context(self, intent_name, context_name, context_val=None): if intent_name 
not in self.available_contexts: @@ -271,11 +282,12 @@ def exclude_context(self, intent_name, context_name): self.excluded_contexts[intent_name] = [context_name] else: self.excluded_contexts[intent_name].append(context_name) + self._cache_dirty = True # Mark cache as needing rebuild def unexclude_context(self, intent_name, context_name): if intent_name in self.excluded_contexts: - self.excluded_contexts[intent_name] = [c for c in self.excluded_contexts[intent_name] - if context_name != c] + self.excluded_contexts[intent_name] = [c for c in self.excluded_contexts[intent_name] if context_name != c] + self._cache_dirty = True # Mark cache as needing rebuild def unset_context(self, intent_name, context_name): if intent_name in self.available_contexts: @@ -287,11 +299,12 @@ def require_context(self, intent_name, context_name): self.required_contexts[intent_name] = [context_name] else: self.required_contexts[intent_name].append(context_name) + self._cache_dirty = True # Mark cache as needing rebuild def unrequire_context(self, intent_name, context_name): if intent_name in self.required_contexts: - self.required_contexts[intent_name] = [c for c in self.required_contexts[intent_name] - if context_name != c] + self.required_contexts[intent_name] = [c for c in self.required_contexts[intent_name] if context_name != c] + self._cache_dirty = True # Mark cache as needing rebuild def _init_sm_word_type(): diff --git a/padacioso/bracket_expansion.py b/padacioso/bracket_expansion.py index e49b268..6d6d519 100644 --- a/padacioso/bracket_expansion.py +++ b/padacioso/bracket_expansion.py @@ -1,190 +1,44 @@ -class TreeFragment: - """(Abstract) empty sentence fragment""" +import itertools +import re - def __init__(self, tree): - """ - Construct a sentence tree fragment which is merely a wrapper for - a list of Strings - Args: - tree (?): Base tree for the sentence fragment, type depends on - subclass, refer to those subclasses - """ - self._tree = tree - - def tree(self): - """Return the 
def expand_parentheses(sent: str) -> list:
    """
    Expand a template string with (a|b) alternatives and [optional] syntax
    into all possible combinations.

    [x] is shorthand for (x|). Groups may be nested, e.g. "a [b [c]]" or
    "[the (big|small)] dog"; the innermost group is always resolved first.
    Runs of spaces left behind by an empty branch are collapsed to a single
    space and every result is stripped.

    Examples:
        "Will it (rain|pour) [today]" ->
            ["Will it pour", "Will it pour today",
             "Will it rain", "Will it rain today"]

    @param sent: template sentence
    @return: sorted list of the unique expanded sentences
    """

    def _expand_optional(text):
        # Rewrite [x] as (x|). The pattern only matches brackets that contain
        # no other bracket, so a single pass handles the innermost level only;
        # repeat until nothing changes so that nested optionals such as
        # "a [b [c]]" are fully converted instead of leaving literal brackets.
        while True:
            rewritten = re.sub(r"\[([^\[\]]+)\]",
                               lambda m: f"({m.group(1)}|)", text)
            if rewritten == text:
                return text
            text = rewritten

    def _expand_alternatives(text):
        # Split into literal segments and innermost (a|b) groups, then take
        # the cartesian product of the choices for each segment.
        parts = []
        for segment in re.split(r"(\([^\(\)]+\))", text):
            if segment.startswith("(") and segment.endswith(")"):
                parts.append(segment[1:-1].split("|"))
            else:
                parts.append([segment])
        return itertools.product(*parts)

    def _fully_expand(texts):
        # Each round resolves one nesting level; stop at the fixed point.
        result = set(texts)
        while True:
            expanded = set()
            for text in result:
                for combo in _expand_alternatives(text):
                    # collapse internal whitespace so the empty branch of
                    # [optional] doesn't leave a double space
                    expanded.add(re.sub(r' +', ' ', "".join(combo)).strip())
            if expanded == result:
                break
            result = expanded
        return sorted(result)

    return _fully_expand([_expand_optional(sent)])
def normalize_whitespace(text: str) -> str:
    """
    Collapse multiple consecutive whitespace characters into a single space
    and strip leading/trailing whitespace.
    @param text: input text
    @return: whitespace-normalized text
    """
    return re.sub(r'\s+', ' ', text).strip()


# Apostrophe-like characters mapped to a space. Written as escapes so the
# exact code points survive editors/tools that normalize look-alike glyphs
# (a literal fullwidth apostrophe is easily flattened to plain ASCII).
_APOSTROPHE_TABLE = str.maketrans({
    "\u0027": " ",  # U+0027 ASCII apostrophe
    "\u2019": " ",  # U+2019 RIGHT SINGLE QUOTATION MARK
    "\u2018": " ",  # U+2018 LEFT SINGLE QUOTATION MARK
    "\u02bc": " ",  # U+02BC MODIFIER LETTER APOSTROPHE
    "\u02b9": " ",  # U+02B9 MODIFIER LETTER PRIME
    "\u0060": " ",  # U+0060 GRAVE ACCENT (backtick)
    "\u00b4": " ",  # U+00B4 ACUTE ACCENT
    "\uff07": " ",  # U+FF07 FULLWIDTH APOSTROPHE
})


def drop_apostrophes(text: str) -> str:
    """
    Replace apostrophes and common apostrophe-like unicode variants with a space.
    Using a space rather than empty string preserves word boundaries so that
    "it's" -> "it s" and both sides of a match reduce the same way.
    @param text: input text
    @return: text with all apostrophe variants replaced by a space
    """
    # single pass over the string instead of one str.replace per variant
    return text.translate(_APOSTROPHE_TABLE)


def _space_entities(text: str) -> str:
    """
    Ensure a space exists on both sides of every {entity} placeholder.
    Handles agglutinative suffixes like {keyword}ren so the suffix becomes
    a separate token and the capture group is not contaminated.
    @param text: training example containing {entity} placeholders
    @return: text with each placeholder wrapped in single spaces
    """
    return re.sub(r'(\{[^}]+\})', r' \1 ', text)


def normalize_utterance(text: str) -> str:
    """
    Normalize a plain utterance (inference query) for consistent matching.
    Does NOT touch entity placeholder syntax.
    @param text: input utterance
    @return: normalized text
    """
    text = drop_apostrophes(text)
    text = normalize_whitespace(text)
    return text
+ @param text: input utterance + @return: normalized text + """ + text = drop_apostrophes(text) + text = normalize_whitespace(text) + return text + + def normalize_example(example: str) -> str: - return clean_braces(translate_padatious(example)) + text = clean_braces(translate_padatious(example)) + text = drop_apostrophes(text) + text = normalize_whitespace(text) + return text diff --git a/padacioso/version.py b/padacioso/version.py index a3d5a94..d04a515 100644 --- a/padacioso/version.py +++ b/padacioso/version.py @@ -1,6 +1,8 @@ # START_VERSION_BLOCK VERSION_MAJOR = 1 VERSION_MINOR = 0 -VERSION_BUILD = 0 -VERSION_ALPHA = 0 +VERSION_BUILD = 2 +VERSION_ALPHA = 1 # END_VERSION_BLOCK + +__version__ = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_BUILD}" + (f"a{VERSION_ALPHA}" if VERSION_ALPHA else "") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7bebb24 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "padacioso" +dynamic = ["version"] +description = "dead simple intent parser" +readme = "readme.md" +license = { text = "Apache-2.0" } +authors = [{ name = "jarbasai", email = "jarbasai@mailfence.com" }] +requires-python = ">=3.8" +dependencies = [ + "simplematch", +] + +[project.optional-dependencies] +extras = [ + "ovos-plugin-manager>=0.5.0,<3.0.0", + "ovos-utils>=0.3.5,<1.0.0", + "langcodes", +] +test = [ + "ovos-plugin-manager>=0.5.0,<3.0.0", + "ovos-utils>=0.3.5,<1.0.0", + "ovos-bus-client>=0.0.8,<1.0.0", + "langcodes", +] + +[project.urls] +Homepage = "https://github.com/OpenVoiceOS/padacioso" + +[project.entry-points."opm.pipeline"] +"ovos-padacioso-pipeline-plugin" = "padacioso.opm:PadaciosoPipeline" + +[tool.setuptools.dynamic] +version = { attr = "padacioso.version.__version__" } + +[tool.setuptools.packages.find] +where = ["."] +include = ["padacioso*"] diff --git a/renovate.json b/renovate.json new file mode 
100644 index 0000000..5db72dd --- /dev/null +++ b/renovate.json @@ -0,0 +1,6 @@ +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": [ + "config:recommended" + ] +} diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index fb508d3..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -simplematch \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 79391c0..0000000 --- a/setup.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -from setuptools import setup - -BASEDIR = os.path.abspath(os.path.dirname(__file__)) - - -def get_version(): - """ Find the version of the package""" - version_file = os.path.join(BASEDIR, 'padacioso', 'version.py') - major, minor, build, alpha = (None, None, None, None) - with open(version_file) as f: - for line in f: - if 'VERSION_MAJOR' in line: - major = line.split('=')[1].strip() - elif 'VERSION_MINOR' in line: - minor = line.split('=')[1].strip() - elif 'VERSION_BUILD' in line: - build = line.split('=')[1].strip() - elif 'VERSION_ALPHA' in line: - alpha = line.split('=')[1].strip() - - if ((major and minor and build and alpha) or - '# END_VERSION_BLOCK' in line): - break - version = f"{major}.{minor}.{build}" - if alpha and int(alpha) > 0: - version += f"a{alpha}" - return version - - -def package_files(directory): - paths = [] - for (path, directories, filenames) in os.walk(directory): - for filename in filenames: - paths.append(os.path.join('..', path, filename)) - return paths - - -def required(requirements_file): - """ Read requirements file and remove comments and empty lines. 
""" - with open(os.path.join(BASEDIR, requirements_file), 'r') as f: - requirements = f.read().splitlines() - if 'MYCROFT_LOOSE_REQUIREMENTS' in os.environ: - print('USING LOOSE REQUIREMENTS!') - requirements = [r.replace('==', '>=').replace('~=', '>=') for r in requirements] - return [pkg for pkg in requirements - if pkg.strip() and not pkg.startswith("#")] - - -PLUGIN_ENTRY_POINT = 'ovos-padacioso-pipeline-plugin=padacioso.opm:PadaciosoPipeline' - - -setup( - name='padacioso', - version=get_version(), - packages=['padacioso'], - package_data={'': package_files('padacioso')}, - url='https://github.com/OpenVoiceOS/padacioso', - license='apache-2.0', - author='jarbasai', - author_email='jarbasai@mailfence.com', - install_requires=required("requirements.txt"), - extras_require={ - 'extras': required('extras.txt') - }, - entry_points={'opm.pipeline': PLUGIN_ENTRY_POINT}, - description='dead simple intent parser' -) diff --git a/test/test_padacioso.py b/test/test_padacioso.py index d4dfd5a..6122169 100644 --- a/test/test_padacioso.py +++ b/test/test_padacioso.py @@ -1,4 +1,5 @@ from padacioso import IntentContainer +from padacioso.bracket_expansion import expand_parentheses import unittest @@ -229,3 +230,251 @@ def test_add_padatious_wildcard_intent(self): self.assertEqual(match['entities']['word0'], 'neon') self.assertEqual(match['entities']['word1'], 'neon') + # normalization unit tests + def test_normalize_whitespace_util(self): + from padacioso.bracket_expansion import normalize_whitespace + self.assertEqual(normalize_whitespace("hello world"), "hello world") + self.assertEqual(normalize_whitespace(" hello world "), "hello world") + self.assertEqual(normalize_whitespace("one\ttwo\nthree"), "one two three") + self.assertEqual(normalize_whitespace("already fine"), "already fine") + self.assertEqual(normalize_whitespace(""), "") + + def test_drop_apostrophes_util(self): + from padacioso.bracket_expansion import drop_apostrophes + # apostrophes replaced with space to 
preserve word boundaries + self.assertEqual(drop_apostrophes("what's up"), "what s up") + # U+2019 RIGHT SINGLE QUOTATION MARK + self.assertEqual(drop_apostrophes("what's up"), "what s up") + # U+2018 LEFT SINGLE QUOTATION MARK + self.assertEqual(drop_apostrophes("what's up"), "what s up") + # backtick + self.assertEqual(drop_apostrophes("what`s up"), "what s up") + # U+02BC MODIFIER LETTER APOSTROPHE + self.assertEqual(drop_apostrophes("whatʼs up"), "what s up") + # no apostrophe — unchanged + self.assertEqual(drop_apostrophes("what s up"), "what s up") + + def test_normalize_example_util(self): + from padacioso.bracket_expansion import normalize_example + self.assertEqual(normalize_example(" hello world "), "hello world") + # apostrophe replaced with space, then whitespace collapsed + self.assertEqual(normalize_example("what's up"), "what s up") + self.assertEqual(normalize_example("{{entity}}"), "{entity}") + # combined: curly apostrophe + whitespace + braces cleaned + self.assertEqual(normalize_example(" what's {{place}} "), "what s {place}") + + # normalization integration tests + def test_double_whitespace_in_query(self): + """Extra whitespace in the spoken query should not prevent matching.""" + container = IntentContainer() + container.add_intent('hello', ['hello world']) + self.assertEqual(container.calc_intent('hello world')['name'], 'hello') + self.assertEqual(container.calc_intent(' hello world ')['name'], 'hello') + self.assertEqual(container.calc_intent('hello world')['name'], 'hello') + + def test_double_whitespace_in_training(self): + """Extra whitespace in training data should be collapsed at registration time.""" + container = IntentContainer() + container.add_intent('hello', ['hello world']) + self.assertIn('hello world', container.intent_samples['hello']) + self.assertNotIn('hello world', container.intent_samples['hello']) + self.assertEqual(container.calc_intent('hello world')['name'], 'hello') + + def test_apostrophe_variants_in_query(self): + 
"""All apostrophe variants in a query should match — both sides normalize the same way.""" + container = IntentContainer() + container.add_intent('whats_up', ["what's up"]) + # stored as "what s up"; query variants also reduce to "what s up" + self.assertEqual(container.calc_intent("what s up")['name'], 'whats_up') + self.assertEqual(container.calc_intent("what's up")['name'], 'whats_up') + # U+2019 RIGHT SINGLE QUOTATION MARK — common from voice STT + self.assertEqual(container.calc_intent("what's up")['name'], 'whats_up') + # backtick + self.assertEqual(container.calc_intent('what`s up')['name'], 'whats_up') + # U+02BC MODIFIER LETTER APOSTROPHE + self.assertEqual(container.calc_intent("whatʼs up")['name'], 'whats_up') + + def test_apostrophe_variants_in_training(self): + """Apostrophes in training examples should be replaced with spaces at registration time.""" + container = IntentContainer() + container.add_intent('whats_up', ["what's up"]) + self.assertIn("what s up", container.intent_samples['whats_up']) + self.assertNotIn("what's up", container.intent_samples['whats_up']) + # curly apostrophe (U+2018) normalizes the same way + container.add_intent('curly_test', ["what's new"]) + self.assertIn("what s new", container.intent_samples['curly_test']) + + def test_apostrophe_with_entity(self): + """Apostrophe normalization should work alongside entity extraction.""" + container = IntentContainer() + container.add_intent('navigate', ["navigate to {place}"]) + match = container.calc_intent("navigate to the store") + self.assertEqual(match['name'], 'navigate') + self.assertEqual(match['entities']['place'], 'the store') + + def test_whitespace_with_entity(self): + """Whitespace normalization should not corrupt extracted entity values.""" + container = IntentContainer() + container.add_intent('buy', ['buy {item}']) + match = container.calc_intent('buy milk') + self.assertEqual(match['name'], 'buy') + self.assertEqual(match['entities']['item'], 'milk') + + def 
test_leading_trailing_whitespace_query(self): + """Leading/trailing whitespace on the query should be stripped.""" + container = IntentContainer() + container.add_intent('hello', ['hello']) + self.assertEqual(container.calc_intent(' hello ')['name'], 'hello') + + def test_mixed_normalization(self): + """Combined apostrophe and whitespace issues should both be handled.""" + container = IntentContainer() + container.add_intent('whats_up', ["what's up"]) + # curly apostrophe + double space → "what s up" on both sides + self.assertEqual(container.calc_intent("what's up")['name'], 'whats_up') + self.assertEqual(container.calc_intent("what's up")['name'], 'whats_up') + + def test_entity_suffix_spacing(self): + """Agglutinative suffixes attached to {entity} placeholders should still match.""" + container = IntentContainer() + # Basque-style patterns where suffix is glued to the placeholder + container.add_intent('doktore', [ + 'zeintzuk ziren {keyword}ren doktore-ikasleak', + 'nork egin zuen doktoretza {keyword}rekin', + ]) + # the suffix is separated at training time so the entity captures just the keyword + match = container.calc_intent('zeintzuk ziren Einstein ren doktore-ikasleak') + self.assertEqual(match['name'], 'doktore') + self.assertEqual(match['entities']['keyword'], 'Einstein') + + match = container.calc_intent('nork egin zuen doktoretza Curie rekin') + self.assertEqual(match['name'], 'doktore') + self.assertEqual(match['entities']['keyword'], 'Curie') + + +class TestExpandParentheses(unittest.TestCase): + + # --- no-op cases --- + + def test_plain_string(self): + self.assertEqual(expand_parentheses("hello world"), ["hello world"]) + + def test_empty_string(self): + self.assertEqual(expand_parentheses(""), [""]) + + def test_entity_placeholder_untouched(self): + # {entity} must survive expansion unchanged + self.assertEqual(expand_parentheses("buy {item}"), ["buy {item}"]) + + def test_typed_entity_untouched(self): + self.assertEqual(expand_parentheses("set 
volume {level:int}"), ["set volume {level:int}"]) + + # --- (a|b) alternatives --- + + def test_two_alternatives(self): + self.assertEqual(expand_parentheses("(hello|hi)"), + sorted(["hello", "hi"])) + + def test_three_alternatives(self): + self.assertEqual(expand_parentheses("(hello|hi|hey) world"), + sorted(["hello world", "hey world", "hi world"])) + + def test_alternatives_at_end(self): + self.assertEqual(expand_parentheses("turn (on|off)"), + sorted(["turn off", "turn on"])) + + def test_alternatives_in_middle(self): + self.assertEqual(expand_parentheses("I (want|need) coffee"), + sorted(["I need coffee", "I want coffee"])) + + def test_two_independent_groups(self): + self.assertEqual( + expand_parentheses("(a|b) (c|d)"), + sorted(["a c", "a d", "b c", "b d"]) + ) + + def test_three_independent_groups(self): + self.assertEqual( + expand_parentheses("(a|b) (c|d) (e|f)"), + sorted(["a c e", "a c f", "a d e", "a d f", + "b c e", "b c f", "b d e", "b d f"]) + ) + + def test_empty_alternative_makes_optional(self): + # (word|) is the canonical optional form + self.assertEqual(expand_parentheses("hello (world|)"), + sorted(["hello", "hello world"])) + + def test_single_item_group(self): + # (word) with no pipe — parens stripped, single result + result = expand_parentheses("hello (world)") + self.assertEqual(result, ["hello world"]) + + # --- [optional] syntax --- + + def test_optional_word(self): + self.assertEqual(expand_parentheses("hey [world]"), + sorted(["hey", "hey world"])) + + def test_optional_at_start(self): + self.assertEqual(expand_parentheses("[please] turn on"), + sorted(["please turn on", "turn on"])) + + def test_optional_at_end(self): + self.assertEqual(expand_parentheses("turn on [the light]"), + sorted(["turn on", "turn on the light"])) + + def test_two_optional_groups(self): + self.assertEqual( + expand_parentheses("[please] turn [on]"), + sorted(["please turn", "please turn on", "turn", "turn on"]) + ) + + def 
test_optional_entity_placeholder(self): + self.assertEqual(expand_parentheses("hi [{person}|people]"), + sorted(["hi", "hi {person}", "hi people"])) + + # --- nested / combined --- + + def test_alternatives_inside_optional(self): + self.assertEqual( + expand_parentheses("set [the] (light|fan)"), + sorted(["set light", "set fan", "set the light", "set the fan"]) + ) + + def test_optional_and_alternatives_combined(self): + result = expand_parentheses("(turn|switch) [the] (light|fan) (on|off)") + self.assertEqual(len(result), 16) # 2 * 2 * 2 * 2 + self.assertIn("turn the light on", result) + self.assertIn("switch fan off", result) + + def test_entity_with_alternatives(self): + self.assertEqual( + expand_parentheses("(buy|purchase) {item}"), + sorted(["buy {item}", "purchase {item}"]) + ) + + def test_entity_with_optional(self): + self.assertEqual( + expand_parentheses("eat [some] {fruit}"), + sorted(["eat {fruit}", "eat some {fruit}"]) + ) + + # --- whitespace handling --- + + def test_leading_trailing_spaces_stripped(self): + for result in expand_parentheses(" hello "): + self.assertEqual(result, result.strip()) + + def test_internal_spaces_preserved(self): + results = expand_parentheses("(good morning|hi) there") + self.assertIn("good morning there", results) + self.assertIn("hi there", results) + + # --- deduplication --- + + def test_duplicate_alternatives_deduplicated(self): + # (a|a) should produce one "a", not two + result = expand_parentheses("(hello|hello)") + self.assertEqual(result, ["hello"]) +