diff --git a/.coveragerc b/.coveragerc index 224e586..14f9abe 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,9 +1,14 @@ # .coveragerc to control coverage.py [run] branch = True -source = */nlp_qrmine/* +source = qrmine # omit = bad_file.py +[paths] +source = + src/ + */site-packages/ + [report] # Regexes for lines to exclude from consideration exclude_lines = diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index f3b6a2e..7b25612 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,7 +14,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.7' + python-version: '3.11' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 39112e1..f742724 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -13,8 +13,8 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.7] - os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.11"] + os: [ubuntu-latest, macos-13, windows-latest] runs-on: ${{ matrix.os }} timeout-minutes: 20 steps: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 5615dac..9018711 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -12,9 +12,9 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5.1.1 with: - python-version: '3.7' + python-version: '3.11' - name: Install dependencies run: | python -m pip install --upgrade pip @@ -24,7 +24,7 @@ jobs: python setup.py bdist_wheel - name: Publish distribution 📦 to PyPI if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@master with: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml index 
39ff65a..2b436ed 100644 --- a/.github/workflows/tox.yml +++ b/.github/workflows/tox.yml @@ -1,4 +1,4 @@ -name: Tox on release +name: Tox Test on: push: @@ -9,22 +9,22 @@ jobs: build: runs-on: ubuntu-latest - timeout-minutes: 10 + timeout-minutes: 20 strategy: max-parallel: 4 matrix: - python-version: [3.7] + python-version: ["3.11"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5.1.1 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r requirements.txt -r dev-requirements.txt + pip install -r dev-requirements.txt -r requirements.txt python -m spacy download en_core_web_sm - name: Test with tox run: | diff --git a/.readthedocs.yml b/.readthedocs.yml index 1133df0..a2bcab3 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -16,7 +16,12 @@ sphinx: formats: - pdf +build: + os: ubuntu-22.04 + tools: + python: "3.11" + python: - version: 3.8 install: - requirements: docs/requirements.txt + - {path: ., method: pip} diff --git a/README.md b/README.md index 35f6591..62e9d85 100644 --- a/README.md +++ b/README.md @@ -30,9 +30,10 @@ QRMine is a suite of qualitative research (QR) data mining tools in Python using ## How to install +* Requires Python 3.11 and a CPU that supports AVX instructions ```text - -pip install qrmine +pip install uv +uv pip install qrmine python -m spacy download en_core_web_sm ``` diff --git a/dev-requirements.txt b/dev-requirements.txt index e96e708..f36f95c 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,162 +1,146 @@ -# -# This file is autogenerated by pip-compile with python 3.7 -# To update, run: -# -# pip-compile dev-requirements.in -# -alabaster==0.7.12 +# This file was autogenerated by uv via the following command: +# uv pip compile dev-requirements.in -o dev-requirements.txt --universal +alabaster==1.0.0 # via sphinx -attrs==21.4.0 - # 
via pytest -babel==2.9.1 +babel==2.16.0 # via sphinx -certifi==2021.10.8 +build==1.2.2.post1 + # via pip-tools +cachetools==5.5.0 + # via + # -c requirements.txt + # tox +certifi==2024.8.30 # via # -c requirements.txt # requests -charset-normalizer==2.0.12 +chardet==5.2.0 + # via tox +charset-normalizer==3.4.0 # via # -c requirements.txt # requests -click==8.1.2 +click==8.1.7 # via # -c requirements.txt # pip-tools +colorama==0.4.6 + # via + # -c requirements.txt + # build + # click + # pytest + # sphinx + # tox commonmark==0.9.1 # via recommonmark -coverage[toml]==6.3.2 +coverage==7.6.4 # via pytest-cov -distlib==0.3.4 +distlib==0.3.9 # via virtualenv -docutils==0.17.1 +docutils==0.21.2 # via # recommonmark # sphinx -filelock==3.6.0 +filelock==3.16.1 # via # tox # virtualenv -idna==3.3 +idna==3.10 # via # -c requirements.txt # requests -imagesize==1.3.0 +imagesize==1.4.1 # via sphinx -importlib-metadata==4.11.3 ; python_version < "3.8" +iniconfig==2.0.0 + # via pytest +jinja2==3.1.4 # via # -c requirements.txt - # click - # pep517 - # pluggy - # pytest # sphinx - # tox - # virtualenv -iniconfig==1.1.1 - # via pytest -jinja2==3.1.1 - # via sphinx -markupsafe==2.1.1 - # via jinja2 -packaging==21.3 +markupsafe==3.0.2 + # via + # -c requirements.txt + # jinja2 +packaging==24.2 # via # -c requirements.txt + # build + # pyproject-api # pytest # setuptools-scm # sphinx # tox -pep517==0.12.0 +pip==24.3.1 # via pip-tools -pip-tools==6.6.0 +pip-tools==7.4.1 # via -r dev-requirements.in -platformdirs==2.5.1 - # via virtualenv -pluggy==1.0.0 +platformdirs==4.3.6 # via - # pytest # tox -py==1.11.0 + # virtualenv +pluggy==1.5.0 # via # pytest # tox -pygments==2.11.2 - # via sphinx -pyparsing==3.0.7 +pygments==2.18.0 # via # -c requirements.txt - # packaging -pytest==7.1.2 + # sphinx +pyproject-api==1.8.0 + # via tox +pyproject-hooks==1.2.0 + # via + # build + # pip-tools +pytest==8.3.3 # via # -r dev-requirements.in # pytest-cov -pytest-cov==3.0.0 +pytest-cov==6.0.0 # via -r 
dev-requirements.in -pytz==2022.1 - # via - # -c requirements.txt - # babel recommonmark==0.7.1 # via -r dev-requirements.in -requests==2.27.1 +requests==2.32.3 # via # -c requirements.txt # sphinx -setuptools-scm==6.4.2 - # via -r dev-requirements.in -six==1.16.0 +setuptools==75.3.0 # via # -c requirements.txt - # tox - # virtualenv + # -r dev-requirements.in + # pip-tools + # setuptools-scm +setuptools-scm==8.1.0 + # via -r dev-requirements.in snowballstemmer==2.2.0 # via sphinx -sphinx==4.5.0 +sphinx==8.1.3 # via # -r dev-requirements.in # recommonmark -sphinxcontrib-applehelp==1.0.2 +sphinxcontrib-applehelp==2.0.0 # via sphinx -sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-devhelp==2.0.0 # via sphinx -sphinxcontrib-htmlhelp==2.0.0 +sphinxcontrib-htmlhelp==2.1.0 # via sphinx sphinxcontrib-jsmath==1.0.1 # via sphinx -sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-qthelp==2.0.0 # via sphinx -sphinxcontrib-serializinghtml==1.1.5 +sphinxcontrib-serializinghtml==2.0.0 # via sphinx -toml==0.10.2 - # via tox -tomli==2.0.1 - # via - # coverage - # pep517 - # pytest - # setuptools-scm -tox==3.24.5 +tox==4.23.2 # via -r dev-requirements.in -typing-extensions==4.1.1 - # via - # -c requirements.txt - # importlib-metadata -urllib3==1.26.9 +urllib3==2.2.3 # via # -c requirements.txt # requests -virtualenv==20.14.0 +virtualenv==20.27.1 # via tox -wheel==0.37.1 +wheel==0.45.0 # via # -c requirements.txt # -r dev-requirements.in # pip-tools -zipp==3.8.0 - # via - # -c requirements.txt - # importlib-metadata - # pep517 - -# The following packages are considered to be unsafe in a requirements file: -# pip -# setuptools diff --git a/notes/pip-tools.md b/notes/pip-tools.md index d656035..da4baa4 100644 --- a/notes/pip-tools.md +++ b/notes/pip-tools.md @@ -15,4 +15,10 @@ OR * pip install pre-commit -* pre-commit install \ No newline at end of file +* pre-commit install + +## uv + +* pip install uv +* uv pip compile setup.cfg -o requirements.txt --universal +* uv pip compile 
dev-requirements.in -o dev-requirements.txt --universal \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2c63dbb..89a5bed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,9 @@ [build-system] # AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD! -requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "wheel"] +requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] -# See configuration details in https://github.com/pypa/setuptools_scm +# For smarter version schemes and other configuration options, +# check out https://github.com/pypa/setuptools_scm version_scheme = "no-guess-dev" diff --git a/requirements.txt b/requirements.txt index debaaba..de9e6f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,116 +1,134 @@ -# -# This file is autogenerated by pip-compile with python 3.7 -# To update, run: -# -# pip-compile -# -absl-py==1.0.0 +# This file was autogenerated by uv via the following command: +# uv pip compile setup.cfg -o requirements.txt --universal +absl-py==2.1.0 # via # tensorboard # tensorflow astunparse==1.6.3 # via tensorflow -blis==0.7.7 - # via - # spacy - # thinc -cached-property==1.5.2 - # via h5py -cachetools==5.0.0 +blis==0.7.11 + # via thinc +cachetools==5.5.0 # via # google-auth # textacy -catalogue==1.0.0 +catalogue==2.0.10 # via # spacy + # srsly + # textacy # thinc -certifi==2023.7.22 +certifi==2024.8.30 # via requests -charset-normalizer==2.0.12 +charset-normalizer==3.4.0 # via requests -click==8.1.2 - # via qrmine (setup.py) -cycler==0.11.0 +click==8.1.7 + # via + # qrmine (setup.cfg) + # typer +cloudpathlib==0.20.0 + # via weasel +colorama==0.4.6 ; sys_platform == 'win32' or platform_system == 'Windows' + # via + # click + # tqdm + # wasabi +confection==0.1.5 + # via + # thinc + # weasel +contourpy==1.3.0 # via matplotlib -cymem==2.0.6 +cycler==0.12.1 + # via matplotlib +cymem==2.0.8 # via # preshed # spacy # thinc 
-cytoolz==0.11.2 +cytoolz==1.0.0 # via textacy -flatbuffers==2.0 +flatbuffers==24.3.25 # via tensorflow -fonttools==4.31.2 +floret==0.10.5 + # via textacy +fonttools==4.54.1 # via matplotlib -gast==0.5.3 +gast==0.4.0 # via tensorflow -google-auth==2.6.2 +google-auth==2.36.0 # via # google-auth-oauthlib # tensorboard -google-auth-oauthlib==0.4.6 +google-auth-oauthlib==1.0.0 # via tensorboard google-pasta==0.2.0 # via tensorflow -grpcio==1.44.0 +grpcio==1.67.1 # via # tensorboard # tensorflow -h5py==3.6.0 +h5py==3.12.1 # via tensorflow -idna==3.3 +idna==3.10 # via requests -imbalanced-learn==0.9.0 - # via qrmine (setup.py) -importlib-metadata==4.11.3 ; python_version < "3.8" - # via - # catalogue - # click - # markdown - # qrmine (setup.py) -jellyfish==0.9.0 +imbalanced-learn==0.12.4 + # via qrmine (setup.cfg) +jellyfish==1.1.0 # via textacy -joblib==1.2.0 +jinja2==3.1.4 + # via spacy +joblib==1.4.2 # via # imbalanced-learn # mlxtend # scikit-learn # textacy -keras==2.8.0 +keras==2.13.1 # via tensorflow -keras-preprocessing==1.1.2 - # via tensorflow -kiwisolver==1.4.2 +kiwisolver==1.4.7 # via matplotlib -libclang==13.0.0 +langcodes==3.4.1 + # via spacy +language-data==1.2.0 + # via langcodes +libclang==18.1.1 # via tensorflow -markdown==3.3.6 +marisa-trie==1.2.1 + # via language-data +markdown==3.7 # via tensorboard -matplotlib==3.5.1 +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 # via + # jinja2 + # werkzeug +matplotlib==3.9.2 + # via + # qrmine (setup.cfg) # mlxtend - # qrmine (setup.py) -mlxtend==0.19.0 - # via qrmine (setup.py) -murmurhash==1.0.6 +mdurl==0.1.2 + # via markdown-it-py +mlxtend==0.23.2 + # via qrmine (setup.cfg) +murmurhash==1.0.10 # via # preshed # spacy # thinc -networkx==2.6.3 +networkx==3.4.2 # via textacy -numpy==1.21.5 +numpy==1.24.3 # via # blis + # contourpy + # floret # h5py # imbalanced-learn - # keras-preprocessing # matplotlib # mlxtend - # opt-einsum # pandas - # pyemd # scikit-learn # scipy # spacy @@ -119,147 +137,178 @@ 
numpy==1.21.5 # textacy # thinc # xgboost +nvidia-nccl-cu12==2.23.4 ; platform_machine != 'aarch64' and platform_system == 'Linux' + # via xgboost oauthlib==3.2.2 # via requests-oauthlib -opt-einsum==3.3.0 +opt-einsum==3.4.0 # via tensorflow -packaging==21.3 - # via matplotlib -pandas==1.3.5 - # via - # mlxtend - # qrmine (setup.py) -pillow==9.3.0 - # via matplotlib -plac==1.1.3 +packaging==24.2 # via + # matplotlib # spacy + # tensorflow # thinc -preshed==3.0.6 + # weasel +pandas==2.1.0 ; python_full_version >= '3.12' + # via + # qrmine (setup.cfg) + # mlxtend +pandas==2.2.3 ; python_full_version < '3.12' + # via + # qrmine (setup.cfg) + # mlxtend +pillow==11.0.0 + # via matplotlib +preshed==3.0.9 # via # spacy # thinc -protobuf==3.20.0 +protobuf==4.25.5 # via # tensorboard # tensorflow -pyasn1==0.4.8 +pyasn1==0.6.1 # via # pyasn1-modules # rsa -pyasn1-modules==0.2.8 +pyasn1-modules==0.4.1 # via google-auth -pyemd==0.5.1 - # via textacy -pyparsing==3.0.7 +pydantic==1.10.19 # via - # matplotlib - # packaging -pyphen==0.12.0 + # confection + # spacy + # thinc + # weasel +pygments==2.18.0 + # via rich +pyparsing==3.2.0 + # via matplotlib +pyphen==0.17.0 # via textacy -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # matplotlib # pandas -pytz==2022.1 +pytz==2024.2 # via pandas -requests==2.31.0 +requests==2.32.3 # via # requests-oauthlib # spacy # tensorboard # textacy # vadersentiment -requests-oauthlib==1.3.1 + # weasel +requests-oauthlib==2.0.0 # via google-auth-oauthlib -rsa==4.8 +rich==13.9.4 + # via typer +rsa==4.9 # via google-auth -scikit-learn==1.0.2 +scikit-learn==1.5.2 # via + # qrmine (setup.cfg) # imbalanced-learn # mlxtend - # qrmine (setup.py) # textacy -scipy==1.7.3 +scipy==1.14.1 # via # imbalanced-learn # mlxtend # scikit-learn # textacy # xgboost +setuptools==75.3.0 + # via + # marisa-trie + # spacy + # tensorboard + # tensorflow + # thinc +shellingham==1.5.4 + # via typer six==1.16.0 # via - # absl-py # astunparse - # google-auth # 
google-pasta - # grpcio - # keras-preprocessing # python-dateutil # tensorflow -spacy==2.3.7 +smart-open==7.0.5 + # via weasel +spacy==3.7.5 # via - # qrmine (setup.py) + # qrmine (setup.cfg) # textacy -srsly==1.0.5 +spacy-legacy==3.0.12 + # via spacy +spacy-loggers==1.0.5 + # via spacy +srsly==2.4.8 # via + # confection # spacy - # textacy # thinc -tensorboard==2.8.0 + # weasel +tensorboard==2.13.0 # via tensorflow -tensorboard-data-server==0.6.1 - # via tensorboard -tensorboard-plugin-wit==1.8.1 +tensorboard-data-server==0.7.2 # via tensorboard -tensorflow==2.8.0 - # via qrmine (setup.py) -tensorflow-io-gcs-filesystem==0.24.0 +tensorflow==2.13.1 + # via qrmine (setup.cfg) +tensorflow-estimator==2.13.0 # via tensorflow -termcolor==1.1.0 - # via tensorflow -textacy==0.10.0 - # via qrmine (setup.py) -tf-estimator-nightly==2.8.0.dev2021122109 +tensorflow-io-gcs-filesystem==0.31.0 + # via + # qrmine (setup.cfg) + # tensorflow +termcolor==2.5.0 # via tensorflow -thinc==7.4.5 +textacy==0.13.0 + # via qrmine (setup.cfg) +thinc==8.2.5 # via spacy -threadpoolctl==3.1.0 +threadpoolctl==3.5.0 # via # imbalanced-learn # scikit-learn -toolz==0.11.2 +toolz==1.0.0 # via cytoolz -tqdm==4.64.0 +tqdm==4.67.0 # via # spacy # textacy - # thinc -typing-extensions==4.1.1 +typer==0.13.0 # via - # importlib-metadata - # kiwisolver + # spacy + # weasel +typing-extensions==4.5.0 + # via + # pydantic # tensorflow -urllib3==1.26.9 + # typer +tzdata==2024.2 + # via pandas +urllib3==2.2.3 # via requests vadersentiment==3.3.2 - # via qrmine (setup.py) -wasabi==0.9.1 + # via qrmine (setup.cfg) +wasabi==1.1.3 # via # spacy # thinc -werkzeug==2.1.1 + # weasel +weasel==0.4.1 + # via spacy +werkzeug==3.1.3 # via tensorboard -wheel==0.37.1 +wheel==0.45.0 # via # astunparse # tensorboard -wrapt==1.14.0 - # via tensorflow -xgboost==1.5.2 - # via qrmine (setup.py) -zipp==3.8.0 - # via importlib-metadata - -# The following packages are considered to be unsafe in a requirements file: -# setuptools 
+wrapt==1.16.0 + # via + # smart-open + # tensorflow +xgboost==2.1.2 + # via qrmine (setup.cfg) diff --git a/setup.cfg b/setup.cfg index 69abafb..e6953b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,42 +1,57 @@ # This file is used to configure your project. # Read more about the various options under: -# http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html +# https://setuptools.pypa.io/en/latest/references/keywords.html [metadata] name = qrmine description = Qualitative Research support tools in Python! author = beapen author_email = github@gulfdoctor.net -license = gpl3 -url = https://github.com/dermatologist/nlp-qrmine +license = GPL-3.0-only +# license_files = LICENSE.txt +# long_description = file: README.rst +# long_description_content_type = text/x-rst; charset=UTF-8 long_description = file: README.md long_description_content_type = text/markdown +url = https://github.com/dermatologist/nlp-qrmine +# Add here related links, for example: +project_urls = + Documentation = https://arxiv.org/abs/2003.13519 +# Source = https://github.com/pyscaffold/pyscaffold/ +# Changelog = https://pyscaffold.org/en/latest/changelog.html +# Tracker = https://github.com/pyscaffold/pyscaffold/issues +# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold +# Download = https://pypi.org/project/PyScaffold/#files +# Twitter = https://twitter.com/PyScaffold + # Change if running only on Windows, Mac or Linux (comma-separated) platforms = any + # Add here all kinds of additional classifiers as defined under -# https://pypi.python.org/pypi?%3Aaction=list_classifiers +# https://pypi.org/classifiers/ classifiers = Intended Audience :: Science/Research Development Status :: 4 - Beta Operating System :: OS Independent - Programming Language :: Python - Programming Language :: Python :: 3.5 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 + 
Programming Language :: Python :: 3.11 Topic :: Scientific/Engineering :: Information Analysis + [options] zip_safe = False packages = find_namespace: include_package_data = True package_dir = =src -# Add here dependencies of your project (semicolon-separated), e.g. -# install_requires = numpy; scipy -# install_requires = numpy; pandas; matplotlib; imbalanced-learn==0.4.3; scikit-learn==0.20.4; xgboost; mlxtend; Keras; keras-text; click; vaderSentiment; spacy; textacy; tensorflow>=1.13.1 -# Format +# Require a min/specific Python version (comma-separated conditions) +# python_requires = >=3.8 +# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. +# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in +# new major versions. This works if the required packages follow Semantic Versioning. +# For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" pandas @@ -47,18 +62,10 @@ install_requires = vaderSentiment xgboost mlxtend - spacy==2.3.7 - textacy==0.10.0 - tensorflow - # below is not supported by pypi - # en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm - -# install_requires=pyemd; numpy; pandas; matplotlib; click; scikit-learn==0.20.4; imbalanced-learn; vaderSentiment; xgboost; mlxtend; spacy>=2.2.0,<3.0.0; textacy==0.8.0; tensorflow - - - -# Add here test requirements (semicolon-separated) -tests_require = pytest; pytest-cov + spacy + textacy + tensorflow<=2.13.1 + tensorflow-io-gcs-filesystem<=0.31.0 [options.packages.find] where = src @@ -70,6 +77,12 @@ exclude = # `pip install qrmine[PDF]` like: # PDF = ReportLab; RXP +# Add here test requirements (semicolon/line-separated) +testing = + setuptools + pytest + pytest-cov + [options.entry_points] # Add here console scripts like: # console_scripts = @@ -80,19 +93,15 @@ exclude = # And any other entry points, for 
example: # pyscaffold.cli = # awesome = pyscaffoldext.awesome.extension:AwesomeExtension - console_scripts = qrmine = qrmine.main:main_routine -[test] -# py.test options when running `python setup.py test` -addopts = tests - [tool:pytest] -# Options for py.test: -# Specify command line options as you would do when invoking py.test directly. +# Specify command line options as you would do when invoking pytest directly. # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml # in order to write a coverage file that can be read by Jenkins. +# CAUTION: --cov flags may prohibit setting breakpoints while debugging. +# Comment those flags to avoid this pytest issue. addopts = --verbose norecursedirs = @@ -111,14 +120,24 @@ universal = 1 source_dir = docs build_dir = docs/_build +testpaths = tests +# Use pytest markers to select/deselect specific tests +# markers = +# slow: mark tests as slow (deselect with '-m "not slow"') +# system: mark end-to-end system tests + [devpi:upload] # Options for the devpi: PyPI server and packaging tool # VCS export must be deactivated since we are using setuptools-scm -no-vcs = 1 +no_vcs = 1 formats = bdist_wheel [flake8] # Some sane defaults for the code style checker flake8 +max_line_length = 88 +extend_ignore = E203, W503 +# ^ Black-compatible +# E203 and W503 have edge cases handled by black exclude = .tox build @@ -129,5 +148,5 @@ exclude = [pyscaffold] # PyScaffold's parameters when the project was created. # This will be used when updating. Do not change! -version = 4.0.2 +version = 4.6 package = qrmine diff --git a/setup.py b/setup.py index 8a2ef86..0f264c5 100644 --- a/setup.py +++ b/setup.py @@ -2,10 +2,11 @@ Setup file for qrmine. Use setup.cfg to configure your project. - This file was generated with PyScaffold 4.0.2. + This file was generated with PyScaffold 4.6. PyScaffold helps you to put up the scaffold of your new Python project. 
Learn more under: https://pyscaffold.org/ """ + from setuptools import setup if __name__ == "__main__": diff --git a/src/qrmine/content.py b/src/qrmine/content.py index 576a246..3344a80 100644 --- a/src/qrmine/content.py +++ b/src/qrmine/content.py @@ -19,13 +19,14 @@ import operator -import en_core_web_sm +# import en_core_web_sm +import textacy class Content(object): def __init__(self, content): self._content = content - self._nlp = en_core_web_sm.load() + self._nlp = textacy.load_spacy_lang("en_core_web_sm") self._processed = self._nlp(self._content) self._lemma = {} self._pos = {} diff --git a/src/qrmine/mlqrmine.py b/src/qrmine/mlqrmine.py index 68ee457..12b75a3 100644 --- a/src/qrmine/mlqrmine.py +++ b/src/qrmine/mlqrmine.py @@ -100,7 +100,7 @@ def read_csv(self): def mark_missing(self): self._dataset_original = self._dataset - self._dataset = self._dataset.replace('', numpy.NaN) + self._dataset = self._dataset.replace('', numpy.nan) self._dataset.dropna(inplace=True) def restore_mark_missing(self): diff --git a/src/qrmine/network.py b/src/qrmine/network.py index 7f4f2f2..c012637 100644 --- a/src/qrmine/network.py +++ b/src/qrmine/network.py @@ -1,6 +1,4 @@ -import textacy.network - - +import textacy.viz.network as network class Network(object): def __init__(self): @@ -10,15 +8,15 @@ def __init__(self): self._axis = None def sents_to_network(self, sents): - self._graph = textacy.network.sents_to_semantic_network(sents, normalize='lemma', edge_weighting='cosine') + self._graph = network.sents_to_semantic_network(sents, normalize='lemma', edge_weighting='cosine') return self._graph def terms_to_network(self, terms): - self._graph = textacy.network.terms_to_semantic_network(terms, normalize='lemma', edge_weighting='cosine') + self._graph = network.terms_to_semantic_network(terms, normalize='lemma', edge_weighting='cosine') return self._graph def draw_graph(self, draw=False): - self._axis = textacy.viz.network.draw_semantic_network(self._graph, node_weights=None, 
spread=3.0, + self._axis = network.draw_semantic_network(self._graph, node_weights=None, spread=3.0, draw_nodes=draw, base_node_size=300, node_alpha=0.25, line_width=0.5, line_alpha=0.1, diff --git a/src/qrmine/nlp_qrmine.py b/src/qrmine/nlp_qrmine.py index 2e131b9..44209c9 100644 --- a/src/qrmine/nlp_qrmine.py +++ b/src/qrmine/nlp_qrmine.py @@ -1,6 +1,6 @@ import subprocess import textacy -from textacy.vsm.vectorizers import Vectorizer +from textacy.representations.vectorizers import Vectorizer import textacy.tm from textacy import preprocessing @@ -15,7 +15,7 @@ def __init__(self): self._min_occurrence_for_topic = 2 self._common_verbs = 10 # create an empty corpus - self._en = textacy.load_spacy_lang('en_core_web_sm', disable=('parser',)) + self._en = textacy.load_spacy_lang('en_core_web_sm') self._corpus = textacy.Corpus(lang=self._en) self._content = None self._model = None @@ -24,8 +24,8 @@ def __init__(self): self._terms = None self._doc_term_matrix = None self._doc_topic_matrix = None - self._vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth', - norm='l2', min_df=3, max_df=0.95, max_n_terms=100000) + self._vectorizer = Vectorizer(tf_type='linear', idf_type='smooth', + norm='l2', min_df=2, max_df=0.95, max_n_terms=100000) @property def content(self): @@ -58,9 +58,9 @@ def get_git_revision_short_hash(self): # return subprocess.check_output(['git', 'log', '-1', '--format=%cd']).strip().decode("utf-8")[10:] def print_categories(self, doc, num=10): - bot = doc._.to_bag_of_terms(ngrams=(1, 2, 3), named_entities=False, normalize='lemma', weighting='freq', - as_strings=True, filter_stops=True, filter_punct=True, filter_nums=True, min_freq=2, - drop_determiners=True, include_types=["NOUN", "VERB"]) + textacy.spacier.extensions.set_doc_extensions("extract.bags") + bot = doc._.to_bag_of_terms(by='lemma_', weighting='freq', + ngs=(1,2,3), ents=True, ncs=True, dedupe=True) categories = sorted(bot.items(), key=lambda x: x[1], reverse=True)[:num] 
output = [] to_return = [] @@ -86,7 +86,7 @@ def category_basket(self, num=10): for index, title in enumerate(self._content.titles): # QRMines content should be set content = self._content.documents[index] this_record = Content(content) - doc = textacy.make_spacy_doc(this_record.doc) + doc = textacy.make_spacy_doc(this_record.doc, lang=self._en) item_basket.append(self.print_categories(doc, num)) return item_basket # Example return: @@ -199,7 +199,8 @@ def process_content(self): # 2-Jan-2020 textacy new version, breaking change # replace numbers with NUM, remove punct and convert to lower case - doc_text = preprocessing.replace.replace_numbers(preprocessing.remove.remove_punctuation(document), 'NUM').lower() + # doc_text = preprocessing.replace.replace_numbers(preprocessing.remove.remove_punctuation(document), 'NUM').lower() + doc_text = preprocessing.replace.numbers(preprocessing.remove.punctuation(document)).lower() doc = textacy.make_spacy_doc((doc_text, metadata), lang=self._en) self._corpus.add_doc(doc) @@ -216,8 +217,8 @@ def filter_content(self, titles): # textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True), # metadata=metadata) #doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True) - doc_text = preprocessing.replace.replace_numbers(preprocessing.remove.remove_punctuation(document), 'NUM').lower() - + # doc_text = preprocessing.replace.replace_numbers(preprocessing.remove.remove_punctuation(document), 'NUM').lower() + doc_text = preprocessing.replace.numbers(preprocessing.remove.punctuation(document)).lower() doc = textacy.make_spacy_doc((doc_text, metadata), lang=self._en) self._corpus.add_doc(doc) @@ -226,16 +227,15 @@ def filter_content(self, titles): self.load_matrix() def load_matrix(self): - self._doc_term_matrix = self._vectorizer.fit_transform( - (documents._.to_terms_list(ngrams=(1, 2, 3), named_entities=True, - as_strings=True, filter_stops=True, - filter_punct=True, 
filter_nums=True, - min_freq=2) - for documents in self._corpus.docs)) + textacy.spacier.extensions.set_doc_extensions("extract.keyterms") + terms = ((term.text for term in textacy.extract.terms(doc, ngs=1, ents=True))for doc in self._corpus.docs) + self._doc_term_matrix = self._vectorizer.fit_transform(terms) self._numdocs, self._terms = self._doc_term_matrix.shape - self._model = textacy.tm.TopicModel('nmf', n_topics=self._numdocs) + self._model = textacy.tm.TopicModel('lda', n_topics=self._numdocs) self._model.fit(self._doc_term_matrix) + try: + self._doc_topic_matrix = self._model.transform(self._doc_term_matrix) - self._doc_topic_matrix = self._model.transform(self._doc_term_matrix) - - _, self._numtopics = self._doc_topic_matrix.shape + _, self._numtopics = self._doc_topic_matrix.shape + except ValueError: + print("No topics found") diff --git a/tests/conftest.py b/tests/conftest.py index 0b879b1..a08aa23 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,10 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- """ Dummy conftest.py for qrmine. If you don't know what this is for, just leave it empty. 
Read more about conftest.py under: - https://pytest.org/latest/plugins.html + - https://docs.pytest.org/en/stable/fixture.html + - https://docs.pytest.org/en/stable/writing_plugins.html """ -from __future__ import print_function, absolute_import, division # import pytest diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 1589d48..4ad331d 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -25,6 +25,7 @@ def test_generate_dict(corpus_fixture, capsys, q): all_interviews = Content(corpus_fixture.content) q.print_dict(all_interviews, num) captured = capsys.readouterr() + print(captured.out) assert 'code' in captured.out def test_generate_topics(corpus_fixture, capsys, q): @@ -32,18 +33,21 @@ def test_generate_topics(corpus_fixture, capsys, q): q.process_content() q.print_topics() captured = capsys.readouterr() + print(captured.out) assert 'TOPIC' in captured.out def test_category_basket(corpus_fixture, capsys, q): q.content = corpus_fixture print(q.category_basket()) captured = capsys.readouterr() + print(captured.out) assert 'theory' in captured.out def test_category_association(corpus_fixture, capsys, q): q.content = corpus_fixture print(q.category_association()) captured = capsys.readouterr() + print(captured.out) assert 'theory' in captured.out diff --git a/tox.ini b/tox.ini index 18d1eb4..3eb707d 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ [tox] minversion = 2.4 -envlist = py37, integration +envlist = py311, integration [testenv] setenv = TOXINIDIR = {toxinidir}