diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index f742724..3693dc2 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -13,27 +13,27 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: ["3.11"] os: [ubuntu-latest, macos-13, windows-latest] runs-on: ${{ matrix.os }} timeout-minutes: 20 steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + - name: Install uv + uses: astral-sh/setup-uv@v5 with: - python-version: ${{ matrix.python-version }} - cache: 'pip' # caching pip dependencies + enable-cache: true + - name: "Set up Python" + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" - name: run on mac if: startsWith(matrix.os, 'mac') run: | brew install libomp - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - python -m spacy download en_core_web_sm - - name: Test with pytest + - name: Install the project run: | - pip install pytest - pytest + uv sync --all-extras --dev + uv pip install pip + uv run python -m spacy download en_core_web_sm + - name: Run tests + run: uv run pytest tests diff --git a/.gitignore b/.gitignore index 64049e7..c29a2a9 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ __pycache__/* .idea .venv conda +uv.lock # Package files *.egg diff --git a/dev-requirements.in b/dev-requirements.in deleted file mode 100644 index 2b56355..0000000 --- a/dev-requirements.in +++ /dev/null @@ -1,11 +0,0 @@ -# dev-requirements.in --c requirements.txt -pytest-cov -pytest -recommonmark -sphinx>=3.2.1 -setuptools -setuptools_scm -wheel>=0.37.0 # conflicts with dependency of tensorflow -tox -pip-tools \ No newline at end of file diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index f36f95c..0000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,146 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile dev-requirements.in -o dev-requirements.txt --universal -alabaster==1.0.0 - # via sphinx -babel==2.16.0 - # via sphinx -build==1.2.2.post1 - # via pip-tools -cachetools==5.5.0 - # via - # -c requirements.txt - # tox -certifi==2024.8.30 - # via - # -c requirements.txt - # requests -chardet==5.2.0 - # via tox -charset-normalizer==3.4.0 - # via - # -c requirements.txt - # requests -click==8.1.7 - # via - # -c requirements.txt - # pip-tools -colorama==0.4.6 - # via - # -c requirements.txt - # build - # click - # pytest - # sphinx - # tox -commonmark==0.9.1 - # via recommonmark -coverage==7.6.4 - # via pytest-cov -distlib==0.3.9 - # via virtualenv -docutils==0.21.2 - # via - # recommonmark - # sphinx -filelock==3.16.1 - # via - # tox - # virtualenv -idna==3.10 - # via - # -c requirements.txt - # requests -imagesize==1.4.1 - # via sphinx -iniconfig==2.0.0 - # via pytest -jinja2==3.1.4 - # via - # -c requirements.txt - # sphinx -markupsafe==3.0.2 - # via - # -c requirements.txt - # jinja2 -packaging==24.2 - # via - # -c requirements.txt - # build - # pyproject-api - # pytest - # setuptools-scm - # sphinx - # tox -pip==24.3.1 - # via pip-tools -pip-tools==7.4.1 - # via -r dev-requirements.in -platformdirs==4.3.6 - # via - # tox - # virtualenv -pluggy==1.5.0 - # via - # pytest - # tox -pygments==2.18.0 - # via - # -c requirements.txt - # sphinx -pyproject-api==1.8.0 - # via tox -pyproject-hooks==1.2.0 - # via - # build - # pip-tools -pytest==8.3.3 - # via - # -r dev-requirements.in - # pytest-cov -pytest-cov==6.0.0 - # via -r 
dev-requirements.in
-recommonmark==0.7.1
-    # via -r dev-requirements.in
-requests==2.32.3
-    # via
-    #   -c requirements.txt
-    #   sphinx
-setuptools==75.3.0
-    # via
-    #   -c requirements.txt
-    #   -r dev-requirements.in
-    #   pip-tools
-    #   setuptools-scm
-setuptools-scm==8.1.0
-    # via -r dev-requirements.in
-snowballstemmer==2.2.0
-    # via sphinx
-sphinx==8.1.3
-    # via
-    #   -r dev-requirements.in
-    #   recommonmark
-sphinxcontrib-applehelp==2.0.0
-    # via sphinx
-sphinxcontrib-devhelp==2.0.0
-    # via sphinx
-sphinxcontrib-htmlhelp==2.1.0
-    # via sphinx
-sphinxcontrib-jsmath==1.0.1
-    # via sphinx
-sphinxcontrib-qthelp==2.0.0
-    # via sphinx
-sphinxcontrib-serializinghtml==2.0.0
-    # via sphinx
-tox==4.23.2
-    # via -r dev-requirements.in
-urllib3==2.2.3
-    # via
-    #   -c requirements.txt
-    #   requests
-virtualenv==20.27.1
-    # via tox
-wheel==0.45.0
-    # via
-    #   -c requirements.txt
-    #   -r dev-requirements.in
-    #   pip-tools
diff --git a/notes/conda.md b/notes/conda.md
new file mode 100644
index 0000000..79eb6c8
--- /dev/null
+++ b/notes/conda.md
@@ -0,0 +1,12 @@
+conda create --name qrmine python=3.11
+conda activate qrmine
+
+conda install conda-forge::uv
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
+uv pip install -e .
+python -m spacy download en_core_web_sm
+
+
+
+pip3 install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
\ No newline at end of file
diff --git a/notes/new-process.md b/notes/new-process.md
new file mode 100644
index 0000000..1ead749
--- /dev/null
+++ b/notes/new-process.md
@@ -0,0 +1,33 @@
+conda install conda-forge::uv
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
+
+delete setup.cfg
+delete requirements.txt, dev-requirements.txt, dev-requirements.in
+remove deps from tox.ini
+
+uv pip install -e .
+see pr.yml for GitHub actions
+see pyproject.toml for pytorch cpu install
+uv pip install -e .
+
+uv sync --all-extras --dev
+uv pip install pip
+uv run python -m spacy download en_core_web_sm
+
+pyproject.toml
+requires = ["setuptools>=61.2", "wheel", "pip"]
+
+dev = [
+    "setuptools",
+    "setuptools_scm",
+    "pytest",
+    "pytest-cov",
+    "tox",
+    "black",
+    "recommonmark",
+    "sphinx",
+    "wheel",
+    "twine",
+]
+
diff --git a/notes/pip-tools.md b/notes/pip-tools.md
index da4baa4..c504a1e 100644
--- a/notes/pip-tools.md
+++ b/notes/pip-tools.md
@@ -21,4 +21,7 @@ OR
 
 * pip install uv
 * uv pip compile setup.cfg -o requirements.txt --universal
-* uv pip compile dev-requirements.in -o dev-requirements.txt --universal
\ No newline at end of file
+* uv pip compile dev-requirements.in -o dev-requirements.txt --universal
+
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 89a5bed..9fc3688 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,186 @@
 [build-system]
-# AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD!
-requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"]
+requires = ["setuptools>=61.2", "wheel", "pip"]
 build-backend = "setuptools.build_meta"
 
-[tool.setuptools_scm]
-# For smarter version schemes and other configuration options,
-# check out https://github.com/pypa/setuptools_scm
-version_scheme = "no-guess-dev"
+[project]
+name = "qrmine"
+description = "Qualitative Research support tools in Python!"
+authors = [{name = "beapen", email = "github@gulfdoctor.net"}]
+license = {text = "GPL-3.0-only"}
+# license_files = LICENSE.txt
+# long_description = file: README.rst
+# long_description_content_type = text/x-rst; charset=UTF-8
+classifiers = [
+    "Intended Audience :: Science/Research",
+    "Development Status :: 4 - Beta",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+]
+requires-python = ">=3.11, <3.12"
+dependencies = [
+    'importlib-metadata; python_version<"3.8"',
+    "pandas",
+    "matplotlib",
+    "click",
+    "scikit-learn",
+    "imbalanced-learn",
+    "vaderSentiment",
+    "xgboost",
+    "mlxtend",
+    "spacy",
+    "textacy",
+    "torch==2.2.2",
+    "pypdf",
+    "requests",
+    "gensim",
+    "seaborn",
+    "wordcloud",
+]
+dynamic = ["version"]
+
+[project.readme]
+file = "README.md"
+content-type = "text/markdown"
+# Add here related links, for example:
+
+[project.urls]
+Homepage = "https://github.com/dermatologist/nlp-qrmine"
+Documentation = "https://arxiv.org/abs/2003.13519"
+# Source = https://github.com/pyscaffold/pyscaffold/
+# Changelog = https://pyscaffold.org/en/latest/changelog.html
+# Tracker = https://github.com/pyscaffold/pyscaffold/issues
+# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
+# Download = https://pypi.org/project/PyScaffold/#files
+# Twitter = https://twitter.com/PyScaffold
+# Change if running only on Windows, Mac or Linux (comma-separated)
+# Add here all kinds of additional classifiers as defined under
+# https://pypi.org/classifiers/
+
+[project.optional-dependencies]
+# Add here additional requirements for extra features, to install with:
+# `pip install qrmine[PDF]` like:
+# PDF = ReportLab; RXP
+# Add here test requirements (semicolon/line-separated)
+testing = [
+    "setuptools",
+    "pytest",
+    "pytest-cov",
+]
+
+dev = [
+    "setuptools",
+    "setuptools_scm",
+    "pytest",
+    "pytest-cov",
+    "tox",
+    "black",
+    "recommonmark",
+    "sphinx",
+    "wheel",
+    "twine",
+]
+
+[project.entry-points]
+# Add here console scripts like:
+# console_scripts =
+#     script_name = qrmine.module:function
+# For example:
+# console_scripts =
+#     fibonacci = qrmine.skeleton:run
+# And any other entry points, for example:
+# pyscaffold.cli =
+#     awesome = pyscaffoldext.awesome.extension:AwesomeExtension
+
+[project.scripts]
+qrmine = "qrmine.main:main_routine"
+
+[tool.setuptools]
+zip-safe = false
+include-package-data = true
+package-dir = {"" = "src"}
+# Require a min/specific Python version (comma-separated conditions)
+# python_requires = >=3.8
+# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
+# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
+# new major versions. This works if the required packages follow Semantic Versioning.
+# For more information, check out https://semver.org/.
+platforms = ["any"]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+exclude = ["tests"]
+namespaces = true
+
+[tool.pytest.ini_options]
+# Specify command line options as you would do when invoking pytest directly.
+# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
+# in order to write a coverage file that can be read by Jenkins.
+# CAUTION: --cov flags may prohibit setting breakpoints while debugging.
+# Comment those flags to avoid this pytest issue.
+addopts = """ +--verbose""" +norecursedirs = [ + "dist", + "build", + ".tox", +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu" }, +] +torchvision = [ + { index = "pytorch-cpu" }, +] + +[tool.aliases] +release = "sdist bdist_wheel upload" + +[tool.distutils.bdist_wheel] +# Use this option if your package is pure-python +universal = 1 + +[tool.build_sphinx] +source_dir = "docs" +build_dir = "docs/_build" +testpaths = "tests" +# Use pytest markers to select/deselect specific tests +# markers = +# slow: mark tests as slow (deselect with '-m "not slow"') +# system: mark end-to-end system tests + +[tool.devpi.upload] +# Options for the devpi: PyPI server and packaging tool +# VCS export must be deactivated since we are using setuptools-scm +no_vcs = "1" +formats = "bdist_wheel" + +[tool.flake8] +# Some sane defaults for the code style checker flake8 +max_line_length = "88" +extend_ignore = "E203, W503" +# ^ Black-compatible +# E203 and W503 have edge cases handled by black +exclude = """ +.tox +build +dist +.eggs +docs/conf.py""" + +[tool.pyscaffold] +# PyScaffold's parameters when the project was created. +# This will be used when updating. Do not change! +version = "4.6" +package = "qrmine" +# This file is used to configure your project. +# Read more about the various options under: +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html +# https://setuptools.pypa.io/en/latest/references/keywords.html + diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 260d413..0000000 --- a/requirements.txt +++ /dev/null @@ -1,314 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile setup.cfg -o requirements.txt --universal -absl-py==2.1.0 - # via - # tensorboard - # tensorflow -astunparse==1.6.3 - # via tensorflow -blis==0.7.11 - # via thinc -cachetools==5.5.0 - # via - # google-auth - # textacy -catalogue==2.0.10 - # via - # spacy - # srsly - # textacy - # thinc -certifi==2024.8.30 - # via requests -charset-normalizer==3.4.0 - # via requests -click==8.1.7 - # via - # qrmine (setup.cfg) - # typer -cloudpathlib==0.20.0 - # via weasel -colorama==0.4.6 ; sys_platform == 'win32' or platform_system == 'Windows' - # via - # click - # tqdm - # wasabi -confection==0.1.5 - # via - # thinc - # weasel -contourpy==1.3.0 - # via matplotlib -cycler==0.12.1 - # via matplotlib -cymem==2.0.8 - # via - # preshed - # spacy - # thinc -cytoolz==1.0.0 - # via textacy -flatbuffers==24.3.25 - # via tensorflow -floret==0.10.5 - # via textacy -fonttools==4.54.1 - # via matplotlib -gast==0.4.0 - # via tensorflow -google-auth==2.36.0 - # via - # google-auth-oauthlib - # tensorboard -google-auth-oauthlib==1.0.0 - # via tensorboard -google-pasta==0.2.0 - # via tensorflow -grpcio==1.67.1 - # via - # tensorboard - # tensorflow -h5py==3.12.1 - # via tensorflow -idna==3.10 - # via requests -imbalanced-learn==0.12.4 - # via qrmine (setup.cfg) -jellyfish==1.1.0 - # via textacy -jinja2==3.1.6 - # via spacy -joblib==1.4.2 - # via - # imbalanced-learn - # mlxtend - # scikit-learn - # textacy -keras==2.13.1 - # via tensorflow -kiwisolver==1.4.7 - # via matplotlib -langcodes==3.4.1 - # via spacy -language-data==1.2.0 - # via langcodes -libclang==18.1.1 - # via tensorflow -marisa-trie==1.2.1 - # via language-data -markdown==3.7 - # via tensorboard -markdown-it-py==3.0.0 - # via rich -markupsafe==3.0.2 - # via - # jinja2 - # werkzeug 
-matplotlib==3.9.2 - # via - # qrmine (setup.cfg) - # mlxtend -mdurl==0.1.2 - # via markdown-it-py -mlxtend==0.23.2 - # via qrmine (setup.cfg) -murmurhash==1.0.10 - # via - # preshed - # spacy - # thinc -networkx==3.4.2 - # via textacy -numpy==1.24.3 - # via - # blis - # contourpy - # floret - # h5py - # imbalanced-learn - # matplotlib - # mlxtend - # pandas - # scikit-learn - # scipy - # spacy - # tensorboard - # tensorflow - # textacy - # thinc - # xgboost -nvidia-nccl-cu12==2.23.4 ; platform_machine != 'aarch64' and platform_system == 'Linux' - # via xgboost -oauthlib==3.2.2 - # via requests-oauthlib -opt-einsum==3.4.0 - # via tensorflow -packaging==24.2 - # via - # matplotlib - # spacy - # tensorflow - # thinc - # weasel -pandas==2.1.0 ; python_full_version >= '3.12' - # via - # qrmine (setup.cfg) - # mlxtend -pandas==2.2.3 ; python_full_version < '3.12' - # via - # qrmine (setup.cfg) - # mlxtend -pillow==11.0.0 - # via matplotlib -preshed==3.0.9 - # via - # spacy - # thinc -protobuf==4.25.5 - # via - # tensorboard - # tensorflow -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pydantic==1.10.19 - # via - # confection - # spacy - # thinc - # weasel -pygments==2.18.0 - # via rich -pyparsing==3.2.0 - # via matplotlib -pyphen==0.17.0 - # via textacy -python-dateutil==2.9.0.post0 - # via - # matplotlib - # pandas -pytz==2024.2 - # via pandas -requests==2.32.3 - # via - # requests-oauthlib - # spacy - # tensorboard - # textacy - # vadersentiment - # weasel -requests-oauthlib==2.0.0 - # via google-auth-oauthlib -rich==13.9.4 - # via typer -rsa==4.9 - # via google-auth -scikit-learn==1.5.2 - # via - # qrmine (setup.cfg) - # imbalanced-learn - # mlxtend - # textacy -scipy==1.14.1 - # via - # imbalanced-learn - # mlxtend - # scikit-learn - # textacy - # xgboost -setuptools==75.3.0 - # via - # marisa-trie - # spacy - # tensorboard - # tensorflow - # thinc -shellingham==1.5.4 - # via typer -six==1.16.0 - # via - # astunparse - # google-pasta - # python-dateutil - # tensorflow -smart-open==7.0.5 - # via weasel -spacy==3.7.5 - # via - # qrmine (setup.cfg) - # textacy -spacy-legacy==3.0.12 - # via spacy -spacy-loggers==1.0.5 - # via spacy -srsly==2.4.8 - # via - # confection - # spacy - # thinc - # weasel -tensorboard==2.13.0 - # via tensorflow -tensorboard-data-server==0.7.2 - # via tensorboard -tensorflow==2.13.1 - # via qrmine (setup.cfg) -tensorflow-estimator==2.13.0 - # via tensorflow -tensorflow-io-gcs-filesystem==0.31.0 - # via - # qrmine (setup.cfg) - # tensorflow -termcolor==2.5.0 - # via tensorflow -textacy==0.13.0 - # via qrmine (setup.cfg) -thinc==8.2.5 - # via spacy -threadpoolctl==3.5.0 - # via - # imbalanced-learn - # scikit-learn -toolz==1.0.0 - # via cytoolz -tqdm==4.67.0 - # via - # spacy - # textacy -typer==0.13.0 - # via - # spacy - # weasel -typing-extensions==4.5.0 - # via - # pydantic - # tensorflow - # typer -tzdata==2024.2 - # via pandas -urllib3==2.2.3 - # via requests -vadersentiment==3.3.2 - # via qrmine (setup.cfg) -wasabi==1.1.3 - # via - # spacy - # thinc - # weasel -weasel==0.4.1 - # via spacy -werkzeug==3.1.3 - # via tensorboard -wheel==0.45.0 - # via - # astunparse - # tensorboard -wrapt==1.16.0 - # via - # smart-open - # tensorflow -xgboost==2.1.2 - # via qrmine (setup.cfg) diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index e6953b9..0000000 --- a/setup.cfg +++ /dev/null @@ -1,152 +0,0 @@ -# This file is used to configure your project. 
-# Read more about the various options under: -# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html -# https://setuptools.pypa.io/en/latest/references/keywords.html - -[metadata] -name = qrmine -description = Qualitative Research support tools in Python! -author = beapen -author_email = github@gulfdoctor.net -license = GPL-3.0-only -# license_files = LICENSE.txt -# long_description = file: README.rst -# long_description_content_type = text/x-rst; charset=UTF-8 -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/dermatologist/nlp-qrmine -# Add here related links, for example: -project_urls = - Documentation = https://arxiv.org/abs/2003.13519 -# Source = https://github.com/pyscaffold/pyscaffold/ -# Changelog = https://pyscaffold.org/en/latest/changelog.html -# Tracker = https://github.com/pyscaffold/pyscaffold/issues -# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold -# Download = https://pypi.org/project/PyScaffold/#files -# Twitter = https://twitter.com/PyScaffold - -# Change if running only on Windows, Mac or Linux (comma-separated) -platforms = any - -# Add here all kinds of additional classifiers as defined under -# https://pypi.org/classifiers/ -classifiers = - Intended Audience :: Science/Research - Development Status :: 4 - Beta - Operating System :: OS Independent - Programming Language :: Python :: 3.11 - Topic :: Scientific/Engineering :: Information Analysis - - -[options] -zip_safe = False -packages = find_namespace: -include_package_data = True -package_dir = - =src - -# Require a min/specific Python version (comma-separated conditions) -# python_requires = >=3.8 - -# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. -# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in -# new major versions. This works if the required packages follow Semantic Versioning. -# For more information, check out https://semver.org/. -install_requires = - importlib-metadata; python_version<"3.8" - pandas - matplotlib - click - scikit-learn - imbalanced-learn - vaderSentiment - xgboost - mlxtend - spacy - textacy - tensorflow<=2.13.1 - tensorflow-io-gcs-filesystem<=0.31.0 - -[options.packages.find] -where = src -exclude = - tests - -[options.extras_require] -# Add here additional requirements for extra features, to install with: -# `pip install qrmine[PDF]` like: -# PDF = ReportLab; RXP - -# Add here test requirements (semicolon/line-separated) -testing = - setuptools - pytest - pytest-cov - -[options.entry_points] -# Add here console scripts like: -# console_scripts = -# script_name = qrmine.module:function -# For example: -# console_scripts = -# fibonacci = qrmine.skeleton:run -# And any other entry points, for example: -# pyscaffold.cli = -# awesome = pyscaffoldext.awesome.extension:AwesomeExtension -console_scripts = - qrmine = qrmine.main:main_routine - -[tool:pytest] -# Specify command line options as you would do when invoking pytest directly. -# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml -# in order to write a coverage file that can be read by Jenkins. -# CAUTION: --cov flags may prohibit setting breakpoints while debugging. -# Comment those flags to avoid this pytest issue. 
-addopts =
-    --verbose
-norecursedirs =
-    dist
-    build
-    .tox
-
-[aliases]
-release = sdist bdist_wheel upload
-
-[bdist_wheel]
-# Use this option if your package is pure-python
-universal = 1
-
-[build_sphinx]
-source_dir = docs
-build_dir = docs/_build
-
-testpaths = tests
-# Use pytest markers to select/deselect specific tests
-# markers =
-#     slow: mark tests as slow (deselect with '-m "not slow"')
-#     system: mark end-to-end system tests
-
-[devpi:upload]
-# Options for the devpi: PyPI server and packaging tool
-# VCS export must be deactivated since we are using setuptools-scm
-no_vcs = 1
-formats = bdist_wheel
-
-[flake8]
-# Some sane defaults for the code style checker flake8
-max_line_length = 88
-extend_ignore = E203, W503
-# ^ Black-compatible
-#   E203 and W503 have edge cases handled by black
-exclude =
-    .tox
-    build
-    dist
-    .eggs
-    docs/conf.py
-
-[pyscaffold]
-# PyScaffold's parameters when the project was created.
-# This will be used when updating. Do not change!
-version = 4.6
-package = qrmine
diff --git a/src/qrmine/__init__.py b/src/qrmine/__init__.py
index 09a4e35..3549721 100644
--- a/src/qrmine/__init__.py
+++ b/src/qrmine/__init__.py
@@ -6,6 +6,8 @@ from .readfiles import ReadData
 from .sentiment import Sentiment
 from .mlqrmine import MLQRMine
+from .cluster import ClusterDocs
+from .visualize import QRVisualize
 
 if sys.version_info[:2] >= (3, 8):
     # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
new file mode 100644
index 0000000..3e68ac3
--- /dev/null
+++ b/src/qrmine/cluster.py
@@ -0,0 +1,189 @@
+"""
+Copyright (C) 2025 Bell Eapen
+
+This file is part of qrmine.
+
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with qrmine. If not, see <https://www.gnu.org/licenses/>.
+""" + +from pprint import pprint + +import pandas as pd +import spacy +from gensim import corpora +from gensim.models.ldamodel import LdaModel + + +class ClusterDocs: + + def __init__(self, documents=[], titles=[]): + self._nlp = spacy.load("en_core_web_sm") + self._documents = documents + self._titles = titles + self._num_topics = 5 + self._passes = 15 + self._dictionary = None + self._corpus = None + self._lda_model = None + # Apply preprocessing to each document + self._processed_docs = [self.preprocess(doc) for doc in documents] + self.process() + + @property + def documents(self): + return self._documents + + @property + def titles(self): + return self._titles + + @property + def num_topics(self): + return self._num_topics + + @property + def passes(self): + return self._passes + + @property + def processed_docs(self): + return self._processed_docs + + @documents.setter + def documents(self, documents): + self._documents = documents + self._processed_docs = [self.preprocess(doc) for doc in documents] + self.process() + + @titles.setter + def titles(self, titles): + self._titles = titles + + @num_topics.setter + def num_topics(self, num_topics): + self._num_topics = num_topics + + @passes.setter + def passes(self, passes): + self._passes = passes + + # Preprocess the documents using spaCy + def preprocess(self, doc): + # Tokenize and preprocess each document + doc = self._nlp(doc) + # Lemmatize and remove stop words + tokens = [token.lemma_ for token in doc if not token.is_stop] + return tokens + + def process(self): + # Create a dictionary representation of the documents + self._dictionary = corpora.Dictionary(self._processed_docs) + # Create a bag-of-words representation of the documents + self._corpus = [self._dictionary.doc2bow(doc) for doc in self._processed_docs] + # Build the LDA (Latent Dirichlet Allocation) model + + def build_lda_model(self): + if self._lda_model is None: + self._lda_model = LdaModel( + self._corpus, + num_topics=self._num_topics, + id2word=self._dictionary, + passes=self._passes, + ) + return self._lda_model.show_topics(formatted=False) + + def print_topics(self, num_words=5): + if self._lda_model is None: + self.build_lda_model() + # Print the topics and their corresponding words + pprint(self._lda_model.print_topics(num_words=num_words)) + + def print_clusters(self): + if self._lda_model is None: + self.build_lda_model() + # Perform semantic clustering + for i, doc in enumerate( + self._processed_docs + ): # Changed from get_processed_docs() to _documents + bow = self._dictionary.doc2bow(doc) + print( + f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}" + ) + + def format_topics_sentences(self): + self.build_lda_model() + # Init output + sent_topics_df = pd.DataFrame() + + # Get main topic in each document + for i, row_list in enumerate(self._lda_model[self._corpus]): + row = row_list[0] if self._lda_model.per_word_topics else row_list + # print(row) + row = sorted(row, key=lambda x: (x[1]), reverse=True) + # Get the Dominant topic, Perc Contribution and Keywords for each document + for j, (topic_num, prop_topic) in enumerate(row): + if j == 0: # => dominant topic + wp = self._lda_model.show_topic(topic_num) + topic_keywords = ", ".join([word for word, prop in wp]) + new_row = pd.DataFrame( + [[int(topic_num), round(prop_topic, 4), topic_keywords]], + columns=[ + "Dominant_Topic", + "Perc_Contribution", + "Topic_Keywords", + ], + ) + sent_topics_df = pd.concat( + [sent_topics_df, new_row], ignore_index=True + ) + 
else: + break + sent_topics_df.columns = [ + "Dominant_Topic", + "Perc_Contribution", + "Topic_Keywords", + ] + + # Add original text to the end of the output + contents = pd.Series(self._processed_docs) + sent_topics_df = pd.concat([sent_topics_df, contents], axis=1) + return sent_topics_df.reset_index(drop=False) + + # https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/ + def most_representative_docs(self): + sent_topics_df = self.format_topics_sentences() + sent_topics_sorteddf_mallet = pd.DataFrame() + sent_topics_outdf_grpd = sent_topics_df.groupby("Dominant_Topic") + + for i, grp in sent_topics_outdf_grpd: + sent_topics_sorteddf_mallet = pd.concat( + [ + sent_topics_sorteddf_mallet, + grp.sort_values(["Perc_Contribution"], ascending=False).head(1), + ], + axis=0, + ) + + return sent_topics_sorteddf_mallet + + def topics_per_document(self, start=0, end=1): + corpus_sel = self._corpus[start:end] + dominant_topics = [] + topic_percentages = [] + for i, corp in enumerate(corpus_sel): + topic_percs = self._lda_model[corp] + dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0] + dominant_topics.append((i, dominant_topic)) + topic_percentages.append(topic_percs) + return (dominant_topics, topic_percentages) diff --git a/src/qrmine/content.py b/src/qrmine/content.py index 3344a80..f9e6b0e 100644 --- a/src/qrmine/content.py +++ b/src/qrmine/content.py @@ -87,6 +87,10 @@ def idx(self, token): def doc(self): return self._processed + @property + def tokens(self): + return [token for token in self._processed if not token.is_stop and not token.is_punct and not token.is_space] + def process(self): for token in self._processed: if token.is_stop or token.is_digit or token.is_punct or token.is_space: diff --git a/src/qrmine/mlqrmine.py b/src/qrmine/mlqrmine.py index 12b75a3..fcfac7a 100644 --- a/src/qrmine/mlqrmine.py +++ b/src/qrmine/mlqrmine.py @@ -1,13 +1,10 @@ import numpy from imblearn.over_sampling import RandomOverSampler -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Dense -from numpy import random, argsort, sqrt, array, ones from pandas import read_csv from sklearn.cluster import KMeans from sklearn.metrics import confusion_matrix from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder, OneHotEncoder +from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC from sklearn.neighbors import KDTree @@ -17,6 +14,25 @@ from mlxtend.frequent_patterns import apriori from mlxtend.frequent_patterns import association_rules +import torch.nn as nn +import torch.optim as optim +import torch +from torch.utils.data import DataLoader, TensorDataset +class NeuralNet(nn.Module): + def __init__(self, input_dim): + super(NeuralNet, self).__init__() + self.fc1 = nn.Linear(input_dim, 12) + self.fc2 = nn.Linear(12, 8) + self.fc3 = nn.Linear(8, 1) + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.sigmoid(self.fc3(x)) + return x + class MLQRMine(object): @@ -24,13 +40,13 @@ def __init__(self): self._seed = randint(1, 9) self._csvfile = "" self._titles = None + self._model = None self._dataset = None self._X = None self._y = None self._X_original = None self._y_original = None self._dataset_original = None - self._model = Sequential() self._sc = StandardScaler() self._vnum = 0 # Number of variables 
self._classifier = XGBClassifier()
@@ -147,22 +163,58 @@ def prepare_data(self, oversample=False):
             self.oversample()
 
     def get_nnet_predictions(self):
-        self._model.add(Dense(12, input_dim=self._vnum, kernel_initializer='uniform', activation='relu'))
-        self._model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
-        self._model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
-        # Compile model
-        self._model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
-        # Fit the model
-        self._model.fit(self._X, self._y, epochs=self._epochs, batch_size=10, verbose=2)
-
-        # calculate predictions
-        predictions = self._model.predict(self._X_original)
-        # round predictions
-        rounded = [round(x[0]) for x in predictions]
+
+        self._model = NeuralNet(self._vnum)
+        criterion = nn.BCELoss()
+        optimizer = optim.Adam(self._model.parameters(), lr=0.001)
+
+        # Convert data to PyTorch tensors
+        X_tensor = torch.tensor(self._X, dtype=torch.float32)
+        y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+
+        # Create a dataset and data loader
+        dataset = TensorDataset(X_tensor, y_tensor)
+        dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+
+        # Train the model
+        for epoch in range(self._epochs):
+            for batch_X, batch_y in dataloader:
+                optimizer.zero_grad()
+                outputs = self._model(batch_X)
+                loss = criterion(outputs, batch_y)
+                loss.backward()
+                optimizer.step()
+
+        # Calculate predictions
+        with torch.no_grad():
+            predictions = self._model(torch.tensor(self._X_original, dtype=torch.float32))
+            rounded = [round(x.item()) for x in predictions]
+        # print("Predictions: ", rounded)
+        # Calculate accuracy
+        correct = sum([1 for i in range(len(rounded)) if rounded[i] == self._y_original[i]])
+        total = len(rounded)
+        accuracy = correct / total
+        print(f'Accuracy: {accuracy * 100:.2f}%')
         return rounded
 
     def get_nnet_scores(self):
-        return self._model.evaluate(self._X, self._y)
+        # evaluate the PyTorch model trained in get_nnet_predictions
+        self._model.eval()
+        X_tensor = torch.tensor(self._X, dtype=torch.float32)
+        y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+        dataset = TensorDataset(X_tensor, y_tensor)
+        dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+        correct = 0
+        total = 0
+        with torch.no_grad():
+            for batch_X, batch_y in dataloader:
+                outputs = self._model(batch_X)
+                predicted = (outputs > 0.5).float()
+                total += batch_y.size(0)
+                correct += (predicted == batch_y).sum().item()
+        accuracy = correct / total
+        print(f'Accuracy: {accuracy * 100:.2f}%')
+        return accuracy
 
     def svm_confusion_matrix(self):
         """Generate confusion matrix for SVM
@@ -211,7 +263,6 @@ def get_centroids(self, c=1):
             print("Mean")
             print(self._dataset.iloc[cluster_list, :].mean(axis=0))
 
-
 """
 TODO: This is not working yet.
use the ColumnTransformer instead of categorical_features
diff --git a/src/qrmine/readfiles.py b/src/qrmine/readfiles.py
index a460795..a213ff7 100644
--- a/src/qrmine/readfiles.py
+++ b/src/qrmine/readfiles.py
@@ -1,5 +1,7 @@
 import re
-
+import os
+import requests
+from pypdf import PdfReader
 class ReadData(object):
 
     def __init__(self):
@@ -37,22 +39,10 @@ def append(self, title, document):
         self._documents.append(document)
         self._content += document
 
-    def read_file(self, file_names):
-        if len(file_names) > 1:
-            for file_name in file_names:
-                with open(file_name, 'r') as f:
-                    read_from_file = f.read()
-                    self._content = re.sub('<[^<]+?>', '', read_from_file)
-                    self._documents = re.split('<break>.*?</break>', read_from_file)
-                    # Delete the last blank record
-                    del self._documents[-1]
-                    pattern = r"<break>(.*?)</break>"
-                    _title = re.findall(pattern, read_from_file, flags=re.DOTALL)[0]
-                    self._titles.append(_title)
-                f.close()
-        else:
-            file_name = file_names[0]
-            with open(file_name, 'r') as f:
+    def read_file(self, input):
+        # if input is a file name
+        if isinstance(input, str) and os.path.isfile(input):
+            with open(input, 'r') as f:
                 read_from_file = f.read()
                 self._content = re.sub('<[^<]+?>', '', read_from_file)
                 self._documents = re.split('<break>.*?</break>', read_from_file)
@@ -60,25 +50,50 @@
                 del self._documents[-1]
                 pattern = r"<break>(.*?)</break>"
                 self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL)
+        # if input is a folder name
+        elif isinstance(input, str) and os.path.isdir(input):
+            for file_name in os.listdir(input):
+                if file_name.endswith('.txt'):
+                    with open(os.path.join(input, file_name), 'r') as f:
+                        read_from_file = f.read()
+                        self._content += read_from_file
+                        self._documents.append(read_from_file)
+                        self.titles.append(file_name)
+                if file_name.endswith('.pdf'):
+                    with open(os.path.join(input, file_name), 'rb') as f:
+                        reader = PdfReader(f)
+                        read_from_file = ""
+                        for page in reader.pages:
+                            read_from_file += page.extract_text()
+                        self._content += read_from_file
+                        self._documents.append(read_from_file)
+                        self.titles.append(file_name)
+        # if input is a url
+        elif isinstance(input, str) and input.startswith(("http://", "https://")):
+            response = requests.get(input)
+            if response.status_code == 200:
+                read_from_file = response.text
+                self._content = read_from_file
+                self._documents.append(read_from_file)
+                self.titles.append(input)
+        else:
+            raise ValueError("Input must be a file name, folder name or url.")
 
-        """
-        Combine duplicate topics using Dict
-        Currently supported only for single file.
- """ - - doc_dict = {} - ct3 = 0 - for t in self._titles: - doc = doc_dict.get(t) - if doc: - doc_dict[t] = doc + self._documents[ct3] - else: - doc_dict[t] = self._documents[ct3] - ct3 += 1 - self._titles.clear() - self._documents.clear() - for t in doc_dict.keys(): - self._documents.append(doc_dict.get(t)) - self._titles.append(t) + """ + Combine duplicate topics using Dict + """ - f.close() + doc_dict = {} + ct3 = 0 + for t in self._titles: + doc = doc_dict.get(t) + if doc: + doc_dict[t] = doc + self._documents[ct3] + else: + doc_dict[t] = self._documents[ct3] + ct3 += 1 + self._titles.clear() + self._documents.clear() + for t in doc_dict.keys(): + self._documents.append(doc_dict.get(t)) + self._titles.append(t) diff --git a/src/qrmine/resources/df_dominant_topic.csv b/src/qrmine/resources/df_dominant_topic.csv new file mode 100644 index 0000000..115eb63 --- /dev/null +++ b/src/qrmine/resources/df_dominant_topic.csv @@ -0,0 +1,12 @@ +,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text +0,0,4,0.9903,"., GT, Strauss, ,, coding, +, ), Theory, seminal, (","['ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']" +1,1,1,0.7811,",, theory, ., GT, evaluation, structure, coding, +, ), (","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']" +2,2,1,0.9783,",, theory, ., GT, evaluation, structure, coding, +, ), (","['\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n']" +3,3,3,0.9952,"., ,, coding, category, open, QRMine, datum, researcher, code, GT","['\n', 'open', 'coding', 'step', 
'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n']"
+4,4,4,0.9793,"., GT, Strauss, ,, coding, 
+, ), Theory, seminal, (","['\n', 'ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n']"
+5,5,2,0.9712,"category, comparison, incident, ,, 
+, involve, refine, identify, emergence, constant","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n']"
diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
new file mode 100644
index 0000000..4a7fc25
--- /dev/null
+++ b/src/qrmine/visualize.py
@@ -0,0 +1,390 @@
+"""
+Copyright (C) 2025 Bell Eapen
+
+This file is part of qrmine.
+
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with qrmine. If not, see <https://www.gnu.org/licenses/>.
+""" + +from collections import Counter + +import matplotlib.colors as mcolors +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from matplotlib.patches import Rectangle +from matplotlib.ticker import FuncFormatter +from sklearn.manifold import TSNE +from wordcloud import STOPWORDS, WordCloud + + +class QRVisualize: + def __init__(self, data: pd.DataFrame = None): + """ + Initialize the QRVisualize class with a DataFrame. + + Parameters: + data (pd.DataFrame): The DataFrame containing the data to visualize. + """ + self.data = data + + def plot_frequency_distribution_of_words(self, df=None, folder_path=None): + if df is None: + df = self.data + doc_lens = [len(d) for d in df.Text] + + # Plot + plt.figure(figsize=(16, 7), dpi=160) + plt.hist(doc_lens, bins=1000, color="navy") + plt.text(750, 100, "Mean : " + str(round(np.mean(doc_lens)))) + plt.text(750, 90, "Median : " + str(round(np.median(doc_lens)))) + plt.text(750, 80, "Stdev : " + str(round(np.std(doc_lens)))) + plt.text(750, 70, "1%ile : " + str(round(np.quantile(doc_lens, q=0.01)))) + plt.text(750, 60, "99%ile : " + str(round(np.quantile(doc_lens, q=0.99)))) + + plt.gca().set( + xlim=(0, 1000), ylabel="Number of Documents", xlabel="Document Word Count" + ) + plt.tick_params(size=16) + plt.xticks(np.linspace(0, 1000, 9)) + plt.title("Distribution of Document Word Counts", fontdict=dict(size=22)) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def plot_distribution_by_topic(self, df=None, folder_path=None): + if df is None: + df = self.data + # Plot + cols = [ + color for name, color in mcolors.TABLEAU_COLORS.items() + ] # more colors: 'mcolors.XKCD_COLORS' + + fig, axes = plt.subplots( + 2, 2, figsize=(16, 14), dpi=160, sharex=True, sharey=True + ) + + for i, ax in enumerate(axes.flatten()): + df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :] + doc_lens = [len(d) for d in df_dominant_topic_sub.Text] + ax.hist(doc_lens, bins=1000, color=cols[i]) + ax.tick_params(axis="y", labelcolor=cols[i], color=cols[i]) + sns.kdeplot(doc_lens, color="black", shade=False, ax=ax.twinx()) + ax.set(xlim=(0, 1000), xlabel="Document Word Count") + ax.set_ylabel("Number of Documents", color=cols[i]) + ax.set_title("Topic: " + str(i), fontdict=dict(size=16, color=cols[i])) + + fig.tight_layout() + fig.subplots_adjust(top=0.90) + plt.xticks(np.linspace(0, 1000, 9)) + fig.suptitle( + "Distribution of Document Word Counts by Dominant Topic", fontsize=22 + ) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def plot_wordcloud(self, topics=None, folder_path=None): + cols = [ + color for name, color in mcolors.TABLEAU_COLORS.items() + ] # more colors: 'mcolors.XKCD_COLORS' + + cloud = WordCloud( + stopwords=STOPWORDS, + background_color="white", + width=250, + height=180, + max_words=5, + colormap="tab10", + color_func=lambda *args, **kwargs: cols[i], + prefer_horizontal=1.0, + ) + + fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True) + + for i, ax in enumerate(axes.flatten()): + fig.add_subplot(ax) + topic_words = dict(topics[i][1]) + cloud.generate_from_frequencies(topic_words, max_font_size=300) + plt.gca().imshow(cloud) + plt.gca().set_title("Topic " + str(i), fontdict=dict(size=16)) + plt.gca().axis("off") + + plt.subplots_adjust(wspace=0, hspace=0) + plt.axis("off") + plt.margins(x=0, y=0) + plt.tight_layout() + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def 
plot_importance(self, topics=None, processed_docs=None, folder_path=None): + data_flat = [w for w_list in processed_docs for w in w_list] + counter = Counter(data_flat) + + out = [] + for i, topic in topics: + for word, weight in topic: + out.append([word, i, weight, counter[word]]) + + df = pd.DataFrame(out, columns=["word", "topic_id", "importance", "word_count"]) + + # Plot Word Count and Weights of Topic Keywords + fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160) + cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] + for i, ax in enumerate(axes.flatten()): + ax.bar( + x="word", + height="word_count", + data=df.loc[df.topic_id == i, :], + color=cols[i], + width=0.5, + alpha=0.3, + label="Word Count", + ) + ax_twin = ax.twinx() + ax_twin.bar( + x="word", + height="importance", + data=df.loc[df.topic_id == i, :], + color=cols[i], + width=0.2, + label="Weights", + ) + ax.set_ylabel("Word Count", color=cols[i]) + ax_twin.set_ylim(0, 0.030) + ax.set_ylim(0, 3500) + ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16) + ax.tick_params(axis="y", left=False) + ax.set_xticklabels( + df.loc[df.topic_id == i, "word"], + rotation=30, + horizontalalignment="right", + ) + ax.legend(loc="upper left") + ax_twin.legend(loc="upper right") + + fig.tight_layout(w_pad=2) + fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13): + corp = corpus[start:end] + mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()] + + fig, axes = plt.subplots( + end - start, 1, figsize=(20, (end - start) * 0.95), dpi=160 + ) + axes[0].axis("off") + for i, ax in enumerate(axes): + if i > 0: + corp_cur = corp[i - 1] + topic_percs, wordid_topics, wordid_phivalues = lda_model[corp_cur] + word_dominanttopic = [ + (lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics + ] + ax.text( + 0.01, + 0.5, + "Doc " + str(i - 1) + ": ", + verticalalignment="center", + fontsize=16, + color="black", + transform=ax.transAxes, + fontweight=700, + ) + + # Draw Rectange + topic_percs_sorted = sorted( + topic_percs, key=lambda x: (x[1]), reverse=True + ) + ax.add_patch( + Rectangle( + (0.0, 0.05), + 0.99, + 0.90, + fill=None, + alpha=1, + color=mycolors[topic_percs_sorted[0][0]], + linewidth=2, + ) + ) + + word_pos = 0.06 + for j, (word, topics) in enumerate(word_dominanttopic): + if j < 14: + ax.text( + word_pos, + 0.5, + word, + horizontalalignment="left", + verticalalignment="center", + fontsize=16, + color=mycolors[topics], + transform=ax.transAxes, + fontweight=700, + ) + word_pos += 0.009 * len( + word + ) # to move the word for the next iter + ax.axis("off") + ax.text( + word_pos, + 0.5, + ". . 
.", + horizontalalignment="left", + verticalalignment="center", + fontsize=16, + color="black", + transform=ax.transAxes, + ) + + plt.subplots_adjust(wspace=0, hspace=0) + plt.suptitle( + "Sentence Topic Coloring for Documents: " + + str(start) + + " to " + + str(end - 2), + fontsize=22, + y=0.95, + fontweight=700, + ) + plt.tight_layout() + plt.show() + + def cluster_chart(self, lda_model=None, corpus=None, n_topics=4, folder_path=None): + # Get topic weights + topic_weights = [] + for i, row_list in enumerate(lda_model[corpus]): + topic_weights.append([w for i, w in row_list[0]]) + + # Array of topic weights + arr = pd.DataFrame(topic_weights).fillna(0).values + + # Keep the well separated points (optional) + arr = arr[np.amax(arr, axis=1) > 0.35] + + # Dominant topic number in each doc + topic_num = np.argmax(arr, axis=1) + + # tSNE Dimension Reduction + tsne_model = TSNE( + n_components=2, verbose=1, random_state=0, angle=0.99, init="pca" + ) + tsne_lda = tsne_model.fit_transform(arr) + + # Plot + plt.figure(figsize=(16, 10), dpi=160) + for i in range(n_topics): + plt.scatter( + tsne_lda[topic_num == i, 0], + tsne_lda[topic_num == i, 1], + label=str(i), + alpha=0.5, + ) + plt.title("t-SNE Clustering of Topics", fontsize=22) + plt.xlabel("t-SNE Dimension 1", fontsize=16) + plt.ylabel("t-SNE Dimension 2", fontsize=16) + plt.legend(title="Topic Number", loc="upper right") + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def most_discussed_topics( + self, lda_model, dominant_topics, topic_percentages, folder_path=None + ): + + # Distribution of Dominant Topics in Each Document + df = pd.DataFrame(dominant_topics, columns=["Document_Id", "Dominant_Topic"]) + dominant_topic_in_each_doc = df.groupby("Dominant_Topic").size() + df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame( + name="count" + ).reset_index() + + # Total Topic Distribution by actual weight + topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages]) + df_topic_weightage_by_doc = ( + topic_weightage_by_doc.sum().to_frame(name="count").reset_index() + ) + + # Top 3 Keywords for each Topic + topic_top3words = [ + (i, topic) + for i, topics in lda_model.show_topics(formatted=False) + for j, (topic, wt) in enumerate(topics) + if j < 3 + ] + + df_top3words_stacked = pd.DataFrame( + topic_top3words, columns=["topic_id", "words"] + ) + df_top3words = df_top3words_stacked.groupby("topic_id").agg(", \n".join) + df_top3words.reset_index(level=0, inplace=True) + + # Plot + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True) + + # Topic Distribution by Dominant Topics + ax1.bar( + x="Dominant_Topic", + height="count", + data=df_dominant_topic_in_each_doc, + width=0.5, + color="firebrick", + ) + ax1.set_xticks( + range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__()) + ) + tick_formatter = FuncFormatter( + lambda x, pos: "Topic " + + str(x) + + "\n" + + df_top3words.loc[df_top3words.topic_id == x, "words"].values[0] + ) + ax1.xaxis.set_major_formatter(tick_formatter) + ax1.set_title("Number of Documents by Dominant Topic", fontdict=dict(size=10)) + ax1.set_ylabel("Number of Documents") + ax1.set_ylim(0, 1000) + + # Topic Distribution by Topic Weights + ax2.bar( + x="index", + height="count", + data=df_topic_weightage_by_doc, + width=0.5, + color="steelblue", + ) + ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__())) + ax2.xaxis.set_major_formatter(tick_formatter) + ax2.set_title("Number of Documents by Topic 
Weightage", fontdict=dict(size=10)) + + plt.show() + + # save + if folder_path: + plt.savefig(folder_path) + plt.close() diff --git a/test.py b/test.py new file mode 100644 index 0000000..a5c4b31 --- /dev/null +++ b/test.py @@ -0,0 +1,33 @@ +import spacy + +# Load spaCy model +nlp = spacy.load("en_core_web_sm") + +# Sample documents +documents = [ + "Natural language processing is a field of AI.", + "Topic modeling helps in uncovering the main themes in a collection of documents.", + "Semantic clustering groups similar documents together based on meaning.", + "SpaCy is a popular NLP library.", + "Gensim is commonly used for topic modeling.", +] + + +# Preprocess the documents using spaCy +def preprocess(doc): + # Tokenize and preprocess each document + doc = nlp(doc) + print(f"Original Document: {doc}") + # Lemmatize and remove stop words + tokens = [token.lemma_ for token in doc if not token.is_stop] + print(f"Processed Tokens: {tokens}") + return tokens + + +# Apply preprocessing to each document +processed_docs = [preprocess(doc) for doc in documents] + + +# Print the processed documents +for i, doc in enumerate(processed_docs): + print(f"Document {i + 1}: {doc}") \ No newline at end of file diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 4ad331d..6c922a5 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -1,32 +1,45 @@ import pytest - @pytest.fixture def corpus_fixture(): from pkg_resources import resource_filename from src.qrmine import ReadData + corpus = ReadData() - file_path = resource_filename('src.qrmine.resources', 'interview.txt') - corpus.read_file([file_path]) + file_path = resource_filename("src.qrmine.resources", "interview.txt") + corpus.read_file(file_path) return corpus + # instannce of Qrmine as fixture @pytest.fixture def q(): from src.qrmine import Qrmine + _q = Qrmine() return _q + +@pytest.fixture +def cluster(): + from src.qrmine import ClusterDocs + + _cluster = ClusterDocs() + return _cluster + + # Ref: https://docs.pytest.org/en/latest/capture.html def test_generate_dict(corpus_fixture, capsys, q): from src.qrmine import Content + num = 10 all_interviews = Content(corpus_fixture.content) q.print_dict(all_interviews, num) captured = capsys.readouterr() print(captured.out) - assert 'code' in captured.out + assert "code" in captured.out + def test_generate_topics(corpus_fixture, capsys, q): q.content = corpus_fixture @@ -34,22 +47,53 @@ def test_generate_topics(corpus_fixture, capsys, q): q.print_topics() captured = capsys.readouterr() print(captured.out) - assert 'TOPIC' in captured.out + assert "TOPIC" in captured.out + def test_category_basket(corpus_fixture, capsys, q): q.content = corpus_fixture print(q.category_basket()) captured = capsys.readouterr() print(captured.out) - assert 'theory' in captured.out + assert "theory" in captured.out + def test_category_association(corpus_fixture, capsys, q): q.content = corpus_fixture print(q.category_association()) captured = capsys.readouterr() print(captured.out) - assert 'theory' in captured.out + assert "theory" in captured.out + +def test_cluster_topics(corpus_fixture, capsys, cluster): + cluster.documents = corpus_fixture.documents + cluster.titles = corpus_fixture.titles + cluster.print_clusters() + captured = capsys.readouterr() + print(captured.out) + assert "Document" in captured.out + cluster.print_topics() + captured = capsys.readouterr() + print(captured.out) + assert "topic" in captured.out + print(cluster.build_lda_model()) + print(cluster.topics_per_document()) + # Format + 
df_dominant_topic = cluster.format_topics_sentences() + # Format the output + df_dominant_topic.columns = [ + "Document_No", + "Dominant_Topic", + "Topic_Perc_Contrib", + "Keywords", + "Text", + ] + print(df_dominant_topic.head(10)) + assert "Document_No" in df_dominant_topic.columns + df_sorted = cluster.most_representative_docs() + print(df_sorted.head(10)) + assert "Dominant_Topic" in df_sorted.columns diff --git a/tests/test_num.py b/tests/test_num.py index f0c53cd..ac7a139 100644 --- a/tests/test_num.py +++ b/tests/test_num.py @@ -9,7 +9,7 @@ def ml_fixture(): ml = MLQRMine() file_path = resource_filename('src.qrmine.resources', 'numeric.csv') ml.csvfile = file_path - return ml + return ml @@ -19,7 +19,7 @@ def test_nn(ml_fixture, capsys): ml_fixture.prepare_data(True) ml_fixture.get_nnet_predictions() captured = capsys.readouterr() - assert 'accuracy' in captured.out + assert 'Accuracy' in captured.out def test_svm(ml_fixture, capsys): ml_fixture.prepare_data(True) diff --git a/tests/test_readfiles.py b/tests/test_readfiles.py index aff3a5d..963ed90 100644 --- a/tests/test_readfiles.py +++ b/tests/test_readfiles.py @@ -8,8 +8,8 @@ def corpus_fixture(): from src.qrmine import ReadData corpus = ReadData() file_path = resource_filename('src.qrmine.resources', 'interview.txt') - corpus.read_file([file_path]) - return corpus + corpus.read_file(file_path) + return corpus def test_content(corpus_fixture): diff --git a/tests/test_visualize.py b/tests/test_visualize.py new file mode 100644 index 0000000..41f7145 --- /dev/null +++ b/tests/test_visualize.py @@ -0,0 +1,114 @@ +import pytest +import pandas as pd +from src.qrmine.visualize import QRVisualize + + +@pytest.fixture +def v(): + from pkg_resources import resource_filename + + file_path = resource_filename("src.qrmine.resources", "df_dominant_topic.csv") + data = pd.read_csv(file_path) + _v = QRVisualize(data) + return _v + + +@pytest.fixture +def topics(): + return [ + ( + 0, + [ + (".", 0.095292516), + (",", 0.053392828), + ("category", 0.032462463), + ("coding", 0.032456465), + ("open", 0.032437164), + ("QRMine", 0.03243305), + ("datum", 0.021980358), + ("researcher", 0.021978099), + ("theory", 0.011536299), + ("GT", 0.011533132), + ], + ), + ( + 1, + [ + (".", 0.007783216), + (",", 0.007773952), + ("open", 0.007728422), + ("researcher", 0.0077227736), + ("coding", 0.007722049), + ("category", 0.007721938), + ("datum", 0.007717547), + ("QRMine", 0.007716193), + ("dissect", 0.0077070068), + ("support", 0.0077060354), + ], + ), + ( + 2, + [ + (",", 0.05126711), + (".", 0.05125151), + ("theory", 0.038604487), + ("category", 0.03227912), + ("GT", 0.032278605), + ("\n", 0.029119665), + ("comparison", 0.025947908), + ("coding", 0.025941858), + ("incident", 0.019622542), + (")", 0.019619444), + ], + ), + ( + 3, + [ + (".", 0.007849805), + (",", 0.007837688), + ("theory", 0.00781459), + ("coding", 0.0078089647), + ("category", 0.0077514737), + ("GT", 0.0077493717), + ("datum", 0.007742789), + ("open", 0.0077355755), + ("\n", 0.0077245855), + ("researcher", 0.0077191954), + ], + ), + ( + 4, + [ + (",", 0.007834569), + (".", 0.007812336), + ("coding", 0.0077863215), + ("category", 0.007759207), + ("theory", 0.0077459146), + ("GT", 0.0077370973), + ("code", 0.0077265715), + ("datum", 0.007720947), + ("open", 0.007720898), + ("comparison", 0.007720567), + ], + ), + ] + + +def test_frequency_distribution_of_words(v, capsys): + v.plot_frequency_distribution_of_words( + v.data + ) + captured = capsys.readouterr() + print(captured.out) + + +def 
test_distribution_by_topic(v, capsys): + v.plot_distribution_by_topic(v.data) + captured = capsys.readouterr() + print(captured.out) + + +def test_plot_wordcloud(v, topics, capsys): + v.plot_wordcloud(topics) + captured = capsys.readouterr() + print(captured.out) diff --git a/tox.ini b/tox.ini index 3eb707d..dbb293d 100644 --- a/tox.ini +++ b/tox.ini @@ -8,9 +8,6 @@ envlist = py311, integration [testenv] setenv = TOXINIDIR = {toxinidir} -deps = - -rrequirements.txt - -rdev-requirements.txt commands = python -m spacy download en_core_web_sm py.test {posargs} @@ -20,9 +17,6 @@ extras = [testenv:integration] setenv = TOXINIDIR = {toxinidir} -deps = - -rrequirements.txt - -rdev-requirements.txt commands = python -m spacy download en_core_web_sm python qrminer.py \ No newline at end of file
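
Reviewer notes — a few usage sketches for the APIs this changeset touches. Everything below is illustrative rather than part of the diff: the file names, URLs, and sample strings are hypothetical.

`ReadData.read_file` now takes a single argument instead of a list of file names. With the `os.path.isfile` / `os.path.isdir` / URL-prefix dispatch above, it covers three input shapes:

```python
from qrmine import ReadData

# 1. A single transcript, sections delimited by <break>Title</break> tags
corpus = ReadData()
corpus.read_file("interview.txt")  # hypothetical path

# 2. A folder: each .txt or .pdf inside becomes one document,
#    titled with its file name (PDF text is extracted page by page via pypdf)
corpus = ReadData()
corpus.read_file("transcripts/")  # hypothetical folder

# 3. A URL: the fetched page body becomes a single document
corpus = ReadData()
corpus.read_file("https://example.com/interview.txt")

print(corpus.titles)
print(len(corpus.documents))
```

A fresh `ReadData` is used per call because `_content` and `_documents` accumulate across sources.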
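A minimal sketch of the new `ClusterDocs` wrapper in `src/qrmine/cluster.py`, assuming `en_core_web_sm` is installed (the constructor loads it unconditionally); the documents here are toy strings:

```python
from qrmine import ClusterDocs

docs = [
    "Open coding breaks data into conceptually similar chunks.",
    "Constant comparison refines categories until a theory emerges.",
    "Grounded theory emerged from medical sociology.",
]
cluster = ClusterDocs(docs, titles=["d1", "d2", "d3"])
cluster.num_topics = 2  # set before building; defaults to 5
cluster.passes = 10     # defaults to 15

topics = cluster.build_lda_model()      # show_topics(formatted=False) pairs
cluster.print_topics(num_words=5)       # pretty-printed topic keywords
cluster.print_clusters()                # per-document topic distribution

df = cluster.format_topics_sentences()  # one row per document
df.columns = ["Document_No", "Dominant_Topic", "Topic_Perc_Contrib", "Keywords", "Text"]
print(cluster.most_representative_docs())
```

The column rename mirrors `tests/test_nlp.py`, and the bundled `src/qrmine/resources/df_dominant_topic.csv` looks like saved output of this same pipeline. Note that `num_topics` and `passes` only take effect before the first `build_lda_model()` call, since the model is cached.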
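`MLQRMine` drops Keras; `get_nnet_predictions` now trains the small PyTorch `NeuralNet` defined at the top of `mlqrmine.py`. Call order matters, because `get_nnet_scores` evaluates the model that `get_nnet_predictions` trained. A sketch against a hypothetical CSV whose last column is a binary label (like `resources/numeric.csv` in the tests):

```python
from qrmine import MLQRMine

ml = MLQRMine()
ml.csvfile = "numeric.csv"        # hypothetical path
ml.prepare_data(oversample=True)  # read the CSV and oversample the minority class

rounded = ml.get_nnet_predictions()  # trains the net, prints "Accuracy: ..."
ml.get_nnet_scores()                 # re-evaluates the trained net on X/y
```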
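`QRVisualize` methods plot `self.data` by default and save the figure when a `folder_path` is given. A sketch driven by the CSV resource added in this PR; the `pkg_resources` accessor mirrors the existing tests, though it is deprecated in newer setuptools:

```python
import pandas as pd
from pkg_resources import resource_filename
from qrmine import QRVisualize

file_path = resource_filename("qrmine.resources", "df_dominant_topic.csv")
v = QRVisualize(pd.read_csv(file_path))

# Histogram of document word counts across the corpus
v.plot_frequency_distribution_of_words()

# Word-count histograms split by dominant topic (expects a Dominant_Topic column)
v.plot_distribution_by_topic()

# Word clouds and keyword-importance plots need LDA topics, e.g.:
# v.plot_wordcloud(cluster.build_lda_model())  # needs at least 4 topics (2x2 grid)
```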