diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index f742724..3693dc2 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -13,27 +13,27 @@ jobs:
strategy:
max-parallel: 4
matrix:
- python-version: ["3.11"]
os: [ubuntu-latest, macos-13, windows-latest]
runs-on: ${{ matrix.os }}
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v4
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
with:
- python-version: ${{ matrix.python-version }}
- cache: 'pip' # caching pip dependencies
+ enable-cache: true
+ - name: "Set up Python"
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: "pyproject.toml"
- name: run on mac
if: startsWith(matrix.os, 'mac')
run: |
brew install libomp
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install -r requirements.txt
- python -m spacy download en_core_web_sm
- - name: Test with pytest
+ - name: Install the project
run: |
- pip install pytest
- pytest
+ uv sync --all-extras --dev
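+        # uv-managed venvs omit pip, and `spacy download` shells out to pip, hence the explicit install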
+ uv pip install pip
+ uv run python -m spacy download en_core_web_sm
+ - name: Run tests
+ run: uv run pytest tests
diff --git a/.gitignore b/.gitignore
index 64049e7..c29a2a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ __pycache__/*
.idea
.venv
conda
+uv.lock
# Package files
*.egg
diff --git a/dev-requirements.in b/dev-requirements.in
deleted file mode 100644
index 2b56355..0000000
--- a/dev-requirements.in
+++ /dev/null
@@ -1,11 +0,0 @@
-# dev-requirements.in
--c requirements.txt
-pytest-cov
-pytest
-recommonmark
-sphinx>=3.2.1
-setuptools
-setuptools_scm
-wheel>=0.37.0 # conflicts with dependency of tensorflow
-tox
-pip-tools
\ No newline at end of file
diff --git a/dev-requirements.txt b/dev-requirements.txt
deleted file mode 100644
index f36f95c..0000000
--- a/dev-requirements.txt
+++ /dev/null
@@ -1,146 +0,0 @@
-# This file was autogenerated by uv via the following command:
-# uv pip compile dev-requirements.in -o dev-requirements.txt --universal
-alabaster==1.0.0
- # via sphinx
-babel==2.16.0
- # via sphinx
-build==1.2.2.post1
- # via pip-tools
-cachetools==5.5.0
- # via
- # -c requirements.txt
- # tox
-certifi==2024.8.30
- # via
- # -c requirements.txt
- # requests
-chardet==5.2.0
- # via tox
-charset-normalizer==3.4.0
- # via
- # -c requirements.txt
- # requests
-click==8.1.7
- # via
- # -c requirements.txt
- # pip-tools
-colorama==0.4.6
- # via
- # -c requirements.txt
- # build
- # click
- # pytest
- # sphinx
- # tox
-commonmark==0.9.1
- # via recommonmark
-coverage==7.6.4
- # via pytest-cov
-distlib==0.3.9
- # via virtualenv
-docutils==0.21.2
- # via
- # recommonmark
- # sphinx
-filelock==3.16.1
- # via
- # tox
- # virtualenv
-idna==3.10
- # via
- # -c requirements.txt
- # requests
-imagesize==1.4.1
- # via sphinx
-iniconfig==2.0.0
- # via pytest
-jinja2==3.1.4
- # via
- # -c requirements.txt
- # sphinx
-markupsafe==3.0.2
- # via
- # -c requirements.txt
- # jinja2
-packaging==24.2
- # via
- # -c requirements.txt
- # build
- # pyproject-api
- # pytest
- # setuptools-scm
- # sphinx
- # tox
-pip==24.3.1
- # via pip-tools
-pip-tools==7.4.1
- # via -r dev-requirements.in
-platformdirs==4.3.6
- # via
- # tox
- # virtualenv
-pluggy==1.5.0
- # via
- # pytest
- # tox
-pygments==2.18.0
- # via
- # -c requirements.txt
- # sphinx
-pyproject-api==1.8.0
- # via tox
-pyproject-hooks==1.2.0
- # via
- # build
- # pip-tools
-pytest==8.3.3
- # via
- # -r dev-requirements.in
- # pytest-cov
-pytest-cov==6.0.0
- # via -r dev-requirements.in
-recommonmark==0.7.1
- # via -r dev-requirements.in
-requests==2.32.3
- # via
- # -c requirements.txt
- # sphinx
-setuptools==75.3.0
- # via
- # -c requirements.txt
- # -r dev-requirements.in
- # pip-tools
- # setuptools-scm
-setuptools-scm==8.1.0
- # via -r dev-requirements.in
-snowballstemmer==2.2.0
- # via sphinx
-sphinx==8.1.3
- # via
- # -r dev-requirements.in
- # recommonmark
-sphinxcontrib-applehelp==2.0.0
- # via sphinx
-sphinxcontrib-devhelp==2.0.0
- # via sphinx
-sphinxcontrib-htmlhelp==2.1.0
- # via sphinx
-sphinxcontrib-jsmath==1.0.1
- # via sphinx
-sphinxcontrib-qthelp==2.0.0
- # via sphinx
-sphinxcontrib-serializinghtml==2.0.0
- # via sphinx
-tox==4.23.2
- # via -r dev-requirements.in
-urllib3==2.2.3
- # via
- # -c requirements.txt
- # requests
-virtualenv==20.27.1
- # via tox
-wheel==0.45.0
- # via
- # -c requirements.txt
- # -r dev-requirements.in
- # pip-tools
diff --git a/notes/conda.md b/notes/conda.md
new file mode 100644
index 0000000..79eb6c8
--- /dev/null
+++ b/notes/conda.md
@@ -0,0 +1,12 @@
+conda create --name qrmine python=3.11
+conda activate qrmine
+
+conda install conda-forge::uv
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
+uv pip install -e .
+python -m spacy download en_core_web_sm
+
+
+
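+# CPU-only torch wheel, to avoid pulling the much larger CUDA builds into the env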
+pip3 install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
\ No newline at end of file
diff --git a/notes/new-process.md b/notes/new-process.md
new file mode 100644
index 0000000..1ead749
--- /dev/null
+++ b/notes/new-process.md
@@ -0,0 +1,34 @@
+conda install conda-forge::uv
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
+
+delete setup.cfg
+delete requirements.txt, dev-requirements.txt, dev-requirements.in
+remove deps from tox.ini
+
+uv pip install -e .
+see pr.yml for the GitHub Actions workflow
+see pyproject.toml for the pytorch cpu install configuration
+
+uv sync --all-extras --dev
+uv pip install pip
+uv run python -m spacy download en_core_web_sm
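+# `uv pip install pip` is needed because uv venvs omit pip and `spacy download` invokes it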
+
+pyproject.toml
+requires = ["setuptools>=61.2", "wheel", "pip"]
+
+dev = [
+ "setuptools",
+ "setuptools_scm",
+ "pytest",
+ "pytest-cov",
+ "tox",
+ "black",
+ "recommonmark",
+ "sphinx",
+ "wheel",
+ "twine",
+ "tox",
+]
+
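+# sanity check after the migration (qrmine is the click entry point from [project.scripts]; click adds --help):
+uv run qrmine --help
+uv run pytest tests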
diff --git a/notes/pip-tools.md b/notes/pip-tools.md
index da4baa4..c504a1e 100644
--- a/notes/pip-tools.md
+++ b/notes/pip-tools.md
@@ -21,4 +21,7 @@ OR
* pip install uv
* uv pip compile setup.cfg -o requirements.txt --universal
-* uv pip compile dev-requirements.in -o dev-requirements.txt --universal
\ No newline at end of file
+* uv pip compile dev-requirements.in -o dev-requirements.txt --universal
+
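+ini2toml translates the setup.cfg [metadata]/[options] sections into PEP 621 [project] tables; review the generated pyproject.toml by hand: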
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 89a5bed..9fc3688 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,187 @@
[build-system]
-# AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD!
-requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"]
+requires = ["setuptools>=61.2", "wheel", "pip"]
build-backend = "setuptools.build_meta"
-[tool.setuptools_scm]
-# For smarter version schemes and other configuration options,
-# check out https://github.com/pypa/setuptools_scm
-version_scheme = "no-guess-dev"
+[tool.setuptools_scm]
+# version is listed as dynamic under [project], so setuptools_scm must stay configured
+version_scheme = "no-guess-dev"
+
+[project]
+name = "qrmine"
+description = "Qualitative Research support tools in Python!"
+authors = [{name = "beapen", email = "github@gulfdoctor.net"}]
+license = {text = "GPL-3.0-only"}
+# license_files = LICENSE.txt
+# long_description = file: README.rst
+# long_description_content_type = text/x-rst; charset=UTF-8
+classifiers = [
+ "Intended Audience :: Science/Research",
+ "Development Status :: 4 - Beta",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3.11",
+ "Topic :: Scientific/Engineering :: Information Analysis",
+]
+requires-python = ">=3.11, <3.12"
+dependencies = [
+ 'importlib-metadata; python_version<"3.8"',
+ "pandas",
+ "matplotlib",
+ "click",
+ "scikit-learn",
+ "imbalanced-learn",
+ "vaderSentiment",
+ "xgboost",
+ "mlxtend",
+ "spacy",
+ "textacy",
+ "torch==2.2.2",
+ "pypdf",
+ "requests",
+ "gensim",
+ "seaborn",
+ "wordcloud",
+]
+dynamic = ["version"]
+
+[project.readme]
+file = "README.md"
+content-type = "text/markdown"
+# Add here related links, for example:
+
+[project.urls]
+Homepage = "https://github.com/dermatologist/nlp-qrmine"
+Documentation = "https://arxiv.org/abs/2003.13519"
+# Source = https://github.com/pyscaffold/pyscaffold/
+# Changelog = https://pyscaffold.org/en/latest/changelog.html
+# Tracker = https://github.com/pyscaffold/pyscaffold/issues
+# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
+# Download = https://pypi.org/project/PyScaffold/#files
+# Twitter = https://twitter.com/PyScaffold
+# Change if running only on Windows, Mac or Linux (comma-separated)
+# Add here all kinds of additional classifiers as defined under
+# https://pypi.org/classifiers/
+
+[project.optional-dependencies]
+# Add here additional requirements for extra features, to install with:
+# `pip install qrmine[PDF]` like:
+# PDF = ReportLab; RXP
+# Add here test requirements (semicolon/line-separated)
+testing = [
+ "setuptools",
+ "pytest",
+ "pytest-cov",
+]
+
+dev = [
+ "setuptools",
+ "setuptools_scm",
+ "pytest",
+ "pytest-cov",
+ "tox",
+ "black",
+ "recommonmark",
+ "sphinx",
+ "wheel",
+ "twine",
+ "tox",
+]
+
+[project.entry-points]
+# Add here console scripts like:
+# console_scripts =
+# script_name = qrmine.module:function
+# For example:
+# console_scripts =
+# fibonacci = qrmine.skeleton:run
+# And any other entry points, for example:
+# pyscaffold.cli =
+# awesome = pyscaffoldext.awesome.extension:AwesomeExtension
+
+[project.scripts]
+qrmine = "qrmine.main:main_routine"
+
+[tool.setuptools]
+zip-safe = false
+include-package-data = true
+package-dir = {"" = "src"}
+# Require a min/specific Python version (comma-separated conditions)
+# python_requires = >=3.8
+# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
+# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
+# new major versions. This works if the required packages follow Semantic Versioning.
+# For more information, check out https://semver.org/.
+platforms = ["any"]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+exclude = ["tests"]
+namespaces = true
+
+[tool.pytest.ini_options]
+# Specify command line options as you would do when invoking pytest directly.
+# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
+# in order to write a coverage file that can be read by Jenkins.
+# CAUTION: --cov flags may prohibit setting breakpoints while debugging.
+# Comment those flags to avoid this pytest issue.
+addopts = """
+--verbose"""
+norecursedirs = [
+ "dist",
+ "build",
+ ".tox",
+]
+testpaths = ["tests"]
+# Use pytest markers to select/deselect specific tests
+# markers =
+#     slow: mark tests as slow (deselect with '-m "not slow"')
+#     system: mark end-to-end system tests
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[tool.uv.sources]
+torch = [
+ { index = "pytorch-cpu" },
+]
+torchvision = [
+ { index = "pytorch-cpu" },
+]
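+# `explicit = true` keeps this index out of general resolution: only the packages
+# pinned to it above via [tool.uv.sources] are fetched from the CPU-only PyTorch index.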
+
+[tool.aliases]
+release = "sdist bdist_wheel upload"
+
+[tool.distutils.bdist_wheel]
+# Use this option if your package is pure-python
+universal = 1
+
+[tool.build_sphinx]
+source_dir = "docs"
+build_dir = "docs/_build"
+
+[tool.devpi.upload]
+# Options for the devpi: PyPI server and packaging tool
+# VCS export must be deactivated since we are using setuptools-scm
+no_vcs = "1"
+formats = "bdist_wheel"
+
+[tool.flake8]
+# Some sane defaults for the code style checker flake8
+max_line_length = "88"
+extend_ignore = "E203, W503"
+# ^ Black-compatible
+# E203 and W503 have edge cases handled by black
+exclude = """
+.tox
+build
+dist
+.eggs
+docs/conf.py"""
+
+[tool.pyscaffold]
+# PyScaffold's parameters when the project was created.
+# This will be used when updating. Do not change!
+version = "4.6"
+package = "qrmine"
+# This file is used to configure your project.
+# Read more about the various options under:
+# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html
+# https://setuptools.pypa.io/en/latest/references/keywords.html
+
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 260d413..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,314 +0,0 @@
-# This file was autogenerated by uv via the following command:
-# uv pip compile setup.cfg -o requirements.txt --universal
-absl-py==2.1.0
- # via
- # tensorboard
- # tensorflow
-astunparse==1.6.3
- # via tensorflow
-blis==0.7.11
- # via thinc
-cachetools==5.5.0
- # via
- # google-auth
- # textacy
-catalogue==2.0.10
- # via
- # spacy
- # srsly
- # textacy
- # thinc
-certifi==2024.8.30
- # via requests
-charset-normalizer==3.4.0
- # via requests
-click==8.1.7
- # via
- # qrmine (setup.cfg)
- # typer
-cloudpathlib==0.20.0
- # via weasel
-colorama==0.4.6 ; sys_platform == 'win32' or platform_system == 'Windows'
- # via
- # click
- # tqdm
- # wasabi
-confection==0.1.5
- # via
- # thinc
- # weasel
-contourpy==1.3.0
- # via matplotlib
-cycler==0.12.1
- # via matplotlib
-cymem==2.0.8
- # via
- # preshed
- # spacy
- # thinc
-cytoolz==1.0.0
- # via textacy
-flatbuffers==24.3.25
- # via tensorflow
-floret==0.10.5
- # via textacy
-fonttools==4.54.1
- # via matplotlib
-gast==0.4.0
- # via tensorflow
-google-auth==2.36.0
- # via
- # google-auth-oauthlib
- # tensorboard
-google-auth-oauthlib==1.0.0
- # via tensorboard
-google-pasta==0.2.0
- # via tensorflow
-grpcio==1.67.1
- # via
- # tensorboard
- # tensorflow
-h5py==3.12.1
- # via tensorflow
-idna==3.10
- # via requests
-imbalanced-learn==0.12.4
- # via qrmine (setup.cfg)
-jellyfish==1.1.0
- # via textacy
-jinja2==3.1.6
- # via spacy
-joblib==1.4.2
- # via
- # imbalanced-learn
- # mlxtend
- # scikit-learn
- # textacy
-keras==2.13.1
- # via tensorflow
-kiwisolver==1.4.7
- # via matplotlib
-langcodes==3.4.1
- # via spacy
-language-data==1.2.0
- # via langcodes
-libclang==18.1.1
- # via tensorflow
-marisa-trie==1.2.1
- # via language-data
-markdown==3.7
- # via tensorboard
-markdown-it-py==3.0.0
- # via rich
-markupsafe==3.0.2
- # via
- # jinja2
- # werkzeug
-matplotlib==3.9.2
- # via
- # qrmine (setup.cfg)
- # mlxtend
-mdurl==0.1.2
- # via markdown-it-py
-mlxtend==0.23.2
- # via qrmine (setup.cfg)
-murmurhash==1.0.10
- # via
- # preshed
- # spacy
- # thinc
-networkx==3.4.2
- # via textacy
-numpy==1.24.3
- # via
- # blis
- # contourpy
- # floret
- # h5py
- # imbalanced-learn
- # matplotlib
- # mlxtend
- # pandas
- # scikit-learn
- # scipy
- # spacy
- # tensorboard
- # tensorflow
- # textacy
- # thinc
- # xgboost
-nvidia-nccl-cu12==2.23.4 ; platform_machine != 'aarch64' and platform_system == 'Linux'
- # via xgboost
-oauthlib==3.2.2
- # via requests-oauthlib
-opt-einsum==3.4.0
- # via tensorflow
-packaging==24.2
- # via
- # matplotlib
- # spacy
- # tensorflow
- # thinc
- # weasel
-pandas==2.1.0 ; python_full_version >= '3.12'
- # via
- # qrmine (setup.cfg)
- # mlxtend
-pandas==2.2.3 ; python_full_version < '3.12'
- # via
- # qrmine (setup.cfg)
- # mlxtend
-pillow==11.0.0
- # via matplotlib
-preshed==3.0.9
- # via
- # spacy
- # thinc
-protobuf==4.25.5
- # via
- # tensorboard
- # tensorflow
-pyasn1==0.6.1
- # via
- # pyasn1-modules
- # rsa
-pyasn1-modules==0.4.1
- # via google-auth
-pydantic==1.10.19
- # via
- # confection
- # spacy
- # thinc
- # weasel
-pygments==2.18.0
- # via rich
-pyparsing==3.2.0
- # via matplotlib
-pyphen==0.17.0
- # via textacy
-python-dateutil==2.9.0.post0
- # via
- # matplotlib
- # pandas
-pytz==2024.2
- # via pandas
-requests==2.32.3
- # via
- # requests-oauthlib
- # spacy
- # tensorboard
- # textacy
- # vadersentiment
- # weasel
-requests-oauthlib==2.0.0
- # via google-auth-oauthlib
-rich==13.9.4
- # via typer
-rsa==4.9
- # via google-auth
-scikit-learn==1.5.2
- # via
- # qrmine (setup.cfg)
- # imbalanced-learn
- # mlxtend
- # textacy
-scipy==1.14.1
- # via
- # imbalanced-learn
- # mlxtend
- # scikit-learn
- # textacy
- # xgboost
-setuptools==75.3.0
- # via
- # marisa-trie
- # spacy
- # tensorboard
- # tensorflow
- # thinc
-shellingham==1.5.4
- # via typer
-six==1.16.0
- # via
- # astunparse
- # google-pasta
- # python-dateutil
- # tensorflow
-smart-open==7.0.5
- # via weasel
-spacy==3.7.5
- # via
- # qrmine (setup.cfg)
- # textacy
-spacy-legacy==3.0.12
- # via spacy
-spacy-loggers==1.0.5
- # via spacy
-srsly==2.4.8
- # via
- # confection
- # spacy
- # thinc
- # weasel
-tensorboard==2.13.0
- # via tensorflow
-tensorboard-data-server==0.7.2
- # via tensorboard
-tensorflow==2.13.1
- # via qrmine (setup.cfg)
-tensorflow-estimator==2.13.0
- # via tensorflow
-tensorflow-io-gcs-filesystem==0.31.0
- # via
- # qrmine (setup.cfg)
- # tensorflow
-termcolor==2.5.0
- # via tensorflow
-textacy==0.13.0
- # via qrmine (setup.cfg)
-thinc==8.2.5
- # via spacy
-threadpoolctl==3.5.0
- # via
- # imbalanced-learn
- # scikit-learn
-toolz==1.0.0
- # via cytoolz
-tqdm==4.67.0
- # via
- # spacy
- # textacy
-typer==0.13.0
- # via
- # spacy
- # weasel
-typing-extensions==4.5.0
- # via
- # pydantic
- # tensorflow
- # typer
-tzdata==2024.2
- # via pandas
-urllib3==2.2.3
- # via requests
-vadersentiment==3.3.2
- # via qrmine (setup.cfg)
-wasabi==1.1.3
- # via
- # spacy
- # thinc
- # weasel
-weasel==0.4.1
- # via spacy
-werkzeug==3.1.3
- # via tensorboard
-wheel==0.45.0
- # via
- # astunparse
- # tensorboard
-wrapt==1.16.0
- # via
- # smart-open
- # tensorflow
-xgboost==2.1.2
- # via qrmine (setup.cfg)
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index e6953b9..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,152 +0,0 @@
-# This file is used to configure your project.
-# Read more about the various options under:
-# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html
-# https://setuptools.pypa.io/en/latest/references/keywords.html
-
-[metadata]
-name = qrmine
-description = Qualitative Research support tools in Python!
-author = beapen
-author_email = github@gulfdoctor.net
-license = GPL-3.0-only
-# license_files = LICENSE.txt
-# long_description = file: README.rst
-# long_description_content_type = text/x-rst; charset=UTF-8
-long_description = file: README.md
-long_description_content_type = text/markdown
-url = https://github.com/dermatologist/nlp-qrmine
-# Add here related links, for example:
-project_urls =
- Documentation = https://arxiv.org/abs/2003.13519
-# Source = https://github.com/pyscaffold/pyscaffold/
-# Changelog = https://pyscaffold.org/en/latest/changelog.html
-# Tracker = https://github.com/pyscaffold/pyscaffold/issues
-# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
-# Download = https://pypi.org/project/PyScaffold/#files
-# Twitter = https://twitter.com/PyScaffold
-
-# Change if running only on Windows, Mac or Linux (comma-separated)
-platforms = any
-
-# Add here all kinds of additional classifiers as defined under
-# https://pypi.org/classifiers/
-classifiers =
- Intended Audience :: Science/Research
- Development Status :: 4 - Beta
- Operating System :: OS Independent
- Programming Language :: Python :: 3.11
- Topic :: Scientific/Engineering :: Information Analysis
-
-
-[options]
-zip_safe = False
-packages = find_namespace:
-include_package_data = True
-package_dir =
- =src
-
-# Require a min/specific Python version (comma-separated conditions)
-# python_requires = >=3.8
-
-# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
-# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
-# new major versions. This works if the required packages follow Semantic Versioning.
-# For more information, check out https://semver.org/.
-install_requires =
- importlib-metadata; python_version<"3.8"
- pandas
- matplotlib
- click
- scikit-learn
- imbalanced-learn
- vaderSentiment
- xgboost
- mlxtend
- spacy
- textacy
- tensorflow<=2.13.1
- tensorflow-io-gcs-filesystem<=0.31.0
-
-[options.packages.find]
-where = src
-exclude =
- tests
-
-[options.extras_require]
-# Add here additional requirements for extra features, to install with:
-# `pip install qrmine[PDF]` like:
-# PDF = ReportLab; RXP
-
-# Add here test requirements (semicolon/line-separated)
-testing =
- setuptools
- pytest
- pytest-cov
-
-[options.entry_points]
-# Add here console scripts like:
-# console_scripts =
-# script_name = qrmine.module:function
-# For example:
-# console_scripts =
-# fibonacci = qrmine.skeleton:run
-# And any other entry points, for example:
-# pyscaffold.cli =
-# awesome = pyscaffoldext.awesome.extension:AwesomeExtension
-console_scripts =
- qrmine = qrmine.main:main_routine
-
-[tool:pytest]
-# Specify command line options as you would do when invoking pytest directly.
-# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
-# in order to write a coverage file that can be read by Jenkins.
-# CAUTION: --cov flags may prohibit setting breakpoints while debugging.
-# Comment those flags to avoid this pytest issue.
-addopts =
- --verbose
-norecursedirs =
- dist
- build
- .tox
-
-[aliases]
-release = sdist bdist_wheel upload
-
-[bdist_wheel]
-# Use this option if your package is pure-python
-universal = 1
-
-[build_sphinx]
-source_dir = docs
-build_dir = docs/_build
-
-testpaths = tests
-# Use pytest markers to select/deselect specific tests
-# markers =
-# slow: mark tests as slow (deselect with '-m "not slow"')
-# system: mark end-to-end system tests
-
-[devpi:upload]
-# Options for the devpi: PyPI server and packaging tool
-# VCS export must be deactivated since we are using setuptools-scm
-no_vcs = 1
-formats = bdist_wheel
-
-[flake8]
-# Some sane defaults for the code style checker flake8
-max_line_length = 88
-extend_ignore = E203, W503
-# ^ Black-compatible
-# E203 and W503 have edge cases handled by black
-exclude =
- .tox
- build
- dist
- .eggs
- docs/conf.py
-
-[pyscaffold]
-# PyScaffold's parameters when the project was created.
-# This will be used when updating. Do not change!
-version = 4.6
-package = qrmine
diff --git a/src/qrmine/__init__.py b/src/qrmine/__init__.py
index 09a4e35..3549721 100644
--- a/src/qrmine/__init__.py
+++ b/src/qrmine/__init__.py
@@ -6,6 +6,8 @@
from .readfiles import ReadData
from .sentiment import Sentiment
from .mlqrmine import MLQRMine
+from .cluster import ClusterDocs
+from .visualize import QRVisualize
if sys.version_info[:2] >= (3, 8):
# TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
new file mode 100644
index 0000000..3e68ac3
--- /dev/null
+++ b/src/qrmine/cluster.py
@@ -0,0 +1,189 @@
+"""
+Copyright (C) 2025 Bell Eapen
+
+This file is part of qrmine.
+
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with qrmine. If not, see <https://www.gnu.org/licenses/>.
+"""
+
+from pprint import pprint
+
+import pandas as pd
+import spacy
+from gensim import corpora
+from gensim.models.ldamodel import LdaModel
+
+
+class ClusterDocs:
+
+    def __init__(self, documents=None, titles=None):
+        # None defaults avoid shared mutable default-argument lists
+        self._nlp = spacy.load("en_core_web_sm")
+        self._documents = documents if documents is not None else []
+        self._titles = titles if titles is not None else []
+ self._num_topics = 5
+ self._passes = 15
+ self._dictionary = None
+ self._corpus = None
+ self._lda_model = None
+ # Apply preprocessing to each document
+        self._processed_docs = [self.preprocess(doc) for doc in self._documents]
+ self.process()
+
+ @property
+ def documents(self):
+ return self._documents
+
+ @property
+ def titles(self):
+ return self._titles
+
+ @property
+ def num_topics(self):
+ return self._num_topics
+
+ @property
+ def passes(self):
+ return self._passes
+
+ @property
+ def processed_docs(self):
+ return self._processed_docs
+
+ @documents.setter
+ def documents(self, documents):
+ self._documents = documents
+ self._processed_docs = [self.preprocess(doc) for doc in documents]
+ self.process()
+
+ @titles.setter
+ def titles(self, titles):
+ self._titles = titles
+
+ @num_topics.setter
+ def num_topics(self, num_topics):
+ self._num_topics = num_topics
+
+ @passes.setter
+ def passes(self, passes):
+ self._passes = passes
+
+ # Preprocess the documents using spaCy
+ def preprocess(self, doc):
+ # Tokenize and preprocess each document
+ doc = self._nlp(doc)
+ # Lemmatize and remove stop words
+ tokens = [token.lemma_ for token in doc if not token.is_stop]
+ return tokens
+
+ def process(self):
+ # Create a dictionary representation of the documents
+ self._dictionary = corpora.Dictionary(self._processed_docs)
+ # Create a bag-of-words representation of the documents
+ self._corpus = [self._dictionary.doc2bow(doc) for doc in self._processed_docs]
+
+    # Build the LDA (Latent Dirichlet Allocation) model lazily on first use
+    def build_lda_model(self):
+ if self._lda_model is None:
+ self._lda_model = LdaModel(
+ self._corpus,
+ num_topics=self._num_topics,
+ id2word=self._dictionary,
+ passes=self._passes,
+ )
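+        # show_topics(formatted=False) returns (topic_id, [(word, probability), ...]) pairs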
+ return self._lda_model.show_topics(formatted=False)
+
+ def print_topics(self, num_words=5):
+ if self._lda_model is None:
+ self.build_lda_model()
+ # Print the topics and their corresponding words
+ pprint(self._lda_model.print_topics(num_words=num_words))
+
+ def print_clusters(self):
+ if self._lda_model is None:
+ self.build_lda_model()
+ # Perform semantic clustering
+        for i, doc in enumerate(self._processed_docs):
+ bow = self._dictionary.doc2bow(doc)
+ print(
+ f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}"
+ )
+
+ def format_topics_sentences(self):
+ self.build_lda_model()
+ # Init output
+ sent_topics_df = pd.DataFrame()
+
+ # Get main topic in each document
+ for i, row_list in enumerate(self._lda_model[self._corpus]):
+ row = row_list[0] if self._lda_model.per_word_topics else row_list
+ # print(row)
+ row = sorted(row, key=lambda x: (x[1]), reverse=True)
+ # Get the Dominant topic, Perc Contribution and Keywords for each document
+ for j, (topic_num, prop_topic) in enumerate(row):
+ if j == 0: # => dominant topic
+ wp = self._lda_model.show_topic(topic_num)
+ topic_keywords = ", ".join([word for word, prop in wp])
+ new_row = pd.DataFrame(
+ [[int(topic_num), round(prop_topic, 4), topic_keywords]],
+ columns=[
+ "Dominant_Topic",
+ "Perc_Contribution",
+ "Topic_Keywords",
+ ],
+ )
+ sent_topics_df = pd.concat(
+ [sent_topics_df, new_row], ignore_index=True
+ )
+ else:
+ break
+ sent_topics_df.columns = [
+ "Dominant_Topic",
+ "Perc_Contribution",
+ "Topic_Keywords",
+ ]
+
+ # Add original text to the end of the output
+ contents = pd.Series(self._processed_docs)
+ sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
+ return sent_topics_df.reset_index(drop=False)
+
+ # https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
+ def most_representative_docs(self):
+ sent_topics_df = self.format_topics_sentences()
+ sent_topics_sorteddf_mallet = pd.DataFrame()
+ sent_topics_outdf_grpd = sent_topics_df.groupby("Dominant_Topic")
+
+ for i, grp in sent_topics_outdf_grpd:
+ sent_topics_sorteddf_mallet = pd.concat(
+ [
+ sent_topics_sorteddf_mallet,
+ grp.sort_values(["Perc_Contribution"], ascending=False).head(1),
+ ],
+ axis=0,
+ )
+
+ return sent_topics_sorteddf_mallet
+
+ def topics_per_document(self, start=0, end=1):
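+        # For each document in corpus[start:end], return its dominant topic id
+        # and the full (topic, percentage) distribution from the LDA model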
+ corpus_sel = self._corpus[start:end]
+ dominant_topics = []
+ topic_percentages = []
+ for i, corp in enumerate(corpus_sel):
+ topic_percs = self._lda_model[corp]
+ dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
+ dominant_topics.append((i, dominant_topic))
+ topic_percentages.append(topic_percs)
+ return (dominant_topics, topic_percentages)
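+
+# Minimal usage sketch (see tests/test_nlp.py for the full flow):
+#   docs = ClusterDocs(documents=["first text ...", "second text ..."], titles=["doc1", "doc2"])
+#   docs.print_topics()
+#   df = docs.format_topics_sentences()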
diff --git a/src/qrmine/content.py b/src/qrmine/content.py
index 3344a80..f9e6b0e 100644
--- a/src/qrmine/content.py
+++ b/src/qrmine/content.py
@@ -87,6 +87,10 @@ def idx(self, token):
def doc(self):
return self._processed
+ @property
+ def tokens(self):
+ return [token for token in self._processed if not token.is_stop and not token.is_punct and not token.is_space]
+
def process(self):
for token in self._processed:
if token.is_stop or token.is_digit or token.is_punct or token.is_space:
diff --git a/src/qrmine/mlqrmine.py b/src/qrmine/mlqrmine.py
index 12b75a3..fcfac7a 100644
--- a/src/qrmine/mlqrmine.py
+++ b/src/qrmine/mlqrmine.py
@@ -1,13 +1,10 @@
import numpy
from imblearn.over_sampling import RandomOverSampler
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense
-from numpy import random, argsort, sqrt, array, ones
from pandas import read_csv
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KDTree
@@ -17,6 +14,25 @@
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+
+
+class NeuralNet(nn.Module):
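+    """Feed-forward binary classifier (12 -> 8 -> 1, sigmoid output) that replaces the former Keras Sequential model."""
+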
+ def __init__(self, input_dim):
+ super(NeuralNet, self).__init__()
+ self.fc1 = nn.Linear(input_dim, 12)
+ self.fc2 = nn.Linear(12, 8)
+ self.fc3 = nn.Linear(8, 1)
+ self.relu = nn.ReLU()
+ self.sigmoid = nn.Sigmoid()
+
+ def forward(self, x):
+ x = self.relu(self.fc1(x))
+ x = self.relu(self.fc2(x))
+ x = self.sigmoid(self.fc3(x))
+ return x
+
class MLQRMine(object):
@@ -24,13 +40,13 @@ def __init__(self):
self._seed = randint(1, 9)
self._csvfile = ""
self._titles = None
+ self._model = None
self._dataset = None
self._X = None
self._y = None
self._X_original = None
self._y_original = None
self._dataset_original = None
- self._model = Sequential()
self._sc = StandardScaler()
self._vnum = 0 # Number of variables
self._classifier = XGBClassifier()
@@ -147,22 +163,57 @@ def prepare_data(self, oversample=False):
self.oversample()
def get_nnet_predictions(self):
- self._model.add(Dense(12, input_dim=self._vnum, kernel_initializer='uniform', activation='relu'))
- self._model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
- self._model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
- # Compile model
- self._model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
- # Fit the model
- self._model.fit(self._X, self._y, epochs=self._epochs, batch_size=10, verbose=2)
-
- # calculate predictions
- predictions = self._model.predict(self._X_original)
- # round predictions
- rounded = [round(x[0]) for x in predictions]
+
+ self._model = NeuralNet(self._vnum)
+ criterion = nn.BCELoss()
+ optimizer = optim.Adam(self._model.parameters(), lr=0.001)
+
+ # Convert data to PyTorch tensors
+ X_tensor = torch.tensor(self._X, dtype=torch.float32)
+ y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+
+ # Create a dataset and data loader
+ dataset = TensorDataset(X_tensor, y_tensor)
+ dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+
+ # Train the model
+ for epoch in range(self._epochs):
+ for batch_X, batch_y in dataloader:
+ optimizer.zero_grad()
+ outputs = self._model(batch_X)
+ loss = criterion(outputs, batch_y)
+ loss.backward()
+ optimizer.step()
+
+ # Calculate predictions
+ with torch.no_grad():
+ predictions = self._model(torch.tensor(self._X_original, dtype=torch.float32))
+ rounded = [round(x.item()) for x in predictions]
+ # print("Predictions: ", rounded)
+ # Calculate accuracy
+ correct = sum([1 for i in range(len(rounded)) if rounded[i] == self._y_original[i]])
+ total = len(rounded)
+ accuracy = correct / total
+ print(f'Accuracy: {accuracy * 100:.2f}%')
return rounded
def get_nnet_scores(self):
- return self._model.evaluate(self._X, self._y)
+        # Evaluate the PyTorch model on the training data
+        self._model.eval()
+        X_tensor = torch.tensor(self._X, dtype=torch.float32)
+        y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+        dataset = TensorDataset(X_tensor, y_tensor)
+        dataloader = DataLoader(dataset, batch_size=10, shuffle=False)  # no shuffling needed for scoring
+        correct = 0
+        total = 0
+        with torch.no_grad():
+            for batch_X, batch_y in dataloader:
+                outputs = self._model(batch_X)
+                predicted = (outputs > 0.5).float()
+                total += batch_y.size(0)
+                correct += (predicted == batch_y).sum().item()
+        accuracy = correct / total
+        print(f'Accuracy: {accuracy * 100:.2f}%')
+        return accuracy
def svm_confusion_matrix(self):
"""Generate confusion matrix for SVM
@@ -211,7 +262,6 @@ def get_centroids(self, c=1):
print("Mean")
print(self._dataset.iloc[cluster_list, :].mean(axis=0))
-
"""
TODO: This is not working yet.
use the ColumnTransformer instead of categorical_features
diff --git a/src/qrmine/readfiles.py b/src/qrmine/readfiles.py
index a460795..a213ff7 100644
--- a/src/qrmine/readfiles.py
+++ b/src/qrmine/readfiles.py
@@ -1,5 +1,6 @@
import re
-
+import os
+import requests
+from pypdf import PdfReader
+
class ReadData(object):
def __init__(self):
@@ -37,22 +38,10 @@ def append(self, title, document):
self._documents.append(document)
self._content += document
- def read_file(self, file_names):
- if len(file_names) > 1:
- for file_name in file_names:
- with open(file_name, 'r') as f:
- read_from_file = f.read()
- self._content = re.sub('<[^<]+?>', '', read_from_file)
- self._documents = re.split('.*?', read_from_file)
- # Delete the last blank record
- del self._documents[-1]
- pattern = r"(.*?)"
- _title = re.findall(pattern, read_from_file, flags=re.DOTALL)[0]
- self._titles.append(_title)
- f.close()
- else:
- file_name = file_names[0]
- with open(file_name, 'r') as f:
+    def read_file(self, input):
+        # Dispatch on the input type: a single file path, a folder of .txt/.pdf files, or a URL
+        # if input is a single file name
+        if isinstance(input, str) and os.path.isfile(input):
+            with open(input, 'r') as f:
read_from_file = f.read()
self._content = re.sub('<[^<]+?>', '', read_from_file)
self._documents = re.split('.*?', read_from_file)
@@ -60,25 +49,51 @@ def read_file(self, file_names):
del self._documents[-1]
pattern = r"(.*?)"
self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL)
+        # if input is a folder name
+        elif isinstance(input, str) and os.path.isdir(input):
+            for file_name in os.listdir(input):
+ if file_name.endswith('.txt'):
+ with open(os.path.join(input, file_name), 'r') as f:
+ read_from_file = f.read()
+ self._content += read_from_file
+ self._documents.append(read_from_file)
+ self.titles.append(file_name)
+ if file_name.endswith('.pdf'):
+ with open(os.path.join(input, file_name), 'rb') as f:
+ reader = PdfReader(f)
+ read_from_file = ""
+ for page in reader.pages:
+ read_from_file += page.extract_text()
+ self._content += read_from_file
+ self._documents.append(read_from_file)
+ self.titles.append(file_name)
+        # if input is a url
+        elif isinstance(input, str) and input.startswith(("http://", "https://")):
+ response = requests.get(input)
+ if response.status_code == 200:
+ read_from_file = response.text
+ self._content = read_from_file
+ self._documents.append(read_from_file)
+ self.titles.append(input)
+            else:
+                raise ValueError(f"Could not fetch {input}: HTTP {response.status_code}")
+        else:
+            raise ValueError("Input must be a file name, folder name or url.")
- """
- Combine duplicate topics using Dict
- Currently supported only for single file.
- """
-
- doc_dict = {}
- ct3 = 0
- for t in self._titles:
- doc = doc_dict.get(t)
- if doc:
- doc_dict[t] = doc + self._documents[ct3]
- else:
- doc_dict[t] = self._documents[ct3]
- ct3 += 1
- self._titles.clear()
- self._documents.clear()
- for t in doc_dict.keys():
- self._documents.append(doc_dict.get(t))
- self._titles.append(t)
+ """
+ Combine duplicate topics using Dict
+ """
- f.close()
+ doc_dict = {}
+ ct3 = 0
+ for t in self._titles:
+ doc = doc_dict.get(t)
+ if doc:
+ doc_dict[t] = doc + self._documents[ct3]
+ else:
+ doc_dict[t] = self._documents[ct3]
+ ct3 += 1
+ self._titles.clear()
+ self._documents.clear()
+ for t in doc_dict.keys():
+ self._documents.append(doc_dict.get(t))
+ self._titles.append(t)
diff --git a/src/qrmine/resources/df_dominant_topic.csv b/src/qrmine/resources/df_dominant_topic.csv
new file mode 100644
index 0000000..115eb63
--- /dev/null
+++ b/src/qrmine/resources/df_dominant_topic.csv
@@ -0,0 +1,12 @@
+,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
+0,0,4,0.9903,"., GT, Strauss, ,, coding,
+, ), Theory, seminal, (","['ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']"
+1,1,1,0.7811,",, theory, ., GT, evaluation, structure, coding,
+, ), (","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']"
+2,2,1,0.9783,",, theory, ., GT, evaluation, structure, coding,
+, ), (","['\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n']"
+3,3,3,0.9952,"., ,, coding, category, open, QRMine, datum, researcher, code, GT","['\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n']"
+4,4,4,0.9793,"., GT, Strauss, ,, coding,
+, ), Theory, seminal, (","['\n', 'ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n']"
+5,5,2,0.9712,"category, comparison, incident, ,,
+, involve, refine, identify, emergence, constant","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n']"
diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
new file mode 100644
index 0000000..4a7fc25
--- /dev/null
+++ b/src/qrmine/visualize.py
@@ -0,0 +1,390 @@
+"""
+Copyright (C) 2025 Bell Eapen
+
+This file is part of qrmine.
+
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with qrmine. If not, see <https://www.gnu.org/licenses/>.
+"""
+
+from collections import Counter
+
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib.patches import Rectangle
+from matplotlib.ticker import FuncFormatter
+from sklearn.manifold import TSNE
+from wordcloud import STOPWORDS, WordCloud
+
+
+class QRVisualize:
+ def __init__(self, data: pd.DataFrame = None):
+ """
+ Initialize the QRVisualize class with a DataFrame.
+
+ Parameters:
+ data (pd.DataFrame): The DataFrame containing the data to visualize.
+ """
+ self.data = data
+
+ def plot_frequency_distribution_of_words(self, df=None, folder_path=None):
+ if df is None:
+ df = self.data
+ doc_lens = [len(d) for d in df.Text]
+
+ # Plot
+ plt.figure(figsize=(16, 7), dpi=160)
+ plt.hist(doc_lens, bins=1000, color="navy")
+ plt.text(750, 100, "Mean : " + str(round(np.mean(doc_lens))))
+ plt.text(750, 90, "Median : " + str(round(np.median(doc_lens))))
+ plt.text(750, 80, "Stdev : " + str(round(np.std(doc_lens))))
+ plt.text(750, 70, "1%ile : " + str(round(np.quantile(doc_lens, q=0.01))))
+ plt.text(750, 60, "99%ile : " + str(round(np.quantile(doc_lens, q=0.99))))
+
+ plt.gca().set(
+ xlim=(0, 1000), ylabel="Number of Documents", xlabel="Document Word Count"
+ )
+ plt.tick_params(size=16)
+ plt.xticks(np.linspace(0, 1000, 9))
+ plt.title("Distribution of Document Word Counts", fontdict=dict(size=22))
+        # save before show(); once the window is closed the figure is gone
+        if folder_path:
+            plt.savefig(folder_path)
+        plt.show()
+        plt.close()
+
+ def plot_distribution_by_topic(self, df=None, folder_path=None):
+ if df is None:
+ df = self.data
+ # Plot
+ cols = [
+ color for name, color in mcolors.TABLEAU_COLORS.items()
+ ] # more colors: 'mcolors.XKCD_COLORS'
+
+ fig, axes = plt.subplots(
+ 2, 2, figsize=(16, 14), dpi=160, sharex=True, sharey=True
+ )
+
+ for i, ax in enumerate(axes.flatten()):
+ df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :]
+ doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
+ ax.hist(doc_lens, bins=1000, color=cols[i])
+ ax.tick_params(axis="y", labelcolor=cols[i], color=cols[i])
+            sns.kdeplot(doc_lens, color="black", ax=ax.twinx())  # shade= was removed in newer seaborn; False was the default
+ ax.set(xlim=(0, 1000), xlabel="Document Word Count")
+ ax.set_ylabel("Number of Documents", color=cols[i])
+ ax.set_title("Topic: " + str(i), fontdict=dict(size=16, color=cols[i]))
+
+ fig.tight_layout()
+ fig.subplots_adjust(top=0.90)
+ plt.xticks(np.linspace(0, 1000, 9))
+ fig.suptitle(
+ "Distribution of Document Word Counts by Dominant Topic", fontsize=22
+ )
+        # save before show(); once the window is closed the figure is gone
+        if folder_path:
+            plt.savefig(folder_path)
+        plt.show()
+        plt.close()
+
+ def plot_wordcloud(self, topics=None, folder_path=None):
+ cols = [
+ color for name, color in mcolors.TABLEAU_COLORS.items()
+ ] # more colors: 'mcolors.XKCD_COLORS'
+
+ cloud = WordCloud(
+ stopwords=STOPWORDS,
+ background_color="white",
+ width=250,
+ height=180,
+ max_words=5,
+ colormap="tab10",
+ color_func=lambda *args, **kwargs: cols[i],
+ prefer_horizontal=1.0,
+ )
+
+ fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)
+
+ for i, ax in enumerate(axes.flatten()):
+ fig.add_subplot(ax)
+ topic_words = dict(topics[i][1])
+ cloud.generate_from_frequencies(topic_words, max_font_size=300)
+ plt.gca().imshow(cloud)
+ plt.gca().set_title("Topic " + str(i), fontdict=dict(size=16))
+ plt.gca().axis("off")
+
+ plt.subplots_adjust(wspace=0, hspace=0)
+ plt.axis("off")
+ plt.margins(x=0, y=0)
+ plt.tight_layout()
+        # save before show(); once the window is closed the figure is gone
+        if folder_path:
+            plt.savefig(folder_path)
+        plt.show()
+        plt.close()
+
+ def plot_importance(self, topics=None, processed_docs=None, folder_path=None):
+ data_flat = [w for w_list in processed_docs for w in w_list]
+ counter = Counter(data_flat)
+
+ out = []
+ for i, topic in topics:
+ for word, weight in topic:
+ out.append([word, i, weight, counter[word]])
+
+ df = pd.DataFrame(out, columns=["word", "topic_id", "importance", "word_count"])
+
+ # Plot Word Count and Weights of Topic Keywords
+ fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160)
+ cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
+ for i, ax in enumerate(axes.flatten()):
+ ax.bar(
+ x="word",
+ height="word_count",
+ data=df.loc[df.topic_id == i, :],
+ color=cols[i],
+ width=0.5,
+ alpha=0.3,
+ label="Word Count",
+ )
+ ax_twin = ax.twinx()
+ ax_twin.bar(
+ x="word",
+ height="importance",
+ data=df.loc[df.topic_id == i, :],
+ color=cols[i],
+ width=0.2,
+ label="Weights",
+ )
+ ax.set_ylabel("Word Count", color=cols[i])
+ ax_twin.set_ylim(0, 0.030)
+ ax.set_ylim(0, 3500)
+ ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16)
+ ax.tick_params(axis="y", left=False)
+ ax.set_xticklabels(
+ df.loc[df.topic_id == i, "word"],
+ rotation=30,
+ horizontalalignment="right",
+ )
+ ax.legend(loc="upper left")
+ ax_twin.legend(loc="upper right")
+
+ fig.tight_layout(w_pad=2)
+ fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05)
+        # save before show(); once the window is closed the figure is gone
+        if folder_path:
+            plt.savefig(folder_path)
+        plt.show()
+        plt.close()
+
+ def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13):
+ corp = corpus[start:end]
+ mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()]
+
+ fig, axes = plt.subplots(
+ end - start, 1, figsize=(20, (end - start) * 0.95), dpi=160
+ )
+ axes[0].axis("off")
+ for i, ax in enumerate(axes):
+ if i > 0:
+ corp_cur = corp[i - 1]
+ topic_percs, wordid_topics, wordid_phivalues = lda_model[corp_cur]
+ word_dominanttopic = [
+ (lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics
+ ]
+ ax.text(
+ 0.01,
+ 0.5,
+ "Doc " + str(i - 1) + ": ",
+ verticalalignment="center",
+ fontsize=16,
+ color="black",
+ transform=ax.transAxes,
+ fontweight=700,
+ )
+
+            # Draw a rectangle around the sentence, colored by its dominant topic
+ topic_percs_sorted = sorted(
+ topic_percs, key=lambda x: (x[1]), reverse=True
+ )
+ ax.add_patch(
+ Rectangle(
+ (0.0, 0.05),
+ 0.99,
+ 0.90,
+ fill=None,
+ alpha=1,
+ color=mycolors[topic_percs_sorted[0][0]],
+ linewidth=2,
+ )
+ )
+
+ word_pos = 0.06
+ for j, (word, topics) in enumerate(word_dominanttopic):
+ if j < 14:
+ ax.text(
+ word_pos,
+ 0.5,
+ word,
+ horizontalalignment="left",
+ verticalalignment="center",
+ fontsize=16,
+ color=mycolors[topics],
+ transform=ax.transAxes,
+ fontweight=700,
+ )
+ word_pos += 0.009 * len(
+ word
+ ) # to move the word for the next iter
+ ax.axis("off")
+ ax.text(
+ word_pos,
+ 0.5,
+ ". . .",
+ horizontalalignment="left",
+ verticalalignment="center",
+ fontsize=16,
+ color="black",
+ transform=ax.transAxes,
+ )
+
+ plt.subplots_adjust(wspace=0, hspace=0)
+ plt.suptitle(
+ "Sentence Topic Coloring for Documents: "
+ + str(start)
+ + " to "
+ + str(end - 2),
+ fontsize=22,
+ y=0.95,
+ fontweight=700,
+ )
+ plt.tight_layout()
+ plt.show()
+
+ def cluster_chart(self, lda_model=None, corpus=None, n_topics=4, folder_path=None):
+ # Get topic weights
+ topic_weights = []
+ for i, row_list in enumerate(lda_model[corpus]):
+ topic_weights.append([w for i, w in row_list[0]])
+
+ # Array of topic weights
+ arr = pd.DataFrame(topic_weights).fillna(0).values
+
+ # Keep the well separated points (optional)
+ arr = arr[np.amax(arr, axis=1) > 0.35]
+
+ # Dominant topic number in each doc
+ topic_num = np.argmax(arr, axis=1)
+
+ # tSNE Dimension Reduction
+ tsne_model = TSNE(
+ n_components=2, verbose=1, random_state=0, angle=0.99, init="pca"
+ )
+ tsne_lda = tsne_model.fit_transform(arr)
+
+ # Plot
+ plt.figure(figsize=(16, 10), dpi=160)
+ for i in range(n_topics):
+ plt.scatter(
+ tsne_lda[topic_num == i, 0],
+ tsne_lda[topic_num == i, 1],
+ label=str(i),
+ alpha=0.5,
+ )
+ plt.title("t-SNE Clustering of Topics", fontsize=22)
+ plt.xlabel("t-SNE Dimension 1", fontsize=16)
+ plt.ylabel("t-SNE Dimension 2", fontsize=16)
+ plt.legend(title="Topic Number", loc="upper right")
+        # save before show(); once the window is closed the figure is gone
+        if folder_path:
+            plt.savefig(folder_path)
+        plt.show()
+        plt.close()
+
+ def most_discussed_topics(
+ self, lda_model, dominant_topics, topic_percentages, folder_path=None
+ ):
+
+ # Distribution of Dominant Topics in Each Document
+ df = pd.DataFrame(dominant_topics, columns=["Document_Id", "Dominant_Topic"])
+ dominant_topic_in_each_doc = df.groupby("Dominant_Topic").size()
+ df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(
+ name="count"
+ ).reset_index()
+
+ # Total Topic Distribution by actual weight
+ topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
+ df_topic_weightage_by_doc = (
+ topic_weightage_by_doc.sum().to_frame(name="count").reset_index()
+ )
+
+ # Top 3 Keywords for each Topic
+ topic_top3words = [
+ (i, topic)
+ for i, topics in lda_model.show_topics(formatted=False)
+ for j, (topic, wt) in enumerate(topics)
+ if j < 3
+ ]
+
+ df_top3words_stacked = pd.DataFrame(
+ topic_top3words, columns=["topic_id", "words"]
+ )
+ df_top3words = df_top3words_stacked.groupby("topic_id").agg(", \n".join)
+ df_top3words.reset_index(level=0, inplace=True)
+
+ # Plot
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True)
+
+ # Topic Distribution by Dominant Topics
+ ax1.bar(
+ x="Dominant_Topic",
+ height="count",
+ data=df_dominant_topic_in_each_doc,
+ width=0.5,
+ color="firebrick",
+ )
+ ax1.set_xticks(
+ range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__())
+ )
+ tick_formatter = FuncFormatter(
+ lambda x, pos: "Topic "
+ + str(x)
+ + "\n"
+ + df_top3words.loc[df_top3words.topic_id == x, "words"].values[0]
+ )
+ ax1.xaxis.set_major_formatter(tick_formatter)
+ ax1.set_title("Number of Documents by Dominant Topic", fontdict=dict(size=10))
+ ax1.set_ylabel("Number of Documents")
+ ax1.set_ylim(0, 1000)
+
+ # Topic Distribution by Topic Weights
+ ax2.bar(
+ x="index",
+ height="count",
+ data=df_topic_weightage_by_doc,
+ width=0.5,
+ color="steelblue",
+ )
+ ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__()))
+ ax2.xaxis.set_major_formatter(tick_formatter)
+ ax2.set_title("Number of Documents by Topic Weightage", fontdict=dict(size=10))
+
+        # save before show(); once the window is closed the figure is gone
+        if folder_path:
+            plt.savefig(folder_path)
+        plt.show()
+        plt.close()
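+
+# Usage sketch, assuming a CSV shaped like src/qrmine/resources/df_dominant_topic.csv:
+#   v = QRVisualize(pd.read_csv("df_dominant_topic.csv"))
+#   v.plot_frequency_distribution_of_words()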
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..a5c4b31
--- /dev/null
+++ b/test.py
@@ -0,0 +1,33 @@
+import spacy
+
+# Load spaCy model
+nlp = spacy.load("en_core_web_sm")
+
+# Sample documents
+documents = [
+ "Natural language processing is a field of AI.",
+ "Topic modeling helps in uncovering the main themes in a collection of documents.",
+ "Semantic clustering groups similar documents together based on meaning.",
+ "SpaCy is a popular NLP library.",
+ "Gensim is commonly used for topic modeling.",
+]
+
+
+# Preprocess the documents using spaCy
+def preprocess(doc):
+ # Tokenize and preprocess each document
+ doc = nlp(doc)
+ print(f"Original Document: {doc}")
+ # Lemmatize and remove stop words
+ tokens = [token.lemma_ for token in doc if not token.is_stop]
+ print(f"Processed Tokens: {tokens}")
+ return tokens
+
+
+# Apply preprocessing to each document
+processed_docs = [preprocess(doc) for doc in documents]
+
+
+# Print the processed documents
+for i, doc in enumerate(processed_docs):
+ print(f"Document {i + 1}: {doc}")
\ No newline at end of file
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 4ad331d..6c922a5 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -1,32 +1,45 @@
import pytest
-
@pytest.fixture
def corpus_fixture():
from pkg_resources import resource_filename
from src.qrmine import ReadData
+
corpus = ReadData()
- file_path = resource_filename('src.qrmine.resources', 'interview.txt')
- corpus.read_file([file_path])
+ file_path = resource_filename("src.qrmine.resources", "interview.txt")
+ corpus.read_file(file_path)
return corpus
+
# instannce of Qrmine as fixture
@pytest.fixture
def q():
from src.qrmine import Qrmine
+
_q = Qrmine()
return _q
+
+@pytest.fixture
+def cluster():
+ from src.qrmine import ClusterDocs
+
+ _cluster = ClusterDocs()
+ return _cluster
+
+
# Ref: https://docs.pytest.org/en/latest/capture.html
def test_generate_dict(corpus_fixture, capsys, q):
from src.qrmine import Content
+
num = 10
all_interviews = Content(corpus_fixture.content)
q.print_dict(all_interviews, num)
captured = capsys.readouterr()
print(captured.out)
- assert 'code' in captured.out
+ assert "code" in captured.out
+
def test_generate_topics(corpus_fixture, capsys, q):
q.content = corpus_fixture
@@ -34,22 +47,53 @@ def test_generate_topics(corpus_fixture, capsys, q):
q.print_topics()
captured = capsys.readouterr()
print(captured.out)
- assert 'TOPIC' in captured.out
+ assert "TOPIC" in captured.out
+
def test_category_basket(corpus_fixture, capsys, q):
q.content = corpus_fixture
print(q.category_basket())
captured = capsys.readouterr()
print(captured.out)
- assert 'theory' in captured.out
+ assert "theory" in captured.out
+
def test_category_association(corpus_fixture, capsys, q):
q.content = corpus_fixture
print(q.category_association())
captured = capsys.readouterr()
print(captured.out)
- assert 'theory' in captured.out
+ assert "theory" in captured.out
+
+def test_cluster_topics(corpus_fixture, capsys, cluster):
+ cluster.documents = corpus_fixture.documents
+ cluster.titles = corpus_fixture.titles
+ cluster.print_clusters()
+ captured = capsys.readouterr()
+ print(captured.out)
+ assert "Document" in captured.out
+ cluster.print_topics()
+ captured = capsys.readouterr()
+ print(captured.out)
+ assert "topic" in captured.out
+ print(cluster.build_lda_model())
+ print(cluster.topics_per_document())
+ # Format
+ df_dominant_topic = cluster.format_topics_sentences()
+ # Format the output
+ df_dominant_topic.columns = [
+ "Document_No",
+ "Dominant_Topic",
+ "Topic_Perc_Contrib",
+ "Keywords",
+ "Text",
+ ]
+ print(df_dominant_topic.head(10))
+ assert "Document_No" in df_dominant_topic.columns
+ df_sorted = cluster.most_representative_docs()
+ print(df_sorted.head(10))
+ assert "Dominant_Topic" in df_sorted.columns
diff --git a/tests/test_num.py b/tests/test_num.py
index f0c53cd..ac7a139 100644
--- a/tests/test_num.py
+++ b/tests/test_num.py
@@ -9,7 +9,7 @@ def ml_fixture():
ml = MLQRMine()
file_path = resource_filename('src.qrmine.resources', 'numeric.csv')
ml.csvfile = file_path
- return ml
+ return ml
@@ -19,7 +19,7 @@ def test_nn(ml_fixture, capsys):
ml_fixture.prepare_data(True)
ml_fixture.get_nnet_predictions()
captured = capsys.readouterr()
- assert 'accuracy' in captured.out
+ assert 'Accuracy' in captured.out
def test_svm(ml_fixture, capsys):
ml_fixture.prepare_data(True)
diff --git a/tests/test_readfiles.py b/tests/test_readfiles.py
index aff3a5d..963ed90 100644
--- a/tests/test_readfiles.py
+++ b/tests/test_readfiles.py
@@ -8,8 +8,8 @@ def corpus_fixture():
from src.qrmine import ReadData
corpus = ReadData()
file_path = resource_filename('src.qrmine.resources', 'interview.txt')
- corpus.read_file([file_path])
- return corpus
+ corpus.read_file(file_path)
+ return corpus
def test_content(corpus_fixture):
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
new file mode 100644
index 0000000..41f7145
--- /dev/null
+++ b/tests/test_visualize.py
@@ -0,0 +1,114 @@
+import pytest
+import pandas as pd
+from src.qrmine.visualize import QRVisualize
+
+
+@pytest.fixture
+def v():
+ from pkg_resources import resource_filename
+
+ file_path = resource_filename("src.qrmine.resources", "df_dominant_topic.csv")
+ data = pd.read_csv(file_path)
+ _v = QRVisualize(data)
+ return _v
+
+
+@pytest.fixture
+def topics():
+ return [
+ (
+ 0,
+ [
+ (".", 0.095292516),
+ (",", 0.053392828),
+ ("category", 0.032462463),
+ ("coding", 0.032456465),
+ ("open", 0.032437164),
+ ("QRMine", 0.03243305),
+ ("datum", 0.021980358),
+ ("researcher", 0.021978099),
+ ("theory", 0.011536299),
+ ("GT", 0.011533132),
+ ],
+ ),
+ (
+ 1,
+ [
+ (".", 0.007783216),
+ (",", 0.007773952),
+ ("open", 0.007728422),
+ ("researcher", 0.0077227736),
+ ("coding", 0.007722049),
+ ("category", 0.007721938),
+ ("datum", 0.007717547),
+ ("QRMine", 0.007716193),
+ ("dissect", 0.0077070068),
+ ("support", 0.0077060354),
+ ],
+ ),
+ (
+ 2,
+ [
+ (",", 0.05126711),
+ (".", 0.05125151),
+ ("theory", 0.038604487),
+ ("category", 0.03227912),
+ ("GT", 0.032278605),
+ ("\n", 0.029119665),
+ ("comparison", 0.025947908),
+ ("coding", 0.025941858),
+ ("incident", 0.019622542),
+ (")", 0.019619444),
+ ],
+ ),
+ (
+ 3,
+ [
+ (".", 0.007849805),
+ (",", 0.007837688),
+ ("theory", 0.00781459),
+ ("coding", 0.0078089647),
+ ("category", 0.0077514737),
+ ("GT", 0.0077493717),
+ ("datum", 0.007742789),
+ ("open", 0.0077355755),
+ ("\n", 0.0077245855),
+ ("researcher", 0.0077191954),
+ ],
+ ),
+ (
+ 4,
+ [
+ (",", 0.007834569),
+ (".", 0.007812336),
+ ("coding", 0.0077863215),
+ ("category", 0.007759207),
+ ("theory", 0.0077459146),
+ ("GT", 0.0077370973),
+ ("code", 0.0077265715),
+ ("datum", 0.007720947),
+ ("open", 0.007720898),
+ ("comparison", 0.007720567),
+ ],
+ ),
+ ]
+
+
+def test_frequency_distribution_of_words(v, capsys):
+ v.plot_frequency_distribution_of_words(
+ v.data
+ )
+ captured = capsys.readouterr()
+ print(captured.out)
+
+
+def test_distribution_by_topic(v, capsys):
+ v.plot_distribution_by_topic(v.data)
+ captured = capsys.readouterr()
+ print(captured.out)
+
+
+def test_plot_wordcloud(v, topics, capsys):
+ v.plot_wordcloud(topics)
+ captured = capsys.readouterr()
+ print(captured.out)
diff --git a/tox.ini b/tox.ini
index 3eb707d..dbb293d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -8,9 +8,6 @@ envlist = py311, integration
[testenv]
setenv = TOXINIDIR = {toxinidir}
-deps =
- -rrequirements.txt
- -rdev-requirements.txt
commands =
python -m spacy download en_core_web_sm
py.test {posargs}
@@ -20,9 +17,6 @@ extras =
[testenv:integration]
setenv = TOXINIDIR = {toxinidir}
-deps =
- -rrequirements.txt
- -rdev-requirements.txt
commands =
python -m spacy download en_core_web_sm
python qrminer.py
\ No newline at end of file