diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 7b25612..be1ae17 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -11,21 +11,26 @@ jobs:
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
- - name: Set up Python
- uses: actions/setup-python@v4
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
with:
- python-version: '3.11'
- - name: Install dependencies
+ enable-cache: true
+ - name: "Set up Python"
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: "pyproject.toml"
+
+ - name: Install the project
run: |
- python -m pip install --upgrade pip
- pip install -r requirements.txt -r dev-requirements.txt
- python -m spacy download en_core_web_sm
+ uv sync --all-extras --dev
+ uv pip install pip
+ uv run python -m spacy download en_core_web_sm
- name: Create docs
run: |
- make -C docs/ html
+ uv run python -m sphinx -b html docs/ docs/_build/html
cp docs/_config.yml docs/_build/html/_config.yml
- name: Deploy Docs
- uses: JamesIves/github-pages-deploy-action@v4.2.5
+ uses: JamesIves/github-pages-deploy-action@v4
with:
branch: gh-pages # The branch the action should deploy to.
- folder: docs/_build/html # The folder the action should deploy.
\ No newline at end of file
+ folder: docs/_build/html # The folder the action should deploy.
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index f742724..0a96d34 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -1,4 +1,4 @@
-name: Pytest on PR
+name: Pytest using UV on PR
on:
push:
branches:
@@ -13,27 +13,27 @@ jobs:
strategy:
max-parallel: 4
matrix:
- python-version: ["3.11"]
os: [ubuntu-latest, macos-13, windows-latest]
runs-on: ${{ matrix.os }}
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v4
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
with:
- python-version: ${{ matrix.python-version }}
- cache: 'pip' # caching pip dependencies
+ enable-cache: true
+ - name: "Set up Python"
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: "pyproject.toml"
- name: run on mac
if: startsWith(matrix.os, 'mac')
run: |
brew install libomp
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install -r requirements.txt
- python -m spacy download en_core_web_sm
- - name: Test with pytest
+ - name: Install the project
run: |
- pip install pytest
- pytest
+ uv sync --all-extras --dev
+ uv pip install pip
+ uv run python -m spacy download en_core_web_sm
+ - name: Run tests
+ run: uv run pytest tests
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9018711..6a2859f 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -11,20 +11,25 @@ jobs:
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
- - name: Set up Python
- uses: actions/setup-python@v5.1.1
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
with:
- python-version: '3.11'
- - name: Install dependencies
+ enable-cache: true
+ - name: "Set up Python"
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: "pyproject.toml"
+ - name: Install the project
run: |
- python -m pip install --upgrade pip
- pip install -r dev-requirements.txt
+ uv sync --all-extras --dev
+ uv pip install pip
+ uv run python -m spacy download en_core_web_sm
- name: Build and publish
run: |
- python setup.py bdist_wheel
+ uv run python setup.py bdist_wheel
- name: Publish distribution 📦 to PyPI
if: startsWith(github.ref, 'refs/tags')
- uses: pypa/gh-action-pypi-publish@master
+ uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
- password: ${{ secrets.PYPI_API_TOKEN }}
\ No newline at end of file
+ password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml
index 2b436ed..cb188de 100644
--- a/.github/workflows/tox.yml
+++ b/.github/workflows/tox.yml
@@ -17,15 +17,20 @@ jobs:
steps:
- uses: actions/checkout@v4
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5.1.1
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
+ enable-cache: true
+ - name: "Set up Python"
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: "pyproject.toml"
+
+ - name: Install the project
run: |
- python -m pip install --upgrade pip
- pip install -r dev-requirements.txt -r requirements.txt
- python -m spacy download en_core_web_sm
+ uv sync --all-extras --dev
+ uv pip install pip
+ uv run python -m spacy download en_core_web_sm
- name: Test with tox
run: |
- tox
\ No newline at end of file
+ uv run tox
diff --git a/.gitignore b/.gitignore
index 64049e7..c29a2a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ __pycache__/*
.idea
.venv
conda
+uv.lock
# Package files
*.egg
diff --git a/README.md b/README.md
index 62e9d85..95e703a 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# :flashlight: QRMine
+# QRMine
*/ˈkärmīn/*
[](https://www.python.org/)[](https://pypi.python.org/pypi/qrmine/)
@@ -6,11 +6,17 @@

[](https://dermatologist.github.io/nlp-qrmine/)
-QRMine is a suite of qualitative research (QR) data mining tools in Python using Natural Language Processing (NLP) and Machine Learning (ML). QRMine is work in progress. [Read More..](https://nuchange.ca/2017/09/grounded-theory-qualitative-research-python.html)
+Qualitative research involves the collection and analysis of textual data, such as interview transcripts, open-ended survey responses, and field notes. It is often used in social sciences, humanities, and health research to explore complex phenomena and understand human experiences. In addition to textual data, qualitative researchers may also collect quantitative data, such as survey responses or demographic information, to complement their qualitative findings.
-## What it does
+Qualitative research is often characterized by an inductive approach, in which researchers aim to generate theories or concepts from the data rather than test pre-existing hypotheses. Grounded Theory is one such methodology, emphasizing data-driven analysis and theory development.
-### NLP
+QRMine is a Python package for qualitative research and for triangulating textual and numeric data in Grounded Theory. It provides Natural Language Processing (NLP) and Machine Learning (ML) tools to analyze qualitative data, such as interview transcripts, alongside quantitative data, such as survey responses, to support theorizing.
+
+Version 4.0 is a major update with new features and bug fixes; it moves some of the ML dependencies to an optional install. It is also a prelude to version 5.0, which will introduce large language models (LLMs) for qualitative research.
+
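+QRMine can also be used directly from Python. Below is a minimal sketch of that usage, mirroring what the command line tool does internally (the file name is hypothetical):
+
+```python
+from qrmine import ReadData, Content, ClusterDocs
+
+data = ReadData()
+data.read_file("interviews.txt")  # transcripts, separated by the break tag described below
+
+content = Content(data.content)
+cluster = ClusterDocs(content)
+cluster.documents = data.documents  # setting documents re-runs preprocessing
+cluster.titles = data.titles
+cluster.print_topics()  # LDA topics for the corpus
+```
+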
+## ✨ Features
+
+### NLP
* Lists common categories for open coding.
* Create a coding dictionary with categories, properties and dimensions.
* Topic modelling.
@@ -18,9 +24,11 @@ QRMine is a suite of qualitative research (QR) data mining tools in Python using
* Compare two documents/interviews.
* Select documents/interviews by sentiment, category or title for further analysis.
* Sentiment analysis
+* Clusters documents and creates visualizations.
+* Generate (non-LLM) summaries of documents/interviews.
-### ML
+### ML
* Accuracy of a neural network model trained using the data
* Confusion matrix from a support vector machine classifier
* K nearest neighbours of a given record
@@ -28,25 +36,29 @@ QRMine is a suite of qualitative research (QR) data mining tools in Python using
* Principal Component Analysis (PCA)
* Association rules
-## How to install
+## 🛠️ How to install
-* Requires Python 3.11 and a CPU that support AVX instructions
+* Requires Python 3.11
```text
-pip install uv
-uv pip install qrmine
+pip install qrmine
python -m spacy download en_core_web_sm
```
+* For ML functions (neural networks & SVM), install the optional packages
+```text
+pip install qrmine[ml]
+```
+
### Mac users
* Mac users, please install *libomp* for XGBoost
```
brew install libomp
```
-## How to Use
+## How to Use
-* input files are transcripts as txt files and a single csv file with numeric data. The output txt file can be specified.
+* Input files are transcripts as txt/pdf files and (optionally) a single csv file with numeric data. The output txt file can be specified. All transcripts can be in a single file separated by a break tag as described below.
* The coding dictionary, topics and topic assignments can be created from the entire corpus (all documents) using the respective command line options, as shown in the example below.
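+
+For example, to generate the coding dictionary and the topic model for all interviews (file names are hypothetical; run `qrmine --help` for the full list of options):
+
+```text
+qrmine --codedict -i interviews.txt -n 10
+qrmine --topics -i interviews.txt -o topics.txt
+```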
@@ -140,33 +152,15 @@ index, obesity, bmi, exercise, income, bp, fbs, has_diabetes
## Author
-* [Bell Eapen](https://nuchange.ca) (McMaster U) | [Contact](https://nuchange.ca/contact) | [](https://twitter.com/beapen)
+* [Bell Eapen](https://nuchange.ca) ([UIS](https://www.uis.edu/directory/bell-punneliparambil-eapen)) | [Contact](https://nuchange.ca/contact) | [](https://twitter.com/beapen)
-* This software is developed and tested using [Compute Canada](http://www.computecanada.ca) resources.
-* See also: [:fire: The FHIRForm framework for managing healthcare eForms](https://github.com/E-Health/fhirform)
-* See also: [:eyes: Drishti | An mHealth sense-plan-act framework!](https://github.com/E-Health/drishti)
## Citation
-Please cite QRMine in your publications if it helped your research. Here
-is an example BibTeX entry [(Read paper on arXiv)](https://arxiv.org/abs/2003.13519):
-
-```
-
-@article{eapenbr2019qrmine,
- title={QRMine: A python package for triangulation in Grounded Theory},
- author={Eapen, Bell Raj and Archer, Norm and Sartpi, Kamran},
- journal={arXiv preprint arXiv:2003.13519 },
- year={2020}
-}
-
-```
-
-QRMine is inspired by [this work](https://github.com/lknelson/computational-grounded-theory) and the associated [paper](https://journals.sagepub.com/doi/abs/10.1177/0049124117729703).
+Please cite QRMine in your publications if it helped your research.
+Citation information will be available soon.
## Give us a star ⭐️
If you find this project useful, give us a star. It helps others discover the project.
-## Demo
-[](https://github.com/dermatologist/nlp-qrmine/blob/develop/notes/qrmine.gif)
diff --git a/dev-requirements.in b/dev-requirements.in
deleted file mode 100644
index 2b56355..0000000
--- a/dev-requirements.in
+++ /dev/null
@@ -1,11 +0,0 @@
-# dev-requirements.in
--c requirements.txt
-pytest-cov
-pytest
-recommonmark
-sphinx>=3.2.1
-setuptools
-setuptools_scm
-wheel>=0.37.0 # conflicts with dependency of tensorflow
-tox
-pip-tools
\ No newline at end of file
diff --git a/dev-requirements.txt b/dev-requirements.txt
deleted file mode 100644
index f36f95c..0000000
--- a/dev-requirements.txt
+++ /dev/null
@@ -1,146 +0,0 @@
-# This file was autogenerated by uv via the following command:
-# uv pip compile dev-requirements.in -o dev-requirements.txt --universal
-alabaster==1.0.0
- # via sphinx
-babel==2.16.0
- # via sphinx
-build==1.2.2.post1
- # via pip-tools
-cachetools==5.5.0
- # via
- # -c requirements.txt
- # tox
-certifi==2024.8.30
- # via
- # -c requirements.txt
- # requests
-chardet==5.2.0
- # via tox
-charset-normalizer==3.4.0
- # via
- # -c requirements.txt
- # requests
-click==8.1.7
- # via
- # -c requirements.txt
- # pip-tools
-colorama==0.4.6
- # via
- # -c requirements.txt
- # build
- # click
- # pytest
- # sphinx
- # tox
-commonmark==0.9.1
- # via recommonmark
-coverage==7.6.4
- # via pytest-cov
-distlib==0.3.9
- # via virtualenv
-docutils==0.21.2
- # via
- # recommonmark
- # sphinx
-filelock==3.16.1
- # via
- # tox
- # virtualenv
-idna==3.10
- # via
- # -c requirements.txt
- # requests
-imagesize==1.4.1
- # via sphinx
-iniconfig==2.0.0
- # via pytest
-jinja2==3.1.4
- # via
- # -c requirements.txt
- # sphinx
-markupsafe==3.0.2
- # via
- # -c requirements.txt
- # jinja2
-packaging==24.2
- # via
- # -c requirements.txt
- # build
- # pyproject-api
- # pytest
- # setuptools-scm
- # sphinx
- # tox
-pip==24.3.1
- # via pip-tools
-pip-tools==7.4.1
- # via -r dev-requirements.in
-platformdirs==4.3.6
- # via
- # tox
- # virtualenv
-pluggy==1.5.0
- # via
- # pytest
- # tox
-pygments==2.18.0
- # via
- # -c requirements.txt
- # sphinx
-pyproject-api==1.8.0
- # via tox
-pyproject-hooks==1.2.0
- # via
- # build
- # pip-tools
-pytest==8.3.3
- # via
- # -r dev-requirements.in
- # pytest-cov
-pytest-cov==6.0.0
- # via -r dev-requirements.in
-recommonmark==0.7.1
- # via -r dev-requirements.in
-requests==2.32.3
- # via
- # -c requirements.txt
- # sphinx
-setuptools==75.3.0
- # via
- # -c requirements.txt
- # -r dev-requirements.in
- # pip-tools
- # setuptools-scm
-setuptools-scm==8.1.0
- # via -r dev-requirements.in
-snowballstemmer==2.2.0
- # via sphinx
-sphinx==8.1.3
- # via
- # -r dev-requirements.in
- # recommonmark
-sphinxcontrib-applehelp==2.0.0
- # via sphinx
-sphinxcontrib-devhelp==2.0.0
- # via sphinx
-sphinxcontrib-htmlhelp==2.1.0
- # via sphinx
-sphinxcontrib-jsmath==1.0.1
- # via sphinx
-sphinxcontrib-qthelp==2.0.0
- # via sphinx
-sphinxcontrib-serializinghtml==2.0.0
- # via sphinx
-tox==4.23.2
- # via -r dev-requirements.in
-urllib3==2.2.3
- # via
- # -c requirements.txt
- # requests
-virtualenv==20.27.1
- # via tox
-wheel==0.45.0
- # via
- # -c requirements.txt
- # -r dev-requirements.in
- # pip-tools
diff --git a/notes/conda.md b/notes/conda.md
new file mode 100644
index 0000000..79eb6c8
--- /dev/null
+++ b/notes/conda.md
@@ -0,0 +1,12 @@
+conda create --name qrmine python=3.11
+conda activate qrmine
+
+conda install conda-forge::uv
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
+uv pip install -e .
+python -m spacy download en_core_web_sm
+
+
+
+pip3 install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
\ No newline at end of file
diff --git a/notes/new-process.md b/notes/new-process.md
new file mode 100644
index 0000000..08b6584
--- /dev/null
+++ b/notes/new-process.md
@@ -0,0 +1,35 @@
+conda install conda-forge::uv
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
+
+delete setup.cfg
+delete requirements.txt, dev-requirements.txt, dev-requirements.in
+remove deps from tox.ini
+
+uv pip install -e .
+see pr.yml for GitHub actions
+see pyproject.toml for the pytorch cpu install (snippet copied at the end of this note)
+uv pip install -e .
+
+uv sync --all-extras --dev
+uv pip install pip
+uv run python -m spacy download en_core_web_sm
+
+pyproject.toml
+===============
+requires = ["setuptools>=61.2", "wheel", "pip"]
+license = "GPL-3.0" #This should be a string
+dev = [
+ "setuptools",
+ "setuptools_scm",
+ "pytest",
+ "pytest-cov",
+ "tox",
+ "black",
+ "recommonmark",
+ "sphinx",
+ "wheel",
+ "twine",
+]
+
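+The pytorch cpu install mentioned above is configured in pyproject.toml roughly as below (copied from the [tool.uv] settings in this change; verify against the current file):
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[tool.uv.sources]
+torch = [{ index = "pytorch-cpu" }]
+torchvision = [{ index = "pytorch-cpu" }]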
diff --git a/notes/pip-tools.md b/notes/pip-tools.md
index da4baa4..c504a1e 100644
--- a/notes/pip-tools.md
+++ b/notes/pip-tools.md
@@ -21,4 +21,7 @@ OR
* pip install uv
* uv pip compile setup.cfg -o requirements.txt --universal
-* uv pip compile dev-requirements.in -o dev-requirements.txt --universal
\ No newline at end of file
+* uv pip compile dev-requirements.in -o dev-requirements.txt --universal
+
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 89a5bed..397a880 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,192 @@
[build-system]
-# AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD!
-requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"]
+requires = ["setuptools", "wheel", "pip"]
build-backend = "setuptools.build_meta"
-[tool.setuptools_scm]
-# For smarter version schemes and other configuration options,
-# check out https://github.com/pypa/setuptools_scm
-version_scheme = "no-guess-dev"
+[project]
+name = "qrmine"
+description = "Qualitative Research support tools in Python!"
+authors = [{name = "beapen", email = "github@gulfdoctor.net"}]
+license = "GPL-3.0"
+# license_files = LICENSE.txt
+# long_description = file: README.rst
+# long_description_content_type = text/x-rst; charset=UTF-8
+classifiers = [
+ "Intended Audience :: Science/Research",
+ "Development Status :: 4 - Beta",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3.11",
+ "Topic :: Scientific/Engineering :: Information Analysis",
+]
+requires-python = ">=3.11, <3.12"
+dependencies = [
+ 'importlib-metadata; python_version<"3.8"',
+ "pandas",
+ "mlxtend",
+ "matplotlib",
+ "click",
+ "vaderSentiment",
+ "spacy",
+ "textacy",
+ "pypdf",
+ "requests",
+ "gensim",
+ "seaborn",
+ "wordcloud",
+ "tabulate",
+]
+dynamic = ["version"]
+
+[project.readme]
+file = "README.md"
+content-type = "text/markdown"
+# Add here related links, for example:
+
+[project.urls]
+Homepage = "https://github.com/dermatologist/nlp-qrmine"
+Documentation = "https://arxiv.org/abs/2003.13519"
+# Source = https://github.com/pyscaffold/pyscaffold/
+# Changelog = https://pyscaffold.org/en/latest/changelog.html
+# Tracker = https://github.com/pyscaffold/pyscaffold/issues
+# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
+# Download = https://pypi.org/project/PyScaffold/#files
+# Twitter = https://twitter.com/PyScaffold
+# Change if running only on Windows, Mac or Linux (comma-separated)
+# Add here all kinds of additional classifiers as defined under
+# https://pypi.org/classifiers/
+
+[project.optional-dependencies]
+# Add here additional requirements for extra features, to install with:
+# `pip install qrmine[PDF]` like:
+# PDF = ReportLab; RXP
+# Add here test requirements (semicolon/line-separated)
+testing = [
+ "setuptools",
+ "pytest",
+ "pytest-cov",
+]
+
+dev = [
+ "setuptools>=77.0.0",
+ "packaging>=24.2",
+ "setuptools_scm",
+ "pytest",
+ "pytest-cov",
+ "tox",
+ "black",
+ "recommonmark",
+ "sphinx",
+ "wheel",
+ "twine",
+ "build",
+]
+
+ml = [
+ "scikit-learn",
+ "imbalanced-learn",
+ "xgboost",
+ "torch==2.2.2",
+]
+
+[project.entry-points]
+# Add here console scripts like:
+# console_scripts =
+# script_name = qrmine.module:function
+# For example:
+# console_scripts =
+# fibonacci = qrmine.skeleton:run
+# And any other entry points, for example:
+# pyscaffold.cli =
+# awesome = pyscaffoldext.awesome.extension:AwesomeExtension
+
+[project.scripts]
+qrmine = "qrmine.main:main_routine"
+
+[tool.setuptools]
+zip-safe = false
+include-package-data = true
+package-dir = {"" = "src"}
+# Require a min/specific Python version (comma-separated conditions)
+# python_requires = >=3.8
+# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
+# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
+# new major versions. This works if the required packages follow Semantic Versioning.
+# For more information, check out https://semver.org/.
+platforms = ["any"]
+
+
+[tool.setuptools.packages.find]
+where = [ "src"]
+exclude = [ "tests", "notes", "docs", "notebooks"]
+namespaces = true
+
+[tool.pytest.ini_options]
+# Specify command line options as you would do when invoking pytest directly.
+# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
+# in order to write a coverage file that can be read by Jenkins.
+# CAUTION: --cov flags may prohibit setting breakpoints while debugging.
+# Comment those flags to avoid this pytest issue.
+addopts = """
+--verbose"""
+norecursedirs = [
+ "dist",
+ "build",
+ ".tox",
+]
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[tool.uv.sources]
+torch = [
+ { index = "pytorch-cpu" },
+]
+torchvision = [
+ { index = "pytorch-cpu" },
+]
+
+[tool.aliases]
+release = "sdist bdist_wheel upload"
+
+[tool.distutils.bdist_wheel]
+# Use this option if your package is pure-python
+universal = 0
+
+[tool.build_sphinx]
+source_dir = "docs"
+build_dir = "docs/_build"
+testpaths = "tests"
+# Use pytest markers to select/deselect specific tests
+# markers =
+# slow: mark tests as slow (deselect with '-m "not slow"')
+# system: mark end-to-end system tests
+
+[tool.devpi.upload]
+# Options for the devpi: PyPI server and packaging tool
+# VCS export must be deactivated since we are using setuptools-scm
+no_vcs = "1"
+formats = "bdist_wheel"
+
+[tool.flake8]
+# Some sane defaults for the code style checker flake8
+max_line_length = "88"
+extend_ignore = "E203, W503"
+# ^ Black-compatible
+# E203 and W503 have edge cases handled by black
+exclude = """
+.tox
+build
+dist
+.eggs
+docs/conf.py"""
+
+[tool.pyscaffold]
+# PyScaffold's parameters when the project was created.
+# This will be used when updating. Do not change!
+version = "4.6"
+package = "qrmine"
+# This file is used to configure your project.
+# Read more about the various options under:
+# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html
+# https://setuptools.pypa.io/en/latest/references/keywords.html
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 260d413..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,314 +0,0 @@
-# This file was autogenerated by uv via the following command:
-# uv pip compile setup.cfg -o requirements.txt --universal
-absl-py==2.1.0
- # via
- # tensorboard
- # tensorflow
-astunparse==1.6.3
- # via tensorflow
-blis==0.7.11
- # via thinc
-cachetools==5.5.0
- # via
- # google-auth
- # textacy
-catalogue==2.0.10
- # via
- # spacy
- # srsly
- # textacy
- # thinc
-certifi==2024.8.30
- # via requests
-charset-normalizer==3.4.0
- # via requests
-click==8.1.7
- # via
- # qrmine (setup.cfg)
- # typer
-cloudpathlib==0.20.0
- # via weasel
-colorama==0.4.6 ; sys_platform == 'win32' or platform_system == 'Windows'
- # via
- # click
- # tqdm
- # wasabi
-confection==0.1.5
- # via
- # thinc
- # weasel
-contourpy==1.3.0
- # via matplotlib
-cycler==0.12.1
- # via matplotlib
-cymem==2.0.8
- # via
- # preshed
- # spacy
- # thinc
-cytoolz==1.0.0
- # via textacy
-flatbuffers==24.3.25
- # via tensorflow
-floret==0.10.5
- # via textacy
-fonttools==4.54.1
- # via matplotlib
-gast==0.4.0
- # via tensorflow
-google-auth==2.36.0
- # via
- # google-auth-oauthlib
- # tensorboard
-google-auth-oauthlib==1.0.0
- # via tensorboard
-google-pasta==0.2.0
- # via tensorflow
-grpcio==1.67.1
- # via
- # tensorboard
- # tensorflow
-h5py==3.12.1
- # via tensorflow
-idna==3.10
- # via requests
-imbalanced-learn==0.12.4
- # via qrmine (setup.cfg)
-jellyfish==1.1.0
- # via textacy
-jinja2==3.1.6
- # via spacy
-joblib==1.4.2
- # via
- # imbalanced-learn
- # mlxtend
- # scikit-learn
- # textacy
-keras==2.13.1
- # via tensorflow
-kiwisolver==1.4.7
- # via matplotlib
-langcodes==3.4.1
- # via spacy
-language-data==1.2.0
- # via langcodes
-libclang==18.1.1
- # via tensorflow
-marisa-trie==1.2.1
- # via language-data
-markdown==3.7
- # via tensorboard
-markdown-it-py==3.0.0
- # via rich
-markupsafe==3.0.2
- # via
- # jinja2
- # werkzeug
-matplotlib==3.9.2
- # via
- # qrmine (setup.cfg)
- # mlxtend
-mdurl==0.1.2
- # via markdown-it-py
-mlxtend==0.23.2
- # via qrmine (setup.cfg)
-murmurhash==1.0.10
- # via
- # preshed
- # spacy
- # thinc
-networkx==3.4.2
- # via textacy
-numpy==1.24.3
- # via
- # blis
- # contourpy
- # floret
- # h5py
- # imbalanced-learn
- # matplotlib
- # mlxtend
- # pandas
- # scikit-learn
- # scipy
- # spacy
- # tensorboard
- # tensorflow
- # textacy
- # thinc
- # xgboost
-nvidia-nccl-cu12==2.23.4 ; platform_machine != 'aarch64' and platform_system == 'Linux'
- # via xgboost
-oauthlib==3.2.2
- # via requests-oauthlib
-opt-einsum==3.4.0
- # via tensorflow
-packaging==24.2
- # via
- # matplotlib
- # spacy
- # tensorflow
- # thinc
- # weasel
-pandas==2.1.0 ; python_full_version >= '3.12'
- # via
- # qrmine (setup.cfg)
- # mlxtend
-pandas==2.2.3 ; python_full_version < '3.12'
- # via
- # qrmine (setup.cfg)
- # mlxtend
-pillow==11.0.0
- # via matplotlib
-preshed==3.0.9
- # via
- # spacy
- # thinc
-protobuf==4.25.5
- # via
- # tensorboard
- # tensorflow
-pyasn1==0.6.1
- # via
- # pyasn1-modules
- # rsa
-pyasn1-modules==0.4.1
- # via google-auth
-pydantic==1.10.19
- # via
- # confection
- # spacy
- # thinc
- # weasel
-pygments==2.18.0
- # via rich
-pyparsing==3.2.0
- # via matplotlib
-pyphen==0.17.0
- # via textacy
-python-dateutil==2.9.0.post0
- # via
- # matplotlib
- # pandas
-pytz==2024.2
- # via pandas
-requests==2.32.3
- # via
- # requests-oauthlib
- # spacy
- # tensorboard
- # textacy
- # vadersentiment
- # weasel
-requests-oauthlib==2.0.0
- # via google-auth-oauthlib
-rich==13.9.4
- # via typer
-rsa==4.9
- # via google-auth
-scikit-learn==1.5.2
- # via
- # qrmine (setup.cfg)
- # imbalanced-learn
- # mlxtend
- # textacy
-scipy==1.14.1
- # via
- # imbalanced-learn
- # mlxtend
- # scikit-learn
- # textacy
- # xgboost
-setuptools==75.3.0
- # via
- # marisa-trie
- # spacy
- # tensorboard
- # tensorflow
- # thinc
-shellingham==1.5.4
- # via typer
-six==1.16.0
- # via
- # astunparse
- # google-pasta
- # python-dateutil
- # tensorflow
-smart-open==7.0.5
- # via weasel
-spacy==3.7.5
- # via
- # qrmine (setup.cfg)
- # textacy
-spacy-legacy==3.0.12
- # via spacy
-spacy-loggers==1.0.5
- # via spacy
-srsly==2.4.8
- # via
- # confection
- # spacy
- # thinc
- # weasel
-tensorboard==2.13.0
- # via tensorflow
-tensorboard-data-server==0.7.2
- # via tensorboard
-tensorflow==2.13.1
- # via qrmine (setup.cfg)
-tensorflow-estimator==2.13.0
- # via tensorflow
-tensorflow-io-gcs-filesystem==0.31.0
- # via
- # qrmine (setup.cfg)
- # tensorflow
-termcolor==2.5.0
- # via tensorflow
-textacy==0.13.0
- # via qrmine (setup.cfg)
-thinc==8.2.5
- # via spacy
-threadpoolctl==3.5.0
- # via
- # imbalanced-learn
- # scikit-learn
-toolz==1.0.0
- # via cytoolz
-tqdm==4.67.0
- # via
- # spacy
- # textacy
-typer==0.13.0
- # via
- # spacy
- # weasel
-typing-extensions==4.5.0
- # via
- # pydantic
- # tensorflow
- # typer
-tzdata==2024.2
- # via pandas
-urllib3==2.2.3
- # via requests
-vadersentiment==3.3.2
- # via qrmine (setup.cfg)
-wasabi==1.1.3
- # via
- # spacy
- # thinc
- # weasel
-weasel==0.4.1
- # via spacy
-werkzeug==3.1.3
- # via tensorboard
-wheel==0.45.0
- # via
- # astunparse
- # tensorboard
-wrapt==1.16.0
- # via
- # smart-open
- # tensorflow
-xgboost==2.1.2
- # via qrmine (setup.cfg)
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index e6953b9..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,152 +0,0 @@
-# This file is used to configure your project.
-# Read more about the various options under:
-# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html
-# https://setuptools.pypa.io/en/latest/references/keywords.html
-
-[metadata]
-name = qrmine
-description = Qualitative Research support tools in Python!
-author = beapen
-author_email = github@gulfdoctor.net
-license = GPL-3.0-only
-# license_files = LICENSE.txt
-# long_description = file: README.rst
-# long_description_content_type = text/x-rst; charset=UTF-8
-long_description = file: README.md
-long_description_content_type = text/markdown
-url = https://github.com/dermatologist/nlp-qrmine
-# Add here related links, for example:
-project_urls =
- Documentation = https://arxiv.org/abs/2003.13519
-# Source = https://github.com/pyscaffold/pyscaffold/
-# Changelog = https://pyscaffold.org/en/latest/changelog.html
-# Tracker = https://github.com/pyscaffold/pyscaffold/issues
-# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
-# Download = https://pypi.org/project/PyScaffold/#files
-# Twitter = https://twitter.com/PyScaffold
-
-# Change if running only on Windows, Mac or Linux (comma-separated)
-platforms = any
-
-# Add here all kinds of additional classifiers as defined under
-# https://pypi.org/classifiers/
-classifiers =
- Intended Audience :: Science/Research
- Development Status :: 4 - Beta
- Operating System :: OS Independent
- Programming Language :: Python :: 3.11
- Topic :: Scientific/Engineering :: Information Analysis
-
-
-[options]
-zip_safe = False
-packages = find_namespace:
-include_package_data = True
-package_dir =
- =src
-
-# Require a min/specific Python version (comma-separated conditions)
-# python_requires = >=3.8
-
-# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
-# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
-# new major versions. This works if the required packages follow Semantic Versioning.
-# For more information, check out https://semver.org/.
-install_requires =
- importlib-metadata; python_version<"3.8"
- pandas
- matplotlib
- click
- scikit-learn
- imbalanced-learn
- vaderSentiment
- xgboost
- mlxtend
- spacy
- textacy
- tensorflow<=2.13.1
- tensorflow-io-gcs-filesystem<=0.31.0
-
-[options.packages.find]
-where = src
-exclude =
- tests
-
-[options.extras_require]
-# Add here additional requirements for extra features, to install with:
-# `pip install qrmine[PDF]` like:
-# PDF = ReportLab; RXP
-
-# Add here test requirements (semicolon/line-separated)
-testing =
- setuptools
- pytest
- pytest-cov
-
-[options.entry_points]
-# Add here console scripts like:
-# console_scripts =
-# script_name = qrmine.module:function
-# For example:
-# console_scripts =
-# fibonacci = qrmine.skeleton:run
-# And any other entry points, for example:
-# pyscaffold.cli =
-# awesome = pyscaffoldext.awesome.extension:AwesomeExtension
-console_scripts =
- qrmine = qrmine.main:main_routine
-
-[tool:pytest]
-# Specify command line options as you would do when invoking pytest directly.
-# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
-# in order to write a coverage file that can be read by Jenkins.
-# CAUTION: --cov flags may prohibit setting breakpoints while debugging.
-# Comment those flags to avoid this pytest issue.
-addopts =
- --verbose
-norecursedirs =
- dist
- build
- .tox
-
-[aliases]
-release = sdist bdist_wheel upload
-
-[bdist_wheel]
-# Use this option if your package is pure-python
-universal = 1
-
-[build_sphinx]
-source_dir = docs
-build_dir = docs/_build
-
-testpaths = tests
-# Use pytest markers to select/deselect specific tests
-# markers =
-# slow: mark tests as slow (deselect with '-m "not slow"')
-# system: mark end-to-end system tests
-
-[devpi:upload]
-# Options for the devpi: PyPI server and packaging tool
-# VCS export must be deactivated since we are using setuptools-scm
-no_vcs = 1
-formats = bdist_wheel
-
-[flake8]
-# Some sane defaults for the code style checker flake8
-max_line_length = 88
-extend_ignore = E203, W503
-# ^ Black-compatible
-# E203 and W503 have edge cases handled by black
-exclude =
- .tox
- build
- dist
- .eggs
- docs/conf.py
-
-[pyscaffold]
-# PyScaffold's parameters when the project was created.
-# This will be used when updating. Do not change!
-version = 4.6
-package = qrmine
diff --git a/src/qrmine/__init__.py b/src/qrmine/__init__.py
index 09a4e35..3549721 100644
--- a/src/qrmine/__init__.py
+++ b/src/qrmine/__init__.py
@@ -6,6 +6,8 @@
from .readfiles import ReadData
from .sentiment import Sentiment
from .mlqrmine import MLQRMine
+from .cluster import ClusterDocs
+from .visualize import QRVisualize
if sys.version_info[:2] >= (3, 8):
# TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
new file mode 100644
index 0000000..e67434c
--- /dev/null
+++ b/src/qrmine/cluster.py
@@ -0,0 +1,273 @@
+"""
+Copyright (C) 2025 Bell Eapen
+
+This file is part of qrmine.
+
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with qrmine. If not, see .
+"""
+
+
+import pandas as pd
+import numpy as np
+from gensim import corpora
+from gensim.models.ldamodel import LdaModel
+from gensim.models import Word2Vec
+from sklearn.manifold import TSNE
+from sklearn.cluster import KMeans
+from tabulate import tabulate
+
+from .content import Content
+
+class ClusterDocs:
+
+ def __init__(self, content: Content, documents=None, titles=None):
+ # Avoid mutable default arguments: fall back to fresh empty lists
+ self._content = content
+ self._documents = documents if documents is not None else []
+ self._titles = titles if titles is not None else []
+ self._num_topics = 5
+ self._passes = 15
+ self._dictionary = None
+ self._corpus = None
+ self._lda_model = None
+ # Apply preprocessing to each document
+ self._processed_docs = [self.preprocess(doc) for doc in self._documents]
+ self.process()
+
+ @property
+ def documents(self):
+ return self._documents
+
+ @property
+ def titles(self):
+ return self._titles
+
+ @property
+ def num_topics(self):
+ return self._num_topics
+
+ @property
+ def passes(self):
+ return self._passes
+
+ @property
+ def processed_docs(self):
+ return self._processed_docs
+
+ @property
+ def lda_model(self):
+ return self._lda_model
+
+ @property
+ def corpus(self):
+ return self._corpus
+
+ @documents.setter
+ def documents(self, documents):
+ self._documents = documents
+ self._processed_docs = [self.preprocess(doc) for doc in documents]
+ self.process()
+
+ @titles.setter
+ def titles(self, titles):
+ self._titles = titles
+
+ @num_topics.setter
+ def num_topics(self, num_topics):
+ self._num_topics = num_topics
+
+ @passes.setter
+ def passes(self, passes):
+ self._passes = passes
+
+ # Preprocess the documents using spaCy
+ def preprocess(self, doc):
+ self._content.content = doc
+ return self._content.tokens
+
+ def process(self):
+ # Create a dictionary representation of the documents
+ self._dictionary = corpora.Dictionary(self._processed_docs)
+ # Create a bag-of-words representation of the documents
+ self._corpus = [self._dictionary.doc2bow(doc) for doc in self._processed_docs]
+ # Build the LDA (Latent Dirichlet Allocation) model
+
+ def build_lda_model(self):
+ if self._lda_model is None:
+ self._lda_model = LdaModel(
+ self._corpus,
+ num_topics=self._num_topics,
+ id2word=self._dictionary,
+ passes=self._passes,
+ )
+ return self._lda_model.show_topics(formatted=False)
+
+ def print_topics(self, num_words=5):
+ if self._lda_model is None:
+ self.build_lda_model()
+ # Print the topics and their corresponding words
+ # print(self._lda_model.print_topics(num_words=num_words))
+ output = self._lda_model.print_topics(num_words=num_words)
+ """ Output is like:
+ [(0, '0.116*"category" + 0.093*"comparison" + 0.070*"incident" + 0.060*"theory" + 0.025*"Theory"'), (1, '0.040*"GT" + 0.026*"emerge" + 0.026*"pragmatic" + 0.026*"Barney" + 0.026*"contribution"'), (2, '0.084*"theory" + 0.044*"GT" + 0.044*"evaluation" + 0.024*"structure" + 0.024*"Glaser"'), (3, '0.040*"open" + 0.040*"QRMine" + 0.040*"coding" + 0.040*"category" + 0.027*"researcher"'), (4, '0.073*"coding" + 0.046*"structure" + 0.045*"GT" + 0.042*"Strauss" + 0.038*"Corbin"')]
+ format this into human readable format as below:
+ Topic 0: category(0.116), comparison(0.093), incident(0.070), theory(0.060), Theory(0.025)
+ """
+ print("\nTopics: \n")
+ for topic in output:
+ topic_num = topic[0]
+ topic_words = topic[1]
+ words = []
+ for word in topic_words.split("+"):
+ word = word.split("*")
+ words.append(f"{word[1].strip()}({word[0].strip()})")
+ print(f"Topic {topic_num}: {', '.join(words)}")
+ return output
+
+ def print_clusters(self):
+ if self._lda_model is None:
+ self.build_lda_model()
+ # Perform semantic clustering
+ print("\n Main topic in doc: \n")
+
+ for i, doc in enumerate(
+ self._processed_docs
+ ): # Changed from get_processed_docs() to _documents
+ bow = self._dictionary.doc2bow(doc)
+ print(
+ f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}"
+ )
+
+ def format_topics_sentences(self, visualize=False):
+ self.build_lda_model()
+ # Init output
+ sent_topics_df = pd.DataFrame()
+
+ # Get main topic in each document
+ for i, row_list in enumerate(self._lda_model[self._corpus]):
+ row = row_list[0] if self._lda_model.per_word_topics else row_list
+ # print(row)
+ row = sorted(row, key=lambda x: (x[1]), reverse=True)
+ # Get the Dominant topic, Perc Contribution and Keywords for each document
+ for j, (topic_num, prop_topic) in enumerate(row):
+ if j == 0: # => dominant topic
+ wp = self._lda_model.show_topic(topic_num)
+ topic_keywords = ", ".join([word for word, prop in wp])
+ new_row = pd.DataFrame(
+ [[self._titles[i], int(topic_num), round(prop_topic, 4), topic_keywords]],
+ columns=[
+ "Title",
+ "Dominant_Topic",
+ "Perc_Contribution",
+ "Topic_Keywords",
+ ],
+ )
+ sent_topics_df = pd.concat(
+ [sent_topics_df, new_row], ignore_index=True
+ )
+ else:
+ break
+ sent_topics_df.columns = [
+ "Title",
+ "Dominant_Topic",
+ "Perc_Contribution",
+ "Topic_Keywords",
+ ]
+
+ # Add original text to the end of the output
+ if visualize:
+ contents = pd.Series(self._processed_docs)
+ sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
+ return sent_topics_df.reset_index(drop=False)
+
+ # https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
+ def most_representative_docs(self):
+ sent_topics_df = self.format_topics_sentences()
+ sent_topics_sorteddf_mallet = pd.DataFrame()
+ sent_topics_outdf_grpd = sent_topics_df.groupby("Dominant_Topic")
+
+ for i, grp in sent_topics_outdf_grpd:
+ sent_topics_sorteddf_mallet = pd.concat(
+ [
+ sent_topics_sorteddf_mallet,
+ grp.sort_values(["Perc_Contribution"], ascending=False).head(1),
+ ],
+ axis=0,
+ )
+
+ return sent_topics_sorteddf_mallet
+
+ def topics_per_document(self, start=0, end=1):
+ corpus_sel = self._corpus[start:end]
+ dominant_topics = []
+ topic_percentages = []
+ for i, corp in enumerate(corpus_sel):
+ topic_percs = self._lda_model[corp]
+ dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
+ dominant_topics.append((i, dominant_topic))
+ topic_percentages.append(topic_percs)
+ return (dominant_topics, topic_percentages)
+
+ # Get average embedding vector for each text
+
+ def doc_vectorizer(self, doc, model):
+ doc_vector = []
+ num_words = 0
+ for word in doc:
+ try:
+ if num_words == 0:
+ doc_vector = model.wv[word]
+ else:
+ doc_vector = np.add(doc_vector, model.wv[word])
+ num_words += 1
+ except KeyError:
+ # word not in the Word2Vec vocabulary; skip it
+ pass
+
+ return np.asarray(doc_vector) / num_words
+
+ def vectorizer(self, docs, titles, num_clusters=4, visualize=False):
+ X = []
+ T = []
+ model = Word2Vec(docs, min_count=20, vector_size=50)
+ for index, doc in enumerate(docs):
+ X.append(self.doc_vectorizer(doc, model))
+ T.append(titles[index])
+ print('Averaged text w2v representation:')
+ print(X[0])
+ _X = np.array(X)
+ print(_X.shape)
+ tsne = TSNE(n_components=2, random_state=0)
+ tsne_model = tsne.fit_transform(_X)
+ # Obtain the prediction
+ kmeans = KMeans(n_clusters=num_clusters, random_state=0)
+ y_pred = kmeans.fit(tsne_model).predict(tsne_model)
+ data = pd.DataFrame(
+ np.concatenate([tsne_model, y_pred[:, None]], axis=1),
+ columns=["x", "y", "colour"],
+ )
+ # Add the titles to the DataFrame
+ data["title"] = T
+ if not visualize:
+ print(
+ tabulate(
+ data,
+ headers="keys",
+ tablefmt="psql",
+ showindex=False,
+ numalign="left",
+ stralign="left",
+ )
+ )
+ return data
diff --git a/src/qrmine/content.py b/src/qrmine/content.py
index 3344a80..c67285f 100644
--- a/src/qrmine/content.py
+++ b/src/qrmine/content.py
@@ -1,20 +1,20 @@
"""
- Copyright (C) 2020 Bell Eapen
+Copyright (C) 2020 Bell Eapen
- This file is part of qrmine.
+This file is part of qrmine.
- qrmine is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
- qrmine is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with qrmine. If not, see .
+You should have received a copy of the GNU General Public License
+along with qrmine. If not, see .
"""
import operator
@@ -23,10 +23,15 @@
import textacy
+
class Content(object):
- def __init__(self, content):
+ def __init__(self, content="", title="", lang="en_core_web_sm", max_length=1100000):
self._content = content
- self._nlp = textacy.load_spacy_lang("en_core_web_sm")
+ # TODO, Title is not used
+ self._title = title
+ self._lang = lang
+ self._nlp = textacy.load_spacy_lang(lang)
+ self._nlp.max_length = max_length
self._processed = self._nlp(self._content)
self._lemma = {}
self._pos = {}
@@ -43,17 +48,21 @@ def __init__(self, content):
def content(self):
return self._content
+ @property
+ def title(self):
+ return self._title
+
@content.setter
def content(self, content):
self._content = content
@property
def lemma(self, token):
- return self._lemma.get(token, '')
+ return self._lemma.get(token, "")
@property
def pos(self, token):
- return self._pos.get(token, '')
+ return self._pos.get(token, "")
@property
def pos_(self, token):
@@ -61,7 +70,7 @@ def pos_(self, token):
@property
def word(self, token):
- return self._word.get(token, '')
+ return self._word.get(token, "")
@property
def sentiment(self, token):
@@ -69,7 +78,7 @@ def sentiment(self, token):
@property
def tag(self, token):
- return self._tag.get(token, '')
+ return self._tag.get(token, "")
@property
def dep(self, token):
@@ -87,6 +96,33 @@ def idx(self, token):
def doc(self):
return self._processed
+ @property
+ def tokens(self):
+ return [
+ token.lemma_
+ for token in self._processed
+ if not token.is_stop and not token.is_punct and not token.is_space
+ ]
+
+ @property
+ def lang(self):
+ return self._lang
+
+ @content.setter
+ def content(self, content):
+ self._content = content
+ self._processed = self._nlp(self._content)
+ self._lemma = {}
+ self._pos = {}
+ self._pos_ = {}
+ self._word = {}
+ self._sentiment = {}
+ self._tag = {}
+ self._dep = {}
+ self._prob = {}
+ self._idx = {}
+ self.process()
+
def process(self):
for token in self._processed:
if token.is_stop or token.is_digit or token.is_punct or token.is_space:
@@ -114,14 +150,14 @@ def common_words(self, index=10):
def common_nouns(self, index=10):
_words = {}
for key, value in self._word.items():
- if self._pos.get(key, None) == 'NOUN':
+ if self._pos.get(key, None) == "NOUN":
_words[value] = _words.get(value, 0) + 1
return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index]
def common_verbs(self, index=10):
_words = {}
for key, value in self._word.items():
- if self._pos.get(key, None) == 'VERB':
+ if self._pos.get(key, None) == "VERB":
_words[value] = _words.get(value, 0) + 1
return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index]
@@ -135,7 +171,9 @@ def sentences_with_common_nouns(self, index=10):
for span in self._processed.sents:
# go from the start to the end of each span, returning each token in the sentence
# combine each token using join()
- sent = ''.join(self._processed[i].string for i in range(span.start, span.end)).strip()
+ sent = "".join(
+ self._processed[i].string for i in range(span.start, span.end)
+ ).strip()
for noun, freq in _nouns:
if noun in sent:
sents.append(sent)
@@ -151,7 +189,7 @@ def spans_with_common_nouns(self, word):
# go from the start to the end of each span, returning each token in the sentence
# combine each token using join()
for token in span:
- if word in self._word.get(token, ' '):
+ if word in self._word.get(token, " "):
spans.append(span)
return spans
@@ -160,11 +198,11 @@ def dimensions(self, word, index=3):
_ad = {}
for span in _spans:
for token in span:
- if self._pos.get(token, None) == 'ADJ':
+ if self._pos.get(token, None) == "ADJ":
_ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
- if self._pos.get(token, None) == 'ADV':
+ if self._pos.get(token, None) == "ADV":
_ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
- if self._pos.get(token, None) == 'VERB':
+ if self._pos.get(token, None) == "VERB":
_ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
return sorted(_ad.items(), key=operator.itemgetter(1), reverse=True)[:index]
@@ -173,7 +211,9 @@ def attributes(self, word, index=3):
_ad = {}
for span in _spans:
for token in span:
- if self._pos.get(token, None) == 'NOUN' and word not in self._word.get(token, ''):
+ if self._pos.get(token, None) == "NOUN" and word not in self._word.get(
+ token, ""
+ ):
_ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
# if self._pos.get(token, None) == 'VERB':
# _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
diff --git a/src/qrmine/main.py b/src/qrmine/main.py
index 374c496..c3c6ace 100644
--- a/src/qrmine/main.py
+++ b/src/qrmine/main.py
@@ -2,6 +2,7 @@
import click
import textacy
+from tabulate import tabulate
from . import Content
from . import Network
@@ -9,70 +10,201 @@
from . import ReadData
from . import Sentiment
from . import MLQRMine
+from . import ClusterDocs
+from .visualize import QRVisualize
+from .utils import QRUtils
from . import __version__
+q = Qrmine()
@click.command()
-@click.option('--verbose', '-v', is_flag=True, help="Will print verbose messages.")
-@click.option('--inp', '-i', multiple=True,
- help='Input file in the text format with Topic')
-@click.option('--out', '-o', multiple=False, default='',
- help='Output file name')
-@click.option('--csv', multiple=False, default='',
- help='csv file name')
-@click.option('--num', '-n', multiple=False, default=3,
- help='N (clusters/epochs etc depending on context)')
-@click.option('--rec', '-r', multiple=False, default=3,
- help='Record (based on context)')
-@click.option('--titles', '-t', multiple=True,
- help='Document(s) or csv title(s) to analyze/compare')
-@click.option('--filters', '-f', multiple=True,
- help='Filters to apply')
-@click.option('--codedict', is_flag=True,
- help='Generate coding dictionary')
-@click.option('--topics', is_flag=True,
- help='Generate topic model')
-@click.option('--assign', is_flag=True,
- help='Assign documents to topics')
-@click.option('--cat', is_flag=True,
- help='List categories of entire corpus or individual docs')
-@click.option('--summary', is_flag=True,
- help='Generate summary for entire corpus or individual docs')
-@click.option('--sentiment', is_flag=True,
- help='Generate sentiment score for entire corpus or individual docs')
-@click.option('--sentence', is_flag=True, default=False,
- help='Generate sentence level scores when applicable')
-@click.option('--nlp', is_flag=True,
- help='Generate all NLP reports')
-@click.option('--nnet', is_flag=True,
- help='Display accuracy of a neural network model')
-@click.option('--svm', is_flag=True,
- help='Display confusion matrix from an svm classifier')
-@click.option('--knn', is_flag=True,
- help='Display nearest neighbours')
-@click.option('--kmeans', is_flag=True,
- help='Display KMeans clusters')
-@click.option('--cart', is_flag=True,
- help='Display Association Rules')
-@click.option('--pca', is_flag=True,
- help='Display PCA')
-def cli(verbose, inp, out, csv, num, rec, titles, filters, codedict, topics, assign, cat, summary, sentiment, sentence,
- nlp, nnet,
- svm,
- knn, kmeans, cart, pca):
+@click.option("--verbose", "-v", is_flag=True, help="Will print verbose messages.")
+@click.option(
+ "--covid", "-cf", default="", help="Download COVID narratives from the website"
+)
+@click.option(
+ "--inp",
+ "-i",
+ multiple=False,
+ help="Input file in the text format with Topic",
+)
+@click.option("--out", "-o", multiple=False, default="", help="Output file name")
+@click.option("--csv", multiple=False, default="", help="csv file name")
+@click.option(
+ "--num",
+ "-n",
+ multiple=False,
+ default=3,
+ help="N (clusters/epochs etc depending on context)",
+)
+@click.option(
+ "--rec", "-r", multiple=False, default=3, help="Record (based on context)"
+)
+@click.option(
+ "--titles",
+ "-t",
+ multiple=True,
+ help="Document(s) or csv title(s) to analyze/compare",
+)
+@click.option("--filters", "-f", multiple=True, help="Filters to apply")
+@click.option("--codedict", is_flag=True, help="Generate coding dictionary")
+@click.option("--topics", is_flag=True, help="Generate topic model")
+@click.option("--assign", is_flag=True, help="Assign documents to topics")
+@click.option(
+ "--cat", is_flag=True, help="List categories of entire corpus or individual docs"
+)
+@click.option(
+ "--summary",
+ is_flag=True,
+ help="Generate summary for entire corpus or individual docs",
+)
+@click.option(
+ "--sentiment",
+ is_flag=True,
+ help="Generate sentiment score for entire corpus or individual docs",
+)
+@click.option(
+ "--sentence",
+ is_flag=True,
+ default=False,
+ help="Generate sentence level scores when applicable",
+)
+@click.option("--nlp", is_flag=True, help="Generate all NLP reports")
+@click.option("--nnet", is_flag=True, help="Display accuracy of a neural network model")
+@click.option(
+ "--svm", is_flag=True, help="Display confusion matrix from an svm classifier"
+)
+@click.option("--knn", is_flag=True, help="Display nearest neighbours")
+@click.option("--kmeans", is_flag=True, help="Display KMeans clusters")
+@click.option("--cart", is_flag=True, help="Display Association Rules")
+@click.option("--pca", is_flag=True, help="Display PCA")
+@click.option("--visualize", is_flag=False, help="Visualize words, topics or wordcloud.")
+@click.option("--ignore", is_flag=False, help="Comma separated ignore words")
+def cli(
+ verbose,
+ covid,
+ inp,
+ out,
+ csv,
+ num,
+ rec,
+ titles,
+ filters,
+ codedict,
+ topics,
+ assign,
+ cat,
+ summary,
+ sentiment,
+ sentence,
+ nlp,
+ nnet,
+ svm,
+ knn,
+ kmeans,
+ cart,
+ pca,
+ visualize,
+ ignore,
+):
+ if covid:
+ qr_utils = QRUtils()
+ qr_utils.read_covid_narratives(covid)
+ click.echo("COVID narratives downloaded to " + covid)
data = ReadData()
if inp:
- data.read_file(inp)
+ if ignore:
+ data.read_file(inp, ignore)
+ else:
+ data.read_file(inp)
if len(filters) > 0:
data = filter_data(inp, filters, sentence, num)
if verbose:
click.echo("We are in the verbose mode.")
if out:
- sys.stdout = open(out, 'w')
+ sys.stdout = open(out, "w")
if inp and codedict:
generate_dict(data, num)
+ content = Content(data.content)
+ cluster = ClusterDocs(content)
+ cluster.documents = data.documents
+ cluster.titles = data.titles
if inp and topics:
- generate_topics(data, assign, num)
+ # generate_topics(data, assign, num)
+ click.echo("---------------------------")
+ cluster.print_topics()
+ click.echo("---------------------------")
+ click.echo("Dominant topic and its percentage contribution in each document")
+ topics = cluster.format_topics_sentences()
+ click.echo(
+ tabulate(
+ topics,
+ headers="keys",
+ tablefmt="grid",
+ showindex="never",
+ numalign="left",
+ maxcolwidths=[10, 10, 10, 50],
+ )
+ )
+ click.echo("Most representative document for each topic")
+ most_representative_docs = cluster.most_representative_docs()
+ click.echo(
+ tabulate(
+ most_representative_docs,
+ headers="keys",
+ tablefmt="grid",
+ showindex="never",
+ numalign="left",
+ maxcolwidths=[10, 10, 10, 50],
+ )
+ )
+ if visualize:
+ _data = cluster.format_topics_sentences(visualize=True)
+ _topics = cluster.build_lda_model()
+ _processed_docs = cluster.processed_docs
+ _lda_model = cluster.lda_model
+ _corpus = cluster.corpus
+ match visualize:
+ case "wordcloud":
+ v = QRVisualize(data)
+ v.plot_wordcloud(topics=_topics, folder_path=out)
+ case "topics":
+ v = QRVisualize(_data)
+ v.plot_distribution_by_topic(
+ _data, folder_path=out
+ )
+ case "words":
+ v = QRVisualize(_data)
+ v.plot_frequency_distribution_of_words(folder_path=out)
+ case "importance":
+ v = QRVisualize(_data)
+ v.plot_importance(topics=_topics, processed_docs=_processed_docs, folder_path=out)
+ case "sentence":
+ v = QRVisualize(_data)
+ v.sentence_chart(
+ _lda_model, _corpus, folder_path=out
+ )
+ # case "cluster":
+ # v = QRVisualize(_data)
+ # if num:
+ # v.cluster_chart(
+ # _lda_model, _corpus, num, folder_path=out
+ # )
+ # else:
+ # v.cluster_chart(
+ # _lda_model, _corpus, folder_path=out
+ # )
+ case "cluster":
+ v = QRVisualize(_data)
+ for doc in data.documents:
+ print(doc+ "\n")
+ vectors = cluster.vectorizer(data.documents, data.titles, visualize=True)
+ v.cluster_chart(
+ vectors, folder_path=out
+ )
+
+
+
# if inp and assign:
# assign_topics(data)
if inp and cat:
@@ -81,7 +213,9 @@ def cli(verbose, inp, out, csv, num, rec, titles, filters, codedict, topics, ass
generate_summary(data, titles)
if inp and sentiment:
get_sentiment(data, titles, sentence, verbose)
- if inp and cart: #python qrminer.py --cart -i src/qrmine/resources/interview.txt -n 10
+ if (
+ inp and cart
+ ): # python qrminer.py --cart -i src/qrmine/resources/interview.txt -n 10
get_categories_association(data, num)
if inp and nlp:
main(inp)
@@ -128,20 +262,20 @@ def filter_data(inp, search, sentence, num):
filters = []
for s in search:
- if s == 'pos':
+ if s == "pos":
for title in data.titles:
t = [title]
- if get_sentiment(data, t, sentence, False) == 'pos':
+ if get_sentiment(data, t, sentence, False) == "pos":
filters.append(title)
- if s == 'neg':
+ if s == "neg":
for title in data.titles:
t = [title]
- if get_sentiment(data, t, sentence, False) == 'neg':
+ if get_sentiment(data, t, sentence, False) == "neg":
filters.append(title)
- if s == 'neu':
+ if s == "neu":
for title in data.titles:
t = [title]
- if get_sentiment(data, t, sentence, False) == 'neu':
+ if get_sentiment(data, t, sentence, False) == "neu":
filters.append(title)
# If search itself is a title
if any(s in l for l in data.titles):
@@ -173,13 +307,13 @@ def filter_data(inp, search, sentence, num):
def generate_dict(data, num):
if not num:
num = 10
- q = Qrmine()
+
all_interviews = Content(data.content)
q.print_dict(all_interviews, num)
def generate_topics(data, assign, num):
- q = Qrmine()
+
q.content = data
q.process_content()
q.print_topics()
@@ -188,25 +322,26 @@ def generate_topics(data, assign, num):
# def assign_topics(data):
-# q = Qrmine()
+#
# q.content = data
# q.process_content()
# q.print_documents()
+
def get_categories_association(data, num):
- q = Qrmine()
+
q.content = data
click.echo(q.category_association(num))
click.echo("Frequent Itemsets")
click.echo("---------------------------")
+
"""
Function working at both levels
"""
def generate_categories(data, tags, num):
- q = Qrmine()
if len(tags) > 0:
ct = 0
@@ -269,7 +404,9 @@ def get_sentiment(data, tags, sentence, verbose):
if len(sentence) > 3:
sent = s.sentiment_analyzer_scores(sentence.text)
if verbose:
- click.echo("{:-<40} {}\n".format(sent["sentence"], str(sent["score"])))
+ click.echo(
+ "{:-<40} {}\n".format(sent["sentence"], str(sent["score"]))
+ )
click.echo(s.sentiment())
else:
@@ -280,7 +417,7 @@ def get_sentiment(data, tags, sentence, verbose):
return s.sentiment()
else:
all_interviews = Content(data.content)
- doc = textacy.make_spacy_doc(all_interviews.doc)
+ doc = textacy.make_spacy_doc(all_interviews.doc, lang=all_interviews.lang)
## Sentiment
s = Sentiment()
@@ -289,7 +426,9 @@ def get_sentiment(data, tags, sentence, verbose):
if len(sentence) > 3:
sent = s.sentiment_analyzer_scores(sentence.text)
if verbose:
- click.echo("{:-<40} {}\n".format(sent["sentence"], str(sent["score"])))
+ click.echo(
+ "{:-<40} {}\n".format(sent["sentence"], str(sent["score"]))
+ )
click.echo(s.sentiment())
else:
@@ -309,7 +448,9 @@ def get_nnet(ml, n=3):
ml.epochs = n
ml.prepare_data(True) # Oversample
ml.get_nnet_predictions()
- click.echo("\n%s: %.2f%%" % (ml.model.metrics_names[1], ml.get_nnet_scores()[1] * 100))
+ click.echo(
+ "\n%s: %.2f%%" % (ml.model.metrics_names[1], ml.get_nnet_scores()[1] * 100)
+ )
def get_svm(ml):
@@ -348,7 +489,6 @@ def main(input_file):
data = ReadData()
data.read_file(input_file)
- q = Qrmine()
all_interviews = Content(data.content)
q.content = data
@@ -367,7 +507,11 @@ def main(input_file):
x.append(sentence.text)
sent = s.sentiment_analyzer_scores(sentence.text)
click.echo("{:-<40} {}\n".format(sent["sentence"], str(sent["score"])))
- click.echo("{:-<40} {}\n".format(sentence.text, str(s.similarity(sentence.text, "Dummy sentence"))))
+ click.echo(
+ "{:-<40} {}\n".format(
+ sentence.text, str(s.similarity(sentence.text, "Dummy sentence"))
+ )
+ )
## Network
n = Network()
@@ -389,5 +533,5 @@ def main_routine():
cli() # run the main function
-if __name__ == '__main__':
+if __name__ == "__main__":
main_routine()
diff --git a/src/qrmine/mlqrmine.py b/src/qrmine/mlqrmine.py
index 12b75a3..6bcef33 100644
--- a/src/qrmine/mlqrmine.py
+++ b/src/qrmine/mlqrmine.py
@@ -1,21 +1,49 @@
import numpy
-from imblearn.over_sampling import RandomOverSampler
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense
-from numpy import random, argsort, sqrt, array, ones
from pandas import read_csv
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KDTree
from random import randint
+import logging
+logger = logging.getLogger(__name__)
+ML_INSTALLED = False
+
+try:
+ from xgboost import XGBClassifier
+ from mlxtend.frequent_patterns import apriori
+ from mlxtend.frequent_patterns import association_rules
+
+ import torch.nn as nn
+ import torch.optim as optim
+ import torch
+ from torch.utils.data import DataLoader, TensorDataset
+ from imblearn.over_sampling import RandomOverSampler
+
+ ML_INSTALLED = True
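+ # Small feed-forward binary classifier: input -> 12 -> 8 -> 1, ReLU hidden layers, sigmoid output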
+ class NeuralNet(nn.Module):
+ def __init__(self, input_dim):
+ super(NeuralNet, self).__init__()
+ self.fc1 = nn.Linear(input_dim, 12)
+ self.fc2 = nn.Linear(12, 8)
+ self.fc3 = nn.Linear(8, 1)
+ self.relu = nn.ReLU()
+ self.sigmoid = nn.Sigmoid()
+
+ def forward(self, x):
+ x = self.relu(self.fc1(x))
+ x = self.relu(self.fc2(x))
+ x = self.sigmoid(self.fc3(x))
+ return x
+except ImportError:
+ logger.info(
+ "ML dependencies are not installed. Please install them by ```pip install qrmine[ml] to use ML features."
+ )
+
-from xgboost import XGBClassifier
-from mlxtend.frequent_patterns import apriori
-from mlxtend.frequent_patterns import association_rules
class MLQRMine(object):
@@ -24,17 +52,18 @@ def __init__(self):
self._seed = randint(1, 9)
self._csvfile = ""
self._titles = None
+ self._model = None
self._dataset = None
self._X = None
self._y = None
self._X_original = None
self._y_original = None
self._dataset_original = None
- self._model = Sequential()
self._sc = StandardScaler()
self._vnum = 0 # Number of variables
- self._classifier = XGBClassifier()
- self._epochs = 10
+ if ML_INSTALLED:
+ self._classifier = XGBClassifier()
+ self._epochs = 1
self._samplesize = 0
self._clusters = None
@@ -130,7 +159,11 @@ def read_xy(self):
def oversample(self):
self._X_original = self._X
self._y_original = self._y
- ros = RandomOverSampler(random_state=0)
+ if ML_INSTALLED:
+ ros = RandomOverSampler(random_state=0)
+ else:
+ logger.info("ML dependencies are not installed. Please install them by ```pip install qrmine[ml] to use ML features.")
+ raise ImportError("ML dependencies are not installed. Please install them by ```pip install qrmine[ml] to use ML features.")
X, y = ros.fit_resample(self._X, self._y)
self._X = X
self._y = y
@@ -147,22 +180,58 @@ def prepare_data(self, oversample=False):
self.oversample()
def get_nnet_predictions(self):
- self._model.add(Dense(12, input_dim=self._vnum, kernel_initializer='uniform', activation='relu'))
- self._model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
- self._model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
- # Compile model
- self._model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
- # Fit the model
- self._model.fit(self._X, self._y, epochs=self._epochs, batch_size=10, verbose=2)
-
- # calculate predictions
- predictions = self._model.predict(self._X_original)
- # round predictions
- rounded = [round(x[0]) for x in predictions]
+
+ self._model = NeuralNet(self._vnum)
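+ # Binary cross-entropy loss on the sigmoid output, optimised with Adam (lr=0.001)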
+ criterion = nn.BCELoss()
+ optimizer = optim.Adam(self._model.parameters(), lr=0.001)
+
+ # Convert data to PyTorch tensors
+ X_tensor = torch.tensor(self._X, dtype=torch.float32)
+ y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+
+ # Create a dataset and data loader
+ dataset = TensorDataset(X_tensor, y_tensor)
+ dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+
+ # Train the model
+ for epoch in range(self._epochs):
+ for batch_X, batch_y in dataloader:
+ optimizer.zero_grad()
+ outputs = self._model(batch_X)
+ loss = criterion(outputs, batch_y)
+ loss.backward()
+ optimizer.step()
+
+ # Calculate predictions
+ with torch.no_grad():
+ predictions = self._model(torch.tensor(self._X_original, dtype=torch.float32))
+ rounded = [round(x.item()) for x in predictions]
+ # print("Predictions: ", rounded)
+ # Calculate accuracy
+ correct = sum([1 for i in range(len(rounded)) if rounded[i] == self._y_original[i]])
+ total = len(rounded)
+ accuracy = correct / total
+ print(f'Accuracy: {accuracy * 100:.2f}%')
return rounded
def get_nnet_scores(self):
- return self._model.evaluate(self._X, self._y)
+ # evaluate the PyTorch model and return the overall accuracy
+ self._model.eval()
+ X_tensor = torch.tensor(self._X, dtype=torch.float32)
+ y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+ dataset = TensorDataset(X_tensor, y_tensor)
+ dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+ correct = 0
+ total = 0
+ with torch.no_grad():
+ for batch_X, batch_y in dataloader:
+ outputs = self._model(batch_X)
+ predicted = (outputs > 0.5).float()
+ total += batch_y.size(0)
+ correct += (predicted == batch_y).sum().item()
+ accuracy = correct / total
+ print(f'Accuracy: {accuracy * 100:.2f}%')
+ return accuracy
def svm_confusion_matrix(self):
"""Generate confusion matrix for SVM
@@ -211,7 +280,6 @@ def get_centroids(self, c=1):
print("Mean")
print(self._dataset.iloc[cluster_list, :].mean(axis=0))
-
"""
TODO: This is not working yet.
use the ColumnTransformer instead of categorical_features
diff --git a/src/qrmine/nlp_qrmine.py b/src/qrmine/nlp_qrmine.py
index 44209c9..0d9b019 100644
--- a/src/qrmine/nlp_qrmine.py
+++ b/src/qrmine/nlp_qrmine.py
@@ -85,7 +85,7 @@ def category_basket(self, num=10):
item_basket = []
for index, title in enumerate(self._content.titles):  # Qrmine's content should be set
content = self._content.documents[index]
- this_record = Content(content)
+ this_record = Content(content, title)
doc = textacy.make_spacy_doc(this_record.doc, lang=self._en)
item_basket.append(self.print_categories(doc, num))
return item_basket
@@ -115,7 +115,6 @@ def category_association(self, num=10):
# 1 0.833333 (theory)
# 2 0.666667 (theory, GT)
-
def unique(self,list1):
# insert the list to the set
@@ -170,17 +169,21 @@ def print_documents(self, top_n=2):
print(self._corpus.docs[doc_idx]._.meta["title"], ':', topics)
print("---------------------------\n")
- def print_dict(self, content, num=10):
+ def print_dict(self, content, num=10, top_n=5):
output = []
print("\n---Coding Dictionary---")
output.append(("CATEGORY", "PROPERTY", "DIMENSION"))
words = content.common_verbs(num)
+ _words = []
+ for word, f1 in words:
+ _words.append(word)
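+ # keep the category (verb) words so that dimensions repeating a category can be skipped below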
for word, f1 in words:
- for attribute, f2 in content.attributes(word, 3):
- for dimension, f3 in content.dimensions(attribute, 3):
- output.append((word, attribute, dimension))
- word = '...'
- attribute = '...'
+ for attribute, f2 in content.attributes(word, top_n):
+ for dimension, f3 in content.dimensions(attribute, top_n):
+ if dimension not in _words:
+ output.append((word, attribute, dimension))
+ word = '...'
+ attribute = '...'
self.print_table(output)
print("---------------------------\n")
@@ -195,7 +198,7 @@ def process_content(self):
metadata['title'] = 'Empty'
# self._corpus.add_text(textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True),
# metadata=metadata)
- #doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True)
+ # doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True)
# 2-Jan-2020 textacy new version, breaking change
# replace numbers with NUM, remove punct and convert to lower case
@@ -216,7 +219,7 @@ def filter_content(self, titles):
# self._corpus.add_text(
# textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True),
# metadata=metadata)
- #doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True)
+ # doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True)
# doc_text = preprocessing.replace.replace_numbers(preprocessing.remove.remove_punctuation(document), 'NUM').lower()
doc_text = preprocessing.replace.numbers(preprocessing.remove.punctuation(document)).lower()
doc = textacy.make_spacy_doc((doc_text, metadata), lang=self._en)
diff --git a/src/qrmine/readfiles.py b/src/qrmine/readfiles.py
index a460795..187c201 100644
--- a/src/qrmine/readfiles.py
+++ b/src/qrmine/readfiles.py
@@ -1,4 +1,6 @@
import re
+import requests
+from pypdf import PdfReader
class ReadData(object):
@@ -37,48 +39,96 @@ def append(self, title, document):
self._documents.append(document)
self._content += document
- def read_file(self, file_names):
- if len(file_names) > 1:
- for file_name in file_names:
- with open(file_name, 'r') as f:
- read_from_file = f.read()
- self._content = re.sub('<[^<]+?>', '', read_from_file)
- self._documents = re.split('.*?', read_from_file)
- # Delete the last blank record
- del self._documents[-1]
- pattern = r"(.*?)"
- _title = re.findall(pattern, read_from_file, flags=re.DOTALL)[0]
- self._titles.append(_title)
- f.close()
- else:
- file_name = file_names[0]
- with open(file_name, 'r') as f:
+ def read_file(self, input, comma_separated_ignore_words=None):
+ # if input is a file name
+ if input.endswith(".txt"):
+ with open(input, "r") as f:
read_from_file = f.read()
- self._content = re.sub('<[^<]+?>', '', read_from_file)
- self._documents = re.split('.*?', read_from_file)
+ # remove comma separated ignore words
+ if comma_separated_ignore_words:
+ for word in comma_separated_ignore_words.split(","):
+ read_from_file = re.sub(
+ r"\b" + word.strip() + r"\b",
+ "",
+ read_from_file,
+ flags=re.IGNORECASE,
+ )
+ self._content = re.sub("<[^<]+?>", "", read_from_file)
+ self._documents = re.split(".*?", read_from_file)
# Delete the last blank record
del self._documents[-1]
pattern = r"(.*?)"
self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL)
+ # if input is a folder name
+ elif input.endswith("/"):
+ import os
- """
- Combine duplicate topics using Dict
- Currently supported only for single file.
- """
+ for file_name in os.listdir(input):
+ if file_name.endswith(".txt"):
+ with open(os.path.join(input, file_name), "r") as f:
+ read_from_file = f.read()
+ # remove comma separated ignore words
+ if comma_separated_ignore_words:
+ for word in comma_separated_ignore_words.split(","):
+ read_from_file = re.sub(
+ r"\b" + word.strip() + r"\b",
+ "",
+ read_from_file,
+ flags=re.IGNORECASE,
+ )
+ self._content += read_from_file
+ self._documents.append(read_from_file)
+ self.titles.append(file_name)
+ if file_name.endswith(".pdf"):
+ with open(os.path.join(input, file_name), "rb") as f:
+ reader = PdfReader(f)
+ read_from_file = ""
+ for page in reader.pages:
+ read_from_file += page.extract_text()
+ # remove comma separated ignore words
+ if comma_separated_ignore_words:
+ for word in comma_separated_ignore_words.split(","):
+ read_from_file = re.sub(
+ r"\b" + word.strip() + r"\b", "", read_from_file, flags=re.IGNORECASE,
+ )
+ self._content += read_from_file
+ self._documents.append(read_from_file)
+ self.titles.append(file_name)
+ # if input is a url
+ elif input.startswith("http://") or input.startswith("https://"):
+ response = requests.get(input)
+ if response.status_code == 200:
+ read_from_file = response.text
+ # remove comma separated ignore words
+ if comma_separated_ignore_words:
+ for word in comma_separated_ignore_words.split(","):
+ read_from_file = re.sub(
+ r"\b" + word.strip() + r"\b",
+ "",
+ read_from_file,
+ flags=re.IGNORECASE,
+ )
+ self._content = read_from_file
+ self._documents.append(read_from_file)
+ self.titles.append(input)
+ else:
+ raise ValueError("Input must be a file name, folder name or url.")
- doc_dict = {}
- ct3 = 0
- for t in self._titles:
- doc = doc_dict.get(t)
- if doc:
- doc_dict[t] = doc + self._documents[ct3]
- else:
- doc_dict[t] = self._documents[ct3]
- ct3 += 1
- self._titles.clear()
- self._documents.clear()
- for t in doc_dict.keys():
- self._documents.append(doc_dict.get(t))
- self._titles.append(t)
+ """
+ Combine duplicate topics using Dict
+ """
- f.close()
+ doc_dict = {}
+ ct3 = 0
+ for t in self._titles:
+ doc = doc_dict.get(t)
+ if doc:
+ doc_dict[t] = doc + self._documents[ct3]
+ else:
+ doc_dict[t] = self._documents[ct3]
+ ct3 += 1
+ self._titles.clear()
+ self._documents.clear()
+ for t in doc_dict.keys():
+ self._documents.append(doc_dict.get(t))
+ self._titles.append(t)
diff --git a/src/qrmine/resources/df_dominant_topic.csv b/src/qrmine/resources/df_dominant_topic.csv
new file mode 100644
index 0000000..115eb63
--- /dev/null
+++ b/src/qrmine/resources/df_dominant_topic.csv
@@ -0,0 +1,12 @@
+,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
+0,0,4,0.9903,"., GT, Strauss, ,, coding,
+, ), Theory, seminal, (","['ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']"
+1,1,1,0.7811,",, theory, ., GT, evaluation, structure, coding,
+, ), (","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']"
+2,2,1,0.9783,",, theory, ., GT, evaluation, structure, coding,
+, ), (","['\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n']"
+3,3,3,0.9952,"., ,, coding, category, open, QRMine, datum, researcher, code, GT","['\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n']"
+4,4,4,0.9793,"., GT, Strauss, ,, coding,
+, ), Theory, seminal, (","['\n', 'ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n']"
+5,5,2,0.9712,"category, comparison, incident, ,,
+, involve, refine, identify, emergence, constant","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n']"
diff --git a/src/qrmine/utils.py b/src/qrmine/utils.py
new file mode 100644
index 0000000..4d6776f
--- /dev/null
+++ b/src/qrmine/utils.py
@@ -0,0 +1,40 @@
+import re
+import requests
+import os
+
+
+class QRUtils(object):
+ def __init__(self):
+ pass
+
+ @staticmethod
+ def read_covid_narratives(output_folder):
+ os.makedirs(output_folder, exist_ok=True)
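+ # Walk item pages 1-114 on the covidstories Omeka site and download the attached file from each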
+ for doc_count in range(1, 115):
+ url = f"https://covidstories.omeka.net/items/show/{doc_count}"
+ html = requests.get(url).text
+ # Extract the '<a href="...">' link to the attached file
+ # NOTE: assumed pattern for Omeka '/files/original/...' download links; adjust if the site markup differs
+ pattern = r'href="(https?://[^"]+/files/original/[^"]+)"'
+ # find first match
+ match = re.search(pattern, html)
+ if match:
+ # Extract the URL
+ file_url = match.group(1)
+ # sanitize the URL
+ file_url = file_url.replace("&amp;", "&")  # decode HTML-escaped ampersands
+ print(f"Downloading file from {file_url}")
+ # Download the file
+ response = requests.get(file_url)
+ # Save the file to the output folder
+ with open(
+ os.path.join(output_folder, f"doc_{doc_count}.pdf"), "wb"
+ ) as f:
+ f.write(response.content)
+ else:
+ print(f"No match found for document {doc_count}")
+
+
+if __name__ == "__main__":
+ # Example usage
+ qr_utils = QRUtils()
+ qr_utils.read_covid_narratives("/tmp/covid_narratives")
diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
new file mode 100644
index 0000000..729d30b
--- /dev/null
+++ b/src/qrmine/visualize.py
@@ -0,0 +1,450 @@
+"""
+Copyright (C) 2025 Bell Eapen
+
+This file is part of qrmine.
+
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with qrmine. If not, see <https://www.gnu.org/licenses/>.
+"""
+
+from collections import Counter
+
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib.patches import Rectangle
+from matplotlib.ticker import FuncFormatter
+from sklearn.manifold import TSNE
+from wordcloud import STOPWORDS, WordCloud
+
+
+class QRVisualize:
+ def __init__(self, data: pd.DataFrame = None):
+ """
+ Initialize the QRVisualize class with a DataFrame.
+
+ Parameters:
+ data (pd.DataFrame): The DataFrame containing the data to visualize.
+ """
+ self.data = data
+
+ def plot_frequency_distribution_of_words(self, df=None, folder_path=None):
+ if df is None:
+ df = self.data
+ doc_lens = [len(d) for d in df.Text]
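+ # doc_lens holds len() of each Text entry (token count if Text is a list, character count for plain strings)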
+
+ # Plot
+ plt.figure(figsize=(16, 7), dpi=160)
+ plt.hist(doc_lens, bins=1000, color="navy")
+ plt.text(750, 100, "Mean : " + str(round(np.mean(doc_lens))))
+ plt.text(750, 90, "Median : " + str(round(np.median(doc_lens))))
+ plt.text(750, 80, "Stdev : " + str(round(np.std(doc_lens))))
+ plt.text(750, 70, "1%ile : " + str(round(np.quantile(doc_lens, q=0.01))))
+ plt.text(750, 60, "99%ile : " + str(round(np.quantile(doc_lens, q=0.99))))
+
+ plt.gca().set(
+ xlim=(0, 1000), ylabel="Number of Documents", xlabel="Document Word Count"
+ )
+ plt.tick_params(size=16)
+ plt.xticks(np.linspace(0, 1000, 9))
+ plt.title("Distribution of Document Word Counts", fontdict=dict(size=22))
+ plt.show(block=False)
+ # save
+ if folder_path:
+ plt.savefig(folder_path)
+ plt.close()
+
+ def plot_distribution_by_topic(self, df=None, folder_path=None):
+ if df is None:
+ df = self.data
+ # Plot
+ cols = [
+ color for name, color in mcolors.TABLEAU_COLORS.items()
+ ] # more colors: 'mcolors.XKCD_COLORS'
+
+ fig, axes = plt.subplots(
+ 2, 2, figsize=(16, 14), dpi=160, sharex=True, sharey=True
+ )
+
+ for i, ax in enumerate(axes.flatten()):
+ df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :]
+ doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
+ ax.hist(doc_lens, bins=1000, color=cols[i])
+ ax.tick_params(axis="y", labelcolor=cols[i], color=cols[i])
+ sns.kdeplot(
+ doc_lens, color="black", fill=False, ax=ax.twinx(), warn_singular=False
+ )
+ ax.set(xlim=(0, 1000), xlabel="Document Word Count")
+ ax.set_ylabel("Number of Documents", color=cols[i])
+ ax.set_title("Topic: " + str(i), fontdict=dict(size=16, color=cols[i]))
+
+ fig.tight_layout()
+ fig.subplots_adjust(top=0.90)
+ plt.xticks(np.linspace(0, 1000, 9))
+ fig.suptitle(
+ "Distribution of Document Word Counts by Dominant Topic", fontsize=22
+ )
+ plt.show(block=False)
+ # save
+ if folder_path:
+ plt.savefig(folder_path)
+ plt.close()
+
+ def plot_wordcloud(self, topics=None, folder_path=None):
+ cols = [
+ color for name, color in mcolors.TABLEAU_COLORS.items()
+ ] # more colors: 'mcolors.XKCD_COLORS'
+
+ cloud = WordCloud(
+ stopwords=STOPWORDS,
+ background_color="white",
+ width=250,
+ height=180,
+ max_words=5,
+ colormap="tab10",
+ color_func=lambda *args, **kwargs: cols[i],
+ prefer_horizontal=1.0,
+ )
+
+ fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)
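+ # color_func above binds the loop variable i lazily; the cloud is regenerated inside the loop, so each topic's words take that topic's colour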
+
+ for i, ax in enumerate(axes.flatten()):
+ fig.add_subplot(ax)
+ topic_words = dict(topics[i][1])
+ cloud.generate_from_frequencies(topic_words, max_font_size=300)
+ plt.gca().imshow(cloud)
+ plt.gca().set_title("Topic " + str(i), fontdict=dict(size=16))
+ plt.gca().axis("off")
+
+ plt.subplots_adjust(wspace=0, hspace=0)
+ plt.axis("off")
+ plt.margins(x=0, y=0)
+ plt.tight_layout()
+ plt.show(block=False)
+ # save
+ if folder_path:
+ plt.savefig(folder_path)
+ plt.close()
+
+ def plot_importance(self, topics=None, processed_docs=None, folder_path=None):
+ data_flat = [w for w_list in processed_docs for w in w_list]
+ counter = Counter(data_flat)
+
+ out = []
+ for i, topic in topics:
+ for word, weight in topic:
+ out.append([word, i, weight, counter[word]])
+
+ df = pd.DataFrame(out, columns=["word", "topic_id", "importance", "word_count"])
+
+ # Plot Word Count and Weights of Topic Keywords
+ fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160)
+ cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
+ for i, ax in enumerate(axes.flatten()):
+ ax.bar(
+ x="word",
+ height="word_count",
+ data=df.loc[df.topic_id == i, :],
+ color=cols[i],
+ width=0.5,
+ alpha=0.3,
+ label="Word Count",
+ )
+ ax_twin = ax.twinx()
+ ax_twin.bar(
+ x="word",
+ height="importance",
+ data=df.loc[df.topic_id == i, :],
+ color=cols[i],
+ width=0.2,
+ label="Weights",
+ )
+ ax.set_ylabel("Word Count", color=cols[i])
+ ax_twin.set_ylim(0, 0.030)
+ ax.set_ylim(0, 3500)
+ ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16)
+ ax.tick_params(axis="y", left=False)
+ ax.set_xticklabels(
+ df.loc[df.topic_id == i, "word"],
+ rotation=30,
+ horizontalalignment="right",
+ )
+ ax.legend(loc="upper left")
+ ax_twin.legend(loc="upper right")
+
+ fig.tight_layout(w_pad=2)
+ fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05)
+ plt.show(block=False)
+ # save
+ if folder_path:
+ plt.savefig(folder_path)
+ plt.close()
+
+ def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13, folder_path=None):
+ if lda_model is None:
+ raise ValueError("LDA model is not provided.")
+ corp = corpus[start:end]
+ mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()]
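+ # One row per document: each word is coloured by its dominant topic and the row is framed in the document's top topic colour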
+
+ fig, axes = plt.subplots(
+ end - start, 1, figsize=(20, (end - start) * 0.95), dpi=160
+ )
+ axes[0].axis("off")
+ for i, ax in enumerate(axes):
+ try:
+ if i > 0:
+ corp_cur = corp[i - 1]
+ topic_percs, wordid_topics, _ = lda_model[corp_cur]
+ word_dominanttopic = [
+ (lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics
+ ]
+ ax.text(
+ 0.01,
+ 0.5,
+ "Doc " + str(i - 1) + ": ",
+ verticalalignment="center",
+ fontsize=16,
+ color="black",
+ transform=ax.transAxes,
+ fontweight=700,
+ )
+
+ # Draw Rectange
+ topic_percs_sorted = sorted(
+ topic_percs, key=lambda x: (x[1]), reverse=True
+ )
+ ax.add_patch(
+ Rectangle(
+ (0.0, 0.05),
+ 0.99,
+ 0.90,
+ fill=None,
+ alpha=1,
+ color=mycolors[topic_percs_sorted[0][0]],
+ linewidth=2,
+ )
+ )
+
+ word_pos = 0.06
+ for j, (word, topics) in enumerate(word_dominanttopic):
+ if j < 14:
+ ax.text(
+ word_pos,
+ 0.5,
+ word,
+ horizontalalignment="left",
+ verticalalignment="center",
+ fontsize=16,
+ color=mycolors[topics],
+ transform=ax.transAxes,
+ fontweight=700,
+ )
+ word_pos += 0.009 * len(
+ word
+ ) # to move the word for the next iter
+ ax.axis("off")
+ ax.text(
+ word_pos,
+ 0.5,
+ ". . .",
+ horizontalalignment="left",
+ verticalalignment="center",
+ fontsize=16,
+ color="black",
+ transform=ax.transAxes,
+ )
+ except Exception:
+ continue
+
+ plt.subplots_adjust(wspace=0, hspace=0)
+ plt.suptitle(
+ "Sentence Topic Coloring for Documents: "
+ + str(start)
+ + " to "
+ + str(end - 2),
+ fontsize=22,
+ y=0.95,
+ fontweight=700,
+ )
+ plt.tight_layout()
+ plt.show(block=False)
+ # save
+ if folder_path:
+ plt.savefig(folder_path)
+ plt.close()
+
+ def _cluster_chart(self, lda_model=None, corpus=None, n_topics=3, folder_path=None):
+ # Get topic weights
+ topic_weights = []
+ for i, row_list in enumerate(lda_model[corpus]):
+ topic_weights.append([w for i, w in row_list[0]])
+
+ # Array of topic weights
+ arr = pd.DataFrame(topic_weights).fillna(0).values
+
+ # Keep the well separated points (optional)
+ arr = arr[np.amax(arr, axis=1) > 0.35]
+
+ # Dominant topic number in each doc
+ topic_num = np.argmax(arr, axis=1)
+
+ # tSNE Dimension Reduction
+ tsne_model = TSNE(
+ n_components=2, verbose=1, random_state=0, angle=0.99, init="pca"
+ )
+ tsne_lda = tsne_model.fit_transform(arr)
+
+ # Plot
+ plt.figure(figsize=(16, 10), dpi=160)
+ for i in range(n_topics):
+ plt.scatter(
+ tsne_lda[topic_num == i, 0],
+ tsne_lda[topic_num == i, 1],
+ label=str(i),
+ alpha=0.5,
+ )
+ plt.title("t-SNE Clustering of Topics", fontsize=22)
+ plt.xlabel("t-SNE Dimension 1", fontsize=16)
+ plt.ylabel("t-SNE Dimension 2", fontsize=16)
+ plt.legend(title="Topic Number", loc="upper right")
+ plt.show(block=False)
+ # save
+ if folder_path:
+ plt.savefig(folder_path)
+ plt.close()
+
+ def most_discussed_topics(
+ self, lda_model, dominant_topics, topic_percentages, folder_path=None
+ ):
+
+ # Distribution of Dominant Topics in Each Document
+ df = pd.DataFrame(dominant_topics, columns=["Document_Id", "Dominant_Topic"])
+ dominant_topic_in_each_doc = df.groupby("Dominant_Topic").size()
+ df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(
+ name="count"
+ ).reset_index()
+
+ # Total Topic Distribution by actual weight
+ topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
+ df_topic_weightage_by_doc = (
+ topic_weightage_by_doc.sum().to_frame(name="count").reset_index()
+ )
+
+ # Top 3 Keywords for each Topic
+ topic_top3words = [
+ (i, topic)
+ for i, topics in lda_model.show_topics(formatted=False)
+ for j, (topic, wt) in enumerate(topics)
+ if j < 3
+ ]
+
+ df_top3words_stacked = pd.DataFrame(
+ topic_top3words, columns=["topic_id", "words"]
+ )
+ df_top3words = df_top3words_stacked.groupby("topic_id").agg(", \n".join)
+ df_top3words.reset_index(level=0, inplace=True)
+
+ # Plot
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True)
+
+ # Topic Distribution by Dominant Topics
+ ax1.bar(
+ x="Dominant_Topic",
+ height="count",
+ data=df_dominant_topic_in_each_doc,
+ width=0.5,
+ color="firebrick",
+ )
+ ax1.set_xticks(
+ range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__())
+ )
+ tick_formatter = FuncFormatter(
+ lambda x, pos: "Topic "
+ + str(x)
+ + "\n"
+ + df_top3words.loc[df_top3words.topic_id == x, "words"].values[0]
+ )
+ ax1.xaxis.set_major_formatter(tick_formatter)
+ ax1.set_title("Number of Documents by Dominant Topic", fontdict=dict(size=10))
+ ax1.set_ylabel("Number of Documents")
+ ax1.set_ylim(0, 1000)
+
+ # Topic Distribution by Topic Weights
+ ax2.bar(
+ x="index",
+ height="count",
+ data=df_topic_weightage_by_doc,
+ width=0.5,
+ color="steelblue",
+ )
+ ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__()))
+ ax2.xaxis.set_major_formatter(tick_formatter)
+ ax2.set_title("Number of Documents by Topic Weightage", fontdict=dict(size=10))
+
+ plt.show(block=False)
+
+ # save
+ if folder_path:
+ plt.savefig(folder_path)
+ plt.close()
+
+ def update_annot(self, ind):
+ norm = plt.Normalize(1,4)
+ cmap = plt.cm.RdYlGn
+ pos = self.sc.get_offsets()[ind["ind"][0]]
+ self.annot.xy = pos
+ text = "{}, {}".format(
+ " ".join(list(map(str, ind["ind"]))), " ".join([self.names[n] for n in ind["ind"]])
+ )
+ self.annot.set_text(text)
+ self.annot.get_bbox_patch().set_facecolor(cmap(norm(self.sc.get_array()[ind["ind"][0]])))  # assumes 'colour' is numeric, as implied by the colorbar
+ self.annot.get_bbox_patch().set_alpha(0.4)
+
+ def hover(self, event):
+ vis = self.annot.get_visible()
+ if event.inaxes == self.ax:
+ cont, ind = self.sc.contains(event)
+ if cont:
+ self.update_annot(ind)
+ self.annot.set_visible(True)
+ self.fig.canvas.draw_idle()
+ else:
+ if vis:
+ self.annot.set_visible(False)
+ self.fig.canvas.draw_idle()
+
+ # https://stackoverflow.com/questions/7908636/how-to-add-hovering-annotations-to-a-plot
+ def cluster_chart (self, data, folder_path=None):
+ # Scatter plot for Text Cluster Prediction
+ plt.figure(figsize=(6, 6))
+ self.fig, self.ax = plt.subplots()
+ self.names = data['title']
+ self.sc = plt.scatter(data['x'], data['y'], c=data['colour'], s=36, edgecolors='black', linewidths=0.75)
+ self.annot = self.ax.annotate("", xy=(0,0), xytext=(20,20),textcoords="offset points",
+ bbox=dict(boxstyle="round", fc="w"),
+ arrowprops=dict(arrowstyle="->"))
+ self.annot.set_visible(False)
+ plt.title('Text Cluster Prediction')
+ plt.axis('off') # Optional: Remove axes for a cleaner look
+ plt.colorbar(self.sc, label='Colour') # Add colorbar if needed
+ self.fig.canvas.mpl_connect("motion_notify_event", self.hover)
+ plt.show(block=False)
+ # save
+ if folder_path:
+ # annotate with data['title']
+ for i, txt in enumerate(data['title']):
+ plt.annotate(txt, (data['x'][i], data['y'][i]), fontsize=8, ha='right', va='bottom')
+ plt.savefig(folder_path)
+ plt.close()
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..a5c4b31
--- /dev/null
+++ b/test.py
@@ -0,0 +1,33 @@
+import spacy
+
+# Load spaCy model
+nlp = spacy.load("en_core_web_sm")
+
+# Sample documents
+documents = [
+ "Natural language processing is a field of AI.",
+ "Topic modeling helps in uncovering the main themes in a collection of documents.",
+ "Semantic clustering groups similar documents together based on meaning.",
+ "SpaCy is a popular NLP library.",
+ "Gensim is commonly used for topic modeling.",
+]
+
+
+# Preprocess the documents using spaCy
+def preprocess(doc):
+ # Tokenize and preprocess each document
+ doc = nlp(doc)
+ print(f"Original Document: {doc}")
+ # Lemmatize and remove stop words
+ tokens = [token.lemma_ for token in doc if not token.is_stop]
+ print(f"Processed Tokens: {tokens}")
+ return tokens
+
+
+# Apply preprocessing to each document
+processed_docs = [preprocess(doc) for doc in documents]
+
+
+# Print the processed documents
+for i, doc in enumerate(processed_docs):
+ print(f"Document {i + 1}: {doc}")
\ No newline at end of file
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 4ad331d..2c78676 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -1,32 +1,52 @@
import pytest
-
@pytest.fixture
def corpus_fixture():
from pkg_resources import resource_filename
from src.qrmine import ReadData
+
corpus = ReadData()
- file_path = resource_filename('src.qrmine.resources', 'interview.txt')
- corpus.read_file([file_path])
+ file_path = resource_filename("src.qrmine.resources", "interview.txt")
+ corpus.read_file(file_path)
return corpus
+@pytest.fixture
+def content():
+ from src.qrmine import Content
+
+ _content = Content()
+ return _content
+
+
+# instance of Qrmine as fixture
@pytest.fixture
def q():
from src.qrmine import Qrmine
+
_q = Qrmine()
return _q
+
+@pytest.fixture
+def cluster(content):
+ from src.qrmine import ClusterDocs
+
+ _cluster = ClusterDocs(content)
+ return _cluster
+
+
# Ref: https://docs.pytest.org/en/latest/capture.html
def test_generate_dict(corpus_fixture, capsys, q):
from src.qrmine import Content
+
num = 10
- all_interviews = Content(corpus_fixture.content)
+ all_interviews = Content(corpus_fixture.content, corpus_fixture.titles)
q.print_dict(all_interviews, num)
captured = capsys.readouterr()
print(captured.out)
- assert 'code' in captured.out
+ assert "code" in captured.out
+
def test_generate_topics(corpus_fixture, capsys, q):
q.content = corpus_fixture
@@ -34,22 +54,57 @@ def test_generate_topics(corpus_fixture, capsys, q):
q.print_topics()
captured = capsys.readouterr()
print(captured.out)
- assert 'TOPIC' in captured.out
+ assert "TOPIC" in captured.out
+
def test_category_basket(corpus_fixture, capsys, q):
q.content = corpus_fixture
print(q.category_basket())
captured = capsys.readouterr()
print(captured.out)
- assert 'theory' in captured.out
+ assert "theory" in captured.out
+
def test_category_association(corpus_fixture, capsys, q):
q.content = corpus_fixture
print(q.category_association())
captured = capsys.readouterr()
print(captured.out)
- assert 'theory' in captured.out
+ assert "theory" in captured.out
+
+
+def test_cluster_topics(corpus_fixture, capsys, cluster):
+ cluster.documents = corpus_fixture.documents
+ cluster.titles = corpus_fixture.titles
+
+ cluster.print_topics()
+ captured = capsys.readouterr()
+ print(captured.out)
+ assert "Topic" in captured.out
+ cluster.print_clusters()
+ captured = capsys.readouterr()
+ print(captured.out)
+ assert "Document" in captured.out
+ print("LDA Model")
+ print(cluster.build_lda_model())
+ print("LDA Model Topics")
+ print(cluster.topics_per_document())
+ # Format
+ df_dominant_topic = cluster.format_topics_sentences()
+ # Format the output
+ df_dominant_topic.columns = [
+ "Document_No",
+ "Dominant_Topic",
+ "Topic_Perc_Contrib",
+ "Keywords",
+ "Text",
+ ]
+ print(df_dominant_topic.head(10))
+ assert "Document_No" in df_dominant_topic.columns
+ df_sorted = cluster.most_representative_docs()
+ print(df_sorted.head(10))
+ assert "Dominant_Topic" in df_sorted.columns
diff --git a/tests/test_num.py b/tests/test_num.py
index f0c53cd..ac7a139 100644
--- a/tests/test_num.py
+++ b/tests/test_num.py
@@ -9,7 +9,7 @@ def ml_fixture():
ml = MLQRMine()
file_path = resource_filename('src.qrmine.resources', 'numeric.csv')
ml.csvfile = file_path
- return ml
+ return ml
@@ -19,7 +19,7 @@ def test_nn(ml_fixture, capsys):
ml_fixture.prepare_data(True)
ml_fixture.get_nnet_predictions()
captured = capsys.readouterr()
- assert 'accuracy' in captured.out
+ assert 'Accuracy' in captured.out
def test_svm(ml_fixture, capsys):
ml_fixture.prepare_data(True)
diff --git a/tests/test_readfiles.py b/tests/test_readfiles.py
index aff3a5d..963ed90 100644
--- a/tests/test_readfiles.py
+++ b/tests/test_readfiles.py
@@ -8,8 +8,8 @@ def corpus_fixture():
from src.qrmine import ReadData
corpus = ReadData()
file_path = resource_filename('src.qrmine.resources', 'interview.txt')
- corpus.read_file([file_path])
- return corpus
+ corpus.read_file(file_path)
+ return corpus
def test_content(corpus_fixture):
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
new file mode 100644
index 0000000..41f7145
--- /dev/null
+++ b/tests/test_visualize.py
@@ -0,0 +1,114 @@
+import pytest
+import pandas as pd
+from src.qrmine.visualize import QRVisualize
+
+
+@pytest.fixture
+def v():
+ from pkg_resources import resource_filename
+
+ file_path = resource_filename("src.qrmine.resources", "df_dominant_topic.csv")
+ data = pd.read_csv(file_path)
+ _v = QRVisualize(data)
+ return _v
+
+
+@pytest.fixture
+def topics():
+ return [
+ (
+ 0,
+ [
+ (".", 0.095292516),
+ (",", 0.053392828),
+ ("category", 0.032462463),
+ ("coding", 0.032456465),
+ ("open", 0.032437164),
+ ("QRMine", 0.03243305),
+ ("datum", 0.021980358),
+ ("researcher", 0.021978099),
+ ("theory", 0.011536299),
+ ("GT", 0.011533132),
+ ],
+ ),
+ (
+ 1,
+ [
+ (".", 0.007783216),
+ (",", 0.007773952),
+ ("open", 0.007728422),
+ ("researcher", 0.0077227736),
+ ("coding", 0.007722049),
+ ("category", 0.007721938),
+ ("datum", 0.007717547),
+ ("QRMine", 0.007716193),
+ ("dissect", 0.0077070068),
+ ("support", 0.0077060354),
+ ],
+ ),
+ (
+ 2,
+ [
+ (",", 0.05126711),
+ (".", 0.05125151),
+ ("theory", 0.038604487),
+ ("category", 0.03227912),
+ ("GT", 0.032278605),
+ ("\n", 0.029119665),
+ ("comparison", 0.025947908),
+ ("coding", 0.025941858),
+ ("incident", 0.019622542),
+ (")", 0.019619444),
+ ],
+ ),
+ (
+ 3,
+ [
+ (".", 0.007849805),
+ (",", 0.007837688),
+ ("theory", 0.00781459),
+ ("coding", 0.0078089647),
+ ("category", 0.0077514737),
+ ("GT", 0.0077493717),
+ ("datum", 0.007742789),
+ ("open", 0.0077355755),
+ ("\n", 0.0077245855),
+ ("researcher", 0.0077191954),
+ ],
+ ),
+ (
+ 4,
+ [
+ (",", 0.007834569),
+ (".", 0.007812336),
+ ("coding", 0.0077863215),
+ ("category", 0.007759207),
+ ("theory", 0.0077459146),
+ ("GT", 0.0077370973),
+ ("code", 0.0077265715),
+ ("datum", 0.007720947),
+ ("open", 0.007720898),
+ ("comparison", 0.007720567),
+ ],
+ ),
+ ]
+
+
+def test_frequency_distribution_of_words(v, capsys):
+ v.plot_frequency_distribution_of_words(
+ v.data
+ )
+ captured = capsys.readouterr()
+ print(captured.out)
+
+
+def test_distribution_by_topic(v, capsys):
+ v.plot_distribution_by_topic(v.data)
+ captured = capsys.readouterr()
+ print(captured.out)
+
+
+def test_plot_wordcloud(v, topics, capsys):
+ v.plot_wordcloud(topics)
+ captured = capsys.readouterr()
+ print(captured.out)
diff --git a/tox.ini b/tox.ini
index 3eb707d..14055f6 100644
--- a/tox.ini
+++ b/tox.ini
@@ -3,26 +3,72 @@
# THIS SCRIPT IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS!
[tox]
-minversion = 2.4
-envlist = py311, integration
+minversion = 3.15
+envlist = default, clean, build, docs, doctests
+
[testenv]
-setenv = TOXINIDIR = {toxinidir}
+description = invoke pytest to run automated tests
+isolated_build = True
+setenv =
+ TOXINIDIR = {toxinidir}
+passenv =
+ HOME
+extras =
+ testing,ml
+commands =
+ python -m spacy download en_core_web_sm
+ pytest {posargs}
+
+
+[testenv:{clean,build}]
+description =
+ Build (or clean) the package in isolation according to instructions in:
+ https://setuptools.readthedocs.io/en/latest/build_meta.html#how-to-use-it
+ https://github.com/pypa/pep517/issues/91
+ https://github.com/pypa/build
+# NOTE: build is still experimental, please refer to the links for updates/issues
+skip_install = True
+changedir = {toxinidir}
+commands =
+ clean: python -c 'from shutil import rmtree; rmtree("build", True); rmtree("dist", True)'
+ build: python setup.py sdist
+# By default `build` produces wheels, you can also explicitly use the flags `--sdist` and `--wheel`
+
+
+[testenv:{docs,doctests}]
+description = invoke sphinx-build to build the docs/run doctests
+setenv =
+ DOCSDIR = {toxinidir}/docs
+ BUILDDIR = {toxinidir}/docs/_build
+ docs: BUILD = html
+ doctests: BUILD = doctest
deps =
- -rrequirements.txt
- -rdev-requirements.txt
+ -r {toxinidir}/docs/requirements.txt
+ # ^ requirements.txt shared with Read The Docs
commands =
python -m spacy download en_core_web_sm
- py.test {posargs}
-extras =
- all
- testing
+ sphinx-build -b {env:BUILD} -d "{env:BUILDDIR}/doctrees" "{env:DOCSDIR}" "{env:BUILDDIR}/{env:BUILD}" {posargs}
+
+
+[testenv:publish]
+description =
+ Publish the package you have been developing to a package index server.
+ By default, it uses testpypi. If you really want to publish your package
+ to be publicly accessible in PyPI, use the `-- --repository pypi` option.
+skip_install = True
+changedir = {toxinidir}
+passenv =
+ TWINE_USERNAME
+ TWINE_PASSWORD
+ TWINE_REPOSITORY
+deps = twine
+commands =
+ python -m twine check dist/*
+ python -m twine upload {posargs:--repository testpypi} dist/*
[testenv:integration]
setenv = TOXINIDIR = {toxinidir}
-deps =
- -rrequirements.txt
- -rdev-requirements.txt
commands =
python -m spacy download en_core_web_sm
python qrminer.py
\ No newline at end of file