From d907d5315204818bb41e28b6abf0e14df0597228 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 15:20:40 +0000
Subject: [PATCH 01/35] chore: update .gitignore and add conda setup
 instructions

---
 .gitignore     |   1 +
 notes/conda.md |   8 ++++
 pyproject.toml | 104 ++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 107 insertions(+), 6 deletions(-)
 create mode 100644 notes/conda.md

diff --git a/.gitignore b/.gitignore
index 64049e7..c29a2a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ __pycache__/*
 .idea
 .venv
 conda
+uv.lock
 
 # Package files
 *.egg
diff --git a/notes/conda.md b/notes/conda.md
new file mode 100644
index 0000000..49d024a
--- /dev/null
+++ b/notes/conda.md
@@ -0,0 +1,8 @@
+conda create --name qrmine python=3.11
+conda activate qrmine
+
+conda install conda-forge::uv
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
+uv pip install pandas matplotlib click scikit-learn imbalanced-learn vaderSentiment xgboost mlxtend spacy textacy tensorflow==2.13.1 tensorflow-io-gcs-filesystem==0.31.0 pytest tox
+python -m spacy download en_core_web_sm
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 89a5bed..dcf5188 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,101 @@
 [build-system]
-# AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD!
-requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"]
+requires = ["setuptools>=61.2"]
 build-backend = "setuptools.build_meta"
 
-[tool.setuptools_scm]
-# For smarter version schemes and other configuration options,
-# check out https://github.com/pypa/setuptools_scm
-version_scheme = "no-guess-dev"
+[project]
+name = "qrmine"
+description = "Qualitative Research support tools in Python!"
+authors = [{name = "beapen", email = "github@gulfdoctor.net"}]
+license = {text = "GPL-3.0-only"}
+classifiers = [
+    "Intended Audience :: Science/Research",
+    "Development Status :: 4 - Beta",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+]
+dependencies = [
+    'importlib-metadata; python_version<"3.8"',
+    "pandas",
+    "matplotlib",
+    "click",
+    "scikit-learn",
+    "imbalanced-learn",
+    "vadersentiment",
+    "xgboost",
+    "mlxtend",
+    "spacy",
+    "textacy",
+    "tensorflow==2.13.1",
+    "tensorflow-io-gcs-filesystem==0.31.0",
+    "pytest>=8.3.5",
+    "tox>=4.25.0",
+]
+dynamic = ["version"]
+
+[project.readme]
+file = "README.md"
+content-type = "text/markdown"
+
+[project.urls]
+Homepage = "https://github.com/dermatologist/nlp-qrmine"
+Documentation = "https://arxiv.org/abs/2003.13519"
+
+[project.optional-dependencies]
+testing = [
+    "setuptools",
+    "pytest",
+    "pytest-cov",
+]
+
+[project.scripts]
+qrmine = "qrmine.main:main_routine"
+
+[tool.setuptools]
+zip-safe = false
+include-package-data = true
+package-dir = {"" = "src"}
+platforms = ["any"]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+exclude = ["tests"]
+namespaces = true
+
+[tool.pytest.ini_options]
+addopts = """
+--verbose"""
+norecursedirs = [
+    "dist",
+    "build",
+    ".tox",
+]
+
+[tool.aliases]
+release = "sdist bdist_wheel upload"
+
+[tool.distutils.bdist_wheel]
+universal = 1
+
+[tool.build_sphinx]
+source_dir = "docs"
+build_dir = "docs/_build"
+testpaths = "tests"
+
+[tool.devpi.upload]
+no_vcs = "1"
+formats = "bdist_wheel"
+
+[tool.flake8]
+max_line_length = "88"
+extend_ignore = "E203, W503"
+exclude = """
+.tox
+build
+dist
+.eggs
+docs/conf.py"""
+
+[tool.pyscaffold]
+version = "4.6"
+package = "qrmine"

From 3238e5d9c43cfef7f64b12411796fd39f8b5c85d Mon Sep 17 00:00:00 2001
From: dermatologist <github@gulfdoctor.net>
Date: Wed, 30 Apr 2025 11:02:05 -0500
Subject: [PATCH 02/35] feat: implement neural network model using PyTorch and
 update tests for accuracy output

---
 notes/conda.md         |  6 ++-
 setup.cfg              |  3 +-
 src/qrmine/mlqrmine.py | 88 +++++++++++++++++++++++++++++++++---------
 tests/test_num.py      |  4 +-
 4 files changed, 77 insertions(+), 24 deletions(-)

diff --git a/notes/conda.md b/notes/conda.md
index 49d024a..c0af730 100644
--- a/notes/conda.md
+++ b/notes/conda.md
@@ -5,4 +5,8 @@ conda install conda-forge::uv
 uv pip install ini2toml
 ini2toml setup.cfg -o pyproject.toml
 uv pip install pandas matplotlib click scikit-learn imbalanced-learn vaderSentiment xgboost mlxtend spacy textacy tensorflow==2.13.1 tensorflow-io-gcs-filesystem==0.31.0 pytest tox
-python -m spacy download en_core_web_sm
\ No newline at end of file
+python -m spacy download en_core_web_sm
+
+
+
+pip3 install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index e6953b9..7655d02 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -64,8 +64,7 @@ install_requires =
     mlxtend
     spacy
     textacy
-    tensorflow<=2.13.1
-    tensorflow-io-gcs-filesystem<=0.31.0
+    torch
 
 [options.packages.find]
 where = src
diff --git a/src/qrmine/mlqrmine.py b/src/qrmine/mlqrmine.py
index 12b75a3..fcfac7a 100644
--- a/src/qrmine/mlqrmine.py
+++ b/src/qrmine/mlqrmine.py
@@ -1,13 +1,10 @@
 import numpy
 from imblearn.over_sampling import RandomOverSampler
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense
-from numpy import random, argsort, sqrt, array, ones
 from pandas import read_csv
 from sklearn.cluster import KMeans
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 from sklearn.neighbors import KDTree
@@ -17,6 +14,25 @@
 from mlxtend.frequent_patterns import apriori
 from mlxtend.frequent_patterns import association_rules
 
+import torch.nn as nn
+import torch.optim as optim
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+class NeuralNet(nn.Module):
+    def __init__(self, input_dim):
+        super(NeuralNet, self).__init__()
+        self.fc1 = nn.Linear(input_dim, 12)
+        self.fc2 = nn.Linear(12, 8)
+        self.fc3 = nn.Linear(8, 1)
+        self.relu = nn.ReLU()
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        x = self.relu(self.fc1(x))
+        x = self.relu(self.fc2(x))
+        x = self.sigmoid(self.fc3(x))
+        return x
+
 
 class MLQRMine(object):
 
@@ -24,13 +40,13 @@ def __init__(self):
         self._seed = randint(1, 9)
         self._csvfile = ""
         self._titles = None
+        self._model = None
         self._dataset = None
         self._X = None
         self._y = None
         self._X_original = None
         self._y_original = None
         self._dataset_original = None
-        self._model = Sequential()
         self._sc = StandardScaler()
         self._vnum = 0  # Number of variables
         self._classifier = XGBClassifier()
@@ -147,22 +163,57 @@ def prepare_data(self, oversample=False):
             self.oversample()
 
     def get_nnet_predictions(self):
-        self._model.add(Dense(12, input_dim=self._vnum, kernel_initializer='uniform', activation='relu'))
-        self._model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
-        self._model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
-        # Compile model
-        self._model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
-        # Fit the model
-        self._model.fit(self._X, self._y, epochs=self._epochs, batch_size=10, verbose=2)
-
-        # calculate predictions
-        predictions = self._model.predict(self._X_original)
-        # round predictions
-        rounded = [round(x[0]) for x in predictions]
+
+        self._model = NeuralNet(self._vnum)
+        criterion = nn.BCELoss()
+        optimizer = optim.Adam(self._model.parameters(), lr=0.001)
+
+        # Convert data to PyTorch tensors
+        X_tensor = torch.tensor(self._X, dtype=torch.float32)
+        y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+
+        # Create a dataset and data loader
+        dataset = TensorDataset(X_tensor, y_tensor)
+        dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+
+        # Train the model
+        for epoch in range(self._epochs):
+            for batch_X, batch_y in dataloader:
+                optimizer.zero_grad()
+                outputs = self._model(batch_X)
+                loss = criterion(outputs, batch_y)
+                loss.backward()
+                optimizer.step()
+
+        # Calculate predictions
+        with torch.no_grad():
+            predictions = self._model(torch.tensor(self._X_original, dtype=torch.float32))
+            rounded = [round(x.item()) for x in predictions]
+        # print("Predictions: ", rounded)
+        # Calculate accuracy
+        correct = sum([1 for i in range(len(rounded)) if rounded[i] == self._y_original[i]])
+        total = len(rounded)
+        accuracy = correct / total
+        print(f'Accuracy: {accuracy * 100:.2f}%')
         return rounded
 
     def get_nnet_scores(self):
-        return self._model.evaluate(self._X, self._y)
+        # evaluate the PyTorch model
+        self._model.eval()
+        X_tensor = torch.tensor(self._X, dtype=torch.float32)
+        y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+        dataset = TensorDataset(X_tensor, y_tensor)
+        dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+        correct = 0
+        total = 0
+        with torch.no_grad():
+            for batch_X, batch_y in dataloader:
+                outputs = self._model(batch_X)
+                predicted = (outputs > 0.5).float()
+                total += batch_y.size(0)
+                correct += (predicted == batch_y).sum().item()
+        accuracy = correct / total
+        print(f'Accuracy: {accuracy * 100:.2f}%')
 
     def svm_confusion_matrix(self):
         """Generate confusion matrix for SVM
@@ -211,7 +262,6 @@ def get_centroids(self, c=1):
             print("Mean")
             print(self._dataset.iloc[cluster_list, :].mean(axis=0))
 
-
     """
     TODO: This is not working yet.
     use the ColumnTransformer instead of categorical_features
diff --git a/tests/test_num.py b/tests/test_num.py
index f0c53cd..ac7a139 100644
--- a/tests/test_num.py
+++ b/tests/test_num.py
@@ -9,7 +9,7 @@ def ml_fixture():
     ml = MLQRMine()
     file_path = resource_filename('src.qrmine.resources', 'numeric.csv')
     ml.csvfile = file_path
-    return ml 
+    return ml
 
 
 
@@ -19,7 +19,7 @@ def test_nn(ml_fixture, capsys):
     ml_fixture.prepare_data(True)
     ml_fixture.get_nnet_predictions()
     captured = capsys.readouterr()
-    assert 'accuracy' in captured.out
+    assert 'Accuracy' in captured.out
 
 def test_svm(ml_fixture, capsys):
     ml_fixture.prepare_data(True)

From 7ea669e14bf21c0c433db21f2628aa541214e883 Mon Sep 17 00:00:00 2001
From: dermatologist <github@gulfdoctor.net>
Date: Wed, 30 Apr 2025 11:09:05 -0500
Subject: [PATCH 03/35] chore: update pyproject.toml and setup.cfg to organize
 dependencies and improve structure

---
 pyproject.toml | 103 ++++++++++++++++++-------------------------------
 setup.cfg      |   5 ++-
 2 files changed, 42 insertions(+), 66 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index dcf5188..f80f403 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,37 +1,20 @@
 [build-system]
-requires = ["setuptools>=61.2"]
+requires = [ "setuptools>=61.2",]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "qrmine"
 description = "Qualitative Research support tools in Python!"
-authors = [{name = "beapen", email = "github@gulfdoctor.net"}]
-license = {text = "GPL-3.0-only"}
-classifiers = [
-    "Intended Audience :: Science/Research",
-    "Development Status :: 4 - Beta",
-    "Operating System :: OS Independent",
-    "Programming Language :: Python :: 3.11",
-    "Topic :: Scientific/Engineering :: Information Analysis",
-]
-dependencies = [
-    'importlib-metadata; python_version<"3.8"',
-    "pandas",
-    "matplotlib",
-    "click",
-    "scikit-learn",
-    "imbalanced-learn",
-    "vadersentiment",
-    "xgboost",
-    "mlxtend",
-    "spacy",
-    "textacy",
-    "tensorflow==2.13.1",
-    "tensorflow-io-gcs-filesystem==0.31.0",
-    "pytest>=8.3.5",
-    "tox>=4.25.0",
-]
-dynamic = ["version"]
+classifiers = [ "Intended Audience :: Science/Research", "Development Status :: 4 - Beta", "Operating System :: OS Independent", "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Information Analysis",]
+dependencies = [ "importlib-metadata; python_version<\"3.8\"", "pandas", "matplotlib", "click", "scikit-learn", "imbalanced-learn", "vaderSentiment", "xgboost", "mlxtend", "spacy", "textacy", "torch",]
+dynamic = [ "version",]
+
+[[project.authors]]
+name = "beapen"
+email = "github@gulfdoctor.net"
+
+[project.license]
+text = "GPL-3.0-only"
 
 [project.readme]
 file = "README.md"
@@ -42,11 +25,9 @@ Homepage = "https://github.com/dermatologist/nlp-qrmine"
 Documentation = "https://arxiv.org/abs/2003.13519"
 
 [project.optional-dependencies]
-testing = [
-    "setuptools",
-    "pytest",
-    "pytest-cov",
-]
+gpu = [ "torch[gpu]==2.1.1",]
+cpu = [ "torch==2.1.1",]
+testing = [ "setuptools", "pytest", "pytest-cov",]
 
 [project.scripts]
 qrmine = "qrmine.main:main_routine"
@@ -54,48 +35,40 @@ qrmine = "qrmine.main:main_routine"
 [tool.setuptools]
 zip-safe = false
 include-package-data = true
-package-dir = {"" = "src"}
-platforms = ["any"]
-
-[tool.setuptools.packages.find]
-where = ["src"]
-exclude = ["tests"]
-namespaces = true
-
-[tool.pytest.ini_options]
-addopts = """
---verbose"""
-norecursedirs = [
-    "dist",
-    "build",
-    ".tox",
-]
+platforms = [ "any",]
 
 [tool.aliases]
 release = "sdist bdist_wheel upload"
 
+[tool.flake8]
+max_line_length = "88"
+extend_ignore = "E203, W503"
+exclude = "\n.tox\nbuild\ndist\n.eggs\ndocs/conf.py"
+
+[tool.pyscaffold]
+version = "4.6"
+package = "qrmine"
+
+[tool.setuptools.package-dir]
+"" = "src"
+
+[tool.pytest.ini_options]
+addopts = "\n--verbose"
+norecursedirs = [ "dist", "build", ".tox",]
+
 [tool.distutils.bdist_wheel]
 universal = 1
 
-[tool.build_sphinx]
-source_dir = "docs"
-build_dir = "docs/_build"
+[tool.distutils.build_sphinx]
+source-dir = "docs"
+build-dir = "docs/_build"
 testpaths = "tests"
 
 [tool.devpi.upload]
 no_vcs = "1"
 formats = "bdist_wheel"
 
-[tool.flake8]
-max_line_length = "88"
-extend_ignore = "E203, W503"
-exclude = """
-.tox
-build
-dist
-.eggs
-docs/conf.py"""
-
-[tool.pyscaffold]
-version = "4.6"
-package = "qrmine"
+[tool.setuptools.packages.find]
+where = [ "src",]
+exclude = [ "tests",]
+namespaces = true
diff --git a/setup.cfg b/setup.cfg
index 7655d02..dba42c2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -75,7 +75,10 @@ exclude =
 # Add here additional requirements for extra features, to install with:
 # `pip install qrmine[PDF]` like:
 # PDF = ReportLab; RXP
-
+    gpu =
+        torch[gpu]==2.1.1
+    cpu =
+        torch==2.1.1
 # Add here test requirements (semicolon/line-separated)
 testing =
     setuptools

From 0ada4d695c32811ba5219b22f70634710b293b5e Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 16:26:59 +0000
Subject: [PATCH 04/35] chore: update conda.md to simplify package installation
 command

---
 notes/conda.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notes/conda.md b/notes/conda.md
index c0af730..79eb6c8 100644
--- a/notes/conda.md
+++ b/notes/conda.md
@@ -4,7 +4,7 @@ conda activate qrmine
 conda install conda-forge::uv
 uv pip install ini2toml
 ini2toml setup.cfg -o pyproject.toml
-uv pip install pandas matplotlib click scikit-learn imbalanced-learn vaderSentiment xgboost mlxtend spacy textacy tensorflow==2.13.1 tensorflow-io-gcs-filesystem==0.31.0 pytest tox
+uv pip install -e .
 python -m spacy download en_core_web_sm
 
 

From 6f0fe38c05da30a1ec1f35d68136963ccee947f6 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 16:57:19 +0000
Subject: [PATCH 05/35] feat: enhance ReadData class to support reading from
 URLs and folders, update tests for consistency

---
 notes/pip-tools.md      |   5 +-
 pyproject.toml          | 159 ++++++++++++++++++++++++++++++----------
 setup.cfg               |   2 +
 src/qrmine/readfiles.py |  92 +++++++++++++----------
 tests/test_nlp.py       |   2 +-
 tests/test_readfiles.py |   4 +-
 6 files changed, 185 insertions(+), 79 deletions(-)

diff --git a/notes/pip-tools.md b/notes/pip-tools.md
index da4baa4..c504a1e 100644
--- a/notes/pip-tools.md
+++ b/notes/pip-tools.md
@@ -21,4 +21,7 @@ OR
 
 * pip install uv
 * uv pip compile setup.cfg -o requirements.txt --universal
-* uv pip compile dev-requirements.in -o dev-requirements.txt --universal
\ No newline at end of file
+* uv pip compile dev-requirements.in -o dev-requirements.txt --universal
+
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index f80f403..abba09a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,33 +1,81 @@
 [build-system]
-requires = [ "setuptools>=61.2",]
+requires = ["setuptools>=61.2"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "qrmine"
 description = "Qualitative Research support tools in Python!"
-classifiers = [ "Intended Audience :: Science/Research", "Development Status :: 4 - Beta", "Operating System :: OS Independent", "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Information Analysis",]
-dependencies = [ "importlib-metadata; python_version<\"3.8\"", "pandas", "matplotlib", "click", "scikit-learn", "imbalanced-learn", "vaderSentiment", "xgboost", "mlxtend", "spacy", "textacy", "torch",]
-dynamic = [ "version",]
-
-[[project.authors]]
-name = "beapen"
-email = "github@gulfdoctor.net"
-
-[project.license]
-text = "GPL-3.0-only"
+authors = [{name = "beapen", email = "github@gulfdoctor.net"}]
+license = {text = "GPL-3.0-only"}
+# license_files = LICENSE.txt
+# long_description = file: README.rst
+# long_description_content_type = text/x-rst; charset=UTF-8
+classifiers = [
+    "Intended Audience :: Science/Research",
+    "Development Status :: 4 - Beta",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+]
+dependencies = [
+    'importlib-metadata; python_version<"3.8"',
+    "pandas",
+    "matplotlib",
+    "click",
+    "scikit-learn",
+    "imbalanced-learn",
+    "vaderSentiment",
+    "xgboost",
+    "mlxtend",
+    "spacy",
+    "textacy",
+    "torch",
+    "pypdf",
+    "requests",
+]
+dynamic = ["version"]
 
 [project.readme]
 file = "README.md"
 content-type = "text/markdown"
+# Add here related links, for example:
 
 [project.urls]
 Homepage = "https://github.com/dermatologist/nlp-qrmine"
 Documentation = "https://arxiv.org/abs/2003.13519"
+# Source = https://github.com/pyscaffold/pyscaffold/
+# Changelog = https://pyscaffold.org/en/latest/changelog.html
+# Tracker = https://github.com/pyscaffold/pyscaffold/issues
+# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
+# Download = https://pypi.org/project/PyScaffold/#files
+# Twitter = https://twitter.com/PyScaffold
+# Change if running only on Windows, Mac or Linux (comma-separated)
+# Add here all kinds of additional classifiers as defined under
+# https://pypi.org/classifiers/
 
 [project.optional-dependencies]
-gpu = [ "torch[gpu]==2.1.1",]
-cpu = [ "torch==2.1.1",]
-testing = [ "setuptools", "pytest", "pytest-cov",]
+# Add here additional requirements for extra features, to install with:
+# `pip install qrmine[PDF]` like:
+# PDF = ReportLab; RXP
+gpu = ["torch[gpu]==2.1.1"]
+cpu = ["torch==2.1.1"]
+# Add here test requirements (semicolon/line-separated)
+testing = [
+    "setuptools",
+    "pytest",
+    "pytest-cov",
+]
+
+[project.entry-points]
+# Add here console scripts like:
+# console_scripts =
+# script_name = qrmine.module:function
+# For example:
+# console_scripts =
+# fibonacci = qrmine.skeleton:run
+# And any other entry points, for example:
+# pyscaffold.cli =
+# awesome = pyscaffoldext.awesome.extension:AwesomeExtension
 
 [project.scripts]
 qrmine = "qrmine.main:main_routine"
@@ -35,40 +83,75 @@ qrmine = "qrmine.main:main_routine"
 [tool.setuptools]
 zip-safe = false
 include-package-data = true
-platforms = [ "any",]
-
-[tool.aliases]
-release = "sdist bdist_wheel upload"
-
-[tool.flake8]
-max_line_length = "88"
-extend_ignore = "E203, W503"
-exclude = "\n.tox\nbuild\ndist\n.eggs\ndocs/conf.py"
+package-dir = {"" = "src"}
+# Require a min/specific Python version (comma-separated conditions)
+# python_requires = >=3.8
+# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
+# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
+# new major versions. This works if the required packages follow Semantic Versioning.
+# For more information, check out https://semver.org/.
+platforms = ["any"]
 
-[tool.pyscaffold]
-version = "4.6"
-package = "qrmine"
-
-[tool.setuptools.package-dir]
-"" = "src"
+[tool.setuptools.packages.find]
+where = ["src"]
+exclude = ["tests"]
+namespaces = true
 
 [tool.pytest.ini_options]
-addopts = "\n--verbose"
-norecursedirs = [ "dist", "build", ".tox",]
+# Specify command line options as you would do when invoking pytest directly.
+# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
+# in order to write a coverage file that can be read by Jenkins.
+# CAUTION: --cov flags may prohibit setting breakpoints while debugging.
+# Comment those flags to avoid this pytest issue.
+addopts = """
+--verbose"""
+norecursedirs = [
+    "dist",
+    "build",
+    ".tox",
+]
+
+[tool.aliases]
+release = "sdist bdist_wheel upload"
 
 [tool.distutils.bdist_wheel]
+# Use this option if your package is pure-python
 universal = 1
 
-[tool.distutils.build_sphinx]
-source-dir = "docs"
-build-dir = "docs/_build"
+[tool.build_sphinx]
+source_dir = "docs"
+build_dir = "docs/_build"
 testpaths = "tests"
+# Use pytest markers to select/deselect specific tests
+# markers =
+# slow: mark tests as slow (deselect with '-m "not slow"')
+# system: mark end-to-end system tests
 
 [tool.devpi.upload]
+# Options for the devpi: PyPI server and packaging tool
+# VCS export must be deactivated since we are using setuptools-scm
 no_vcs = "1"
 formats = "bdist_wheel"
 
-[tool.setuptools.packages.find]
-where = [ "src",]
-exclude = [ "tests",]
-namespaces = true
+[tool.flake8]
+# Some sane defaults for the code style checker flake8
+max_line_length = "88"
+extend_ignore = "E203, W503"
+# ^  Black-compatible
+# E203 and W503 have edge cases handled by black
+exclude = """
+.tox
+build
+dist
+.eggs
+docs/conf.py"""
+
+[tool.pyscaffold]
+# PyScaffold's parameters when the project was created.
+# This will be used when updating. Do not change!
+version = "4.6"
+package = "qrmine"
+# This file is used to configure your project.
+# Read more about the various options under:
+# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html
+# https://setuptools.pypa.io/en/latest/references/keywords.html
diff --git a/setup.cfg b/setup.cfg
index dba42c2..82c523d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -65,6 +65,8 @@ install_requires =
     spacy
     textacy
     torch
+    pypdf
+    requests
 
 [options.packages.find]
 where = src
diff --git a/src/qrmine/readfiles.py b/src/qrmine/readfiles.py
index a460795..522eaaa 100644
--- a/src/qrmine/readfiles.py
+++ b/src/qrmine/readfiles.py
@@ -1,5 +1,6 @@
 import re
-
+import requests
+from pypdf import PdfReader
 
 class ReadData(object):
     def __init__(self):
@@ -37,22 +38,10 @@ def append(self, title, document):
         self._documents.append(document)
         self._content += document
 
-    def read_file(self, file_names):
-        if len(file_names) > 1:
-            for file_name in file_names:
-                with open(file_name, 'r') as f:
-                    read_from_file = f.read()
-                    self._content = re.sub('<[^<]+?>', '', read_from_file)
-                    self._documents = re.split('<break>.*?</break>', read_from_file)
-                    # Delete the last blank record
-                    del self._documents[-1]
-                    pattern = r"<break>(.*?)</break>"
-                    _title = re.findall(pattern, read_from_file, flags=re.DOTALL)[0]
-                    self._titles.append(_title)
-                f.close()
-        else:
-            file_name = file_names[0]
-            with open(file_name, 'r') as f:
+    def read_file(self, input):
+        import os  # local import; any str that is not a folder or URL is opened as a file
+        if isinstance(input, str) and not os.path.isdir(input) and "://" not in input:
+            with open(input, 'r') as f:
                 read_from_file = f.read()
                 self._content = re.sub('<[^<]+?>', '', read_from_file)
                 self._documents = re.split('<break>.*?</break>', read_from_file)
@@ -60,25 +49,54 @@ def read_file(self, file_names):
                 del self._documents[-1]
                 pattern = r"<break>(.*?)</break>"
                 self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL)
+        # if input is a folder name
+        elif isinstance(input, str) and os.path.isdir(input):
+            # only .txt and .pdf entries are read; anything else is skipped
+            for file_name in os.listdir(input):
+                if file_name.endswith('.txt'):
+                    with open(os.path.join(input, file_name), 'r') as f:
+                        read_from_file = f.read()
+                        self._content += read_from_file
+                        self._documents.append(read_from_file)
+                        self._titles.append(file_name)
+                if file_name.endswith('.pdf'):
+                    with open(os.path.join(input, file_name), 'rb') as f:
+                        reader = PdfReader(f)
+                        read_from_file = ""
+                        for page in reader.pages:
+                            read_from_file += page.extract_text()
+                        self._content += read_from_file
+                        self._documents.append(read_from_file)
+                        self._titles.append(file_name)
+        # if input is a url (the only strings left here are non-folders containing "://")
+        elif isinstance(input, str):
+            response = requests.get(input)
+            if response.status_code == 200:
+                read_from_file = response.text
+                self._content = re.sub('<[^<]+?>', '', read_from_file)
+                self._documents = re.split('<break>.*?</break>', read_from_file)
+                # Delete the last blank record
+                del self._documents[-1]
+                pattern = r"<break>(.*?)</break>"
+                self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL)
+        else:
+            raise ValueError("Input must be a file name, folder name or url.")
 
-                """
-                Combine duplicate topics using Dict
-                Currently supported only for single file.
-                """
-
-                doc_dict = {}
-                ct3 = 0
-                for t in self._titles:
-                    doc = doc_dict.get(t)
-                    if doc:
-                        doc_dict[t] = doc + self._documents[ct3]
-                    else:
-                        doc_dict[t] = self._documents[ct3]
-                    ct3 += 1
-                self._titles.clear()
-                self._documents.clear()
-                for t in doc_dict.keys():
-                    self._documents.append(doc_dict.get(t))
-                    self._titles.append(t)
+        """
+        Combine duplicate topics using Dict
+        """
 
-                f.close()
+        doc_dict = {}
+        ct3 = 0
+        for t in self._titles:
+            doc = doc_dict.get(t)
+            if doc:
+                doc_dict[t] = doc + self._documents[ct3]
+            else:
+                doc_dict[t] = self._documents[ct3]
+            ct3 += 1
+        self._titles.clear()
+        self._documents.clear()
+        for t in doc_dict.keys():
+            self._documents.append(doc_dict.get(t))
+            self._titles.append(t)
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 4ad331d..4a07298 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -8,7 +8,7 @@ def corpus_fixture():
     from src.qrmine import ReadData
     corpus = ReadData()
     file_path = resource_filename('src.qrmine.resources', 'interview.txt')
-    corpus.read_file([file_path])
+    corpus.read_file(file_path)
     return corpus
 
 # instannce of Qrmine as fixture
diff --git a/tests/test_readfiles.py b/tests/test_readfiles.py
index aff3a5d..963ed90 100644
--- a/tests/test_readfiles.py
+++ b/tests/test_readfiles.py
@@ -8,8 +8,8 @@ def corpus_fixture():
     from src.qrmine import ReadData
     corpus = ReadData()
     file_path = resource_filename('src.qrmine.resources', 'interview.txt')
-    corpus.read_file([file_path])
-    return corpus 
+    corpus.read_file(file_path)
+    return corpus
 
 
 def test_content(corpus_fixture):

From da92557311b3dcd0c514bc8ff0f94112ca44de41 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 16:59:29 +0000
Subject: [PATCH 06/35] feat: update ReadData class to handle URL input by
 storing content and appending to documents

---
 src/qrmine/readfiles.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/qrmine/readfiles.py b/src/qrmine/readfiles.py
index 522eaaa..a213ff7 100644
--- a/src/qrmine/readfiles.py
+++ b/src/qrmine/readfiles.py
@@ -73,12 +73,9 @@ def read_file(self, input):
             response = requests.get(input)
             if response.status_code == 200:
                 read_from_file = response.text
-                self._content = re.sub('<[^<]+?>', '', read_from_file)
-                self._documents = re.split('<break>.*?</break>', read_from_file)
-                # Delete the last blank record
-                del self._documents[-1]
-                pattern = r"<break>(.*?)</break>"
-                self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL)
+                self._content = read_from_file
+                self._documents.append(read_from_file)
+                self.titles.append(input)
         else:
             raise ValueError("Input must be a file name, folder name or url.")
 

From 9e2d7fb367c3466ae444c0302f8776c44b282f38 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 19:43:12 +0000
Subject: [PATCH 07/35] feat: add ClusterDocs class for semantic clustering and
 update tests for functionality

---
 pyproject.toml         |  1 +
 setup.cfg              |  1 +
 src/qrmine/__init__.py |  1 +
 src/qrmine/cluster.py  | 60 ++++++++++++++++++++++++++++++++++++++++++
 src/qrmine/content.py  |  4 +++
 test.py                | 33 +++++++++++++++++++++++
 tests/test_nlp.py      | 13 +++++++++
 7 files changed, 113 insertions(+)
 create mode 100644 src/qrmine/cluster.py
 create mode 100644 test.py

diff --git a/pyproject.toml b/pyproject.toml
index abba09a..58ddc98 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
     "torch",
     "pypdf",
     "requests",
+    "gensim",
 ]
 dynamic = ["version"]
 
diff --git a/setup.cfg b/setup.cfg
index 82c523d..d832a6e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -67,6 +67,7 @@ install_requires =
     torch
     pypdf
     requests
+    gensim
 
 [options.packages.find]
 where = src
diff --git a/src/qrmine/__init__.py b/src/qrmine/__init__.py
index 09a4e35..22d0eb9 100644
--- a/src/qrmine/__init__.py
+++ b/src/qrmine/__init__.py
@@ -6,6 +6,7 @@
 from .readfiles import ReadData
 from .sentiment import Sentiment
 from .mlqrmine import MLQRMine
+from .cluster import ClusterDocs
 
 if sys.version_info[:2] >= (3, 8):
     # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
new file mode 100644
index 0000000..e16266d
--- /dev/null
+++ b/src/qrmine/cluster.py
@@ -0,0 +1,60 @@
+import spacy
+from gensim import corpora
+from gensim.models.ldamodel import LdaModel
+
+class ClusterDocs:
+
+    def __init__(self, documents=[], titles=[]):
+        self._nlp = spacy.load("en_core_web_sm")
+        self._documents = documents
+        self._titles = titles
+        self._dictionary = None
+        self._corpus = None
+        # Apply preprocessing to each document
+        self._processed_docs = [self.preprocess(doc) for doc in documents]
+        self.process()
+
+    @property
+    def documents(self):
+        return self._documents
+
+    @property
+    def titles(self):
+        return self._titles
+
+    @documents.setter
+    def documents(self, documents):
+        self._documents = documents
+        self._processed_docs = [self.preprocess(doc) for doc in documents]
+        self.process()
+
+    @titles.setter
+    def titles(self, titles):
+        self._titles = titles
+
+    # Preprocess the documents using spaCy
+    def preprocess(self, doc):
+        # Tokenize and preprocess each document
+        doc = self._nlp(doc)
+        # Lemmatize and remove stop words
+        tokens = [token.lemma_ for token in doc if not token.is_stop]
+        return tokens
+
+    def process(self):
+        # Create a dictionary representation of the documents
+        self._dictionary = corpora.Dictionary(self._processed_docs)
+        # Create a bag-of-words representation of the documents
+        self._corpus = [self._dictionary.doc2bow(doc) for doc in self._processed_docs]
+
+    def print_topics(self, num_topics=5, passes=15):
+        # Build the LDA (Latent Dirichlet Allocation) model
+        lda_model = LdaModel(self._corpus, num_topics=num_topics, id2word=self._dictionary, passes=passes)
+        # Print the topics and their corresponding words
+        print(lda_model.print_topics(num_words=5))
+
+    def print_clusters(self, num_topics=5, passes=15):
+        # Perform semantic clustering
+        lda_model = LdaModel(self._corpus, num_topics=num_topics, id2word=self._dictionary, passes=passes)
+        for i, doc in enumerate(self._processed_docs):  # Changed from get_processed_docs() to _documents
+            bow = self._dictionary.doc2bow(doc)
+            print(f"Document {self._titles[i]} belongs to topic: {lda_model.get_document_topics(bow)}")
diff --git a/src/qrmine/content.py b/src/qrmine/content.py
index 3344a80..f9e6b0e 100644
--- a/src/qrmine/content.py
+++ b/src/qrmine/content.py
@@ -87,6 +87,10 @@ def idx(self, token):
     def doc(self):
         return self._processed
 
+    @property
+    def tokens(self):
+        return [token for token in self._processed if not token.is_stop and not token.is_punct and not token.is_space]
+
     def process(self):
         for token in self._processed:
             if token.is_stop or token.is_digit or token.is_punct or token.is_space:
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..a5c4b31
--- /dev/null
+++ b/test.py
@@ -0,0 +1,33 @@
+import spacy
+
+# Load spaCy model
+nlp = spacy.load("en_core_web_sm")
+
+# Sample documents
+documents = [
+    "Natural language processing is a field of AI.",
+    "Topic modeling helps in uncovering the main themes in a collection of documents.",
+    "Semantic clustering groups similar documents together based on meaning.",
+    "SpaCy is a popular NLP library.",
+    "Gensim is commonly used for topic modeling.",
+]
+
+
+# Preprocess the documents using spaCy
+def preprocess(doc):
+    # Tokenize and preprocess each document
+    doc = nlp(doc)
+    print(f"Original Document: {doc}")
+    # Lemmatize and remove stop words
+    tokens = [token.lemma_ for token in doc if not token.is_stop]
+    print(f"Processed Tokens: {tokens}")
+    return tokens
+
+
+# Apply preprocessing to each document
+processed_docs = [preprocess(doc) for doc in documents]
+
+
+# Print the processed documents
+for i, doc in enumerate(processed_docs):
+    print(f"Document {i + 1}: {doc}")
\ No newline at end of file
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 4a07298..c94d03f 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -18,6 +18,12 @@ def q():
     _q = Qrmine()
     return _q
 
+@pytest.fixture
+def cluster():
+    from src.qrmine import ClusterDocs
+    _cluster = ClusterDocs()
+    return _cluster
+
 # Ref: https://docs.pytest.org/en/latest/capture.html
 def test_generate_dict(corpus_fixture, capsys, q):
     from src.qrmine import Content
@@ -50,6 +56,13 @@ def test_category_association(corpus_fixture, capsys, q):
     print(captured.out)
     assert 'theory' in captured.out
 
+def test_cluster_topics(corpus_fixture, capsys, cluster):
+    cluster.documents = corpus_fixture.documents
+    cluster.titles = corpus_fixture.titles
+    cluster.print_clusters()
+    captured = capsys.readouterr()
+    print(captured.out)
+    assert 'Document' in captured.out
 
 
 

From 829214c4daf8b9fa87060830431be36b46a40485 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 20:01:18 +0000
Subject: [PATCH 08/35] feat: enhance ClusterDocs class with num_topics and
 passes properties, update LDA model methods; add tests for topic printing

---
 src/qrmine/cluster.py | 42 ++++++++++++++++++++++++++++++++++--------
 tests/test_nlp.py     |  4 ++++
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
index e16266d..495f40d 100644
--- a/src/qrmine/cluster.py
+++ b/src/qrmine/cluster.py
@@ -1,15 +1,18 @@
 import spacy
 from gensim import corpora
 from gensim.models.ldamodel import LdaModel
-
+from pprint import pprint
 class ClusterDocs:
 
     def __init__(self, documents=[], titles=[]):
         self._nlp = spacy.load("en_core_web_sm")
         self._documents = documents
         self._titles = titles
+        self._num_topics = 5
+        self._passes = 15
         self._dictionary = None
         self._corpus = None
+        self._lda_model = None
         # Apply preprocessing to each document
         self._processed_docs = [self.preprocess(doc) for doc in documents]
         self.process()
@@ -22,6 +25,14 @@ def documents(self):
     def titles(self):
         return self._titles
 
+    @property
+    def num_topics(self):
+        return self._num_topics
+
+    @property
+    def passes(self):
+        return self._passes
+
     @documents.setter
     def documents(self, documents):
         self._documents = documents
@@ -32,6 +43,14 @@ def documents(self, documents):
     def titles(self, titles):
         self._titles = titles
 
+    @num_topics.setter
+    def num_topics(self, num_topics):
+        self._num_topics = num_topics
+
+    @passes.setter
+    def passes(self, passes):
+        self._passes = passes
+
     # Preprocess the documents using spaCy
     def preprocess(self, doc):
         # Tokenize and preprocess each document
@@ -45,16 +64,23 @@ def process(self):
         self._dictionary = corpora.Dictionary(self._processed_docs)
         # Create a bag-of-words representation of the documents
         self._corpus = [self._dictionary.doc2bow(doc) for doc in self._processed_docs]
-
-    def print_topics(self, num_topics=5, passes=15):
         # Build the LDA (Latent Dirichlet Allocation) model
-        lda_model = LdaModel(self._corpus, num_topics=num_topics, id2word=self._dictionary, passes=passes)
+
+    def build_lda_model(self):
+        self._lda_model = LdaModel(
+            self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes
+        )
+
+    def print_topics(self, num_words=5):
+        if self._lda_model is None:
+            self.build_lda_model()
         # Print the topics and their corresponding words
-        print(lda_model.print_topics(num_words=5))
+        pprint(self._lda_model.print_topics(num_words=num_words))
 
-    def print_clusters(self, num_topics=5, passes=15):
+    def print_clusters(self):
+        if self._lda_model is None:
+            self.build_lda_model()
         # Perform semantic clustering
-        lda_model = LdaModel(self._corpus, num_topics=num_topics, id2word=self._dictionary, passes=passes)
         for i, doc in enumerate(self._processed_docs):  # Changed from get_processed_docs() to _documents
             bow = self._dictionary.doc2bow(doc)
-            print(f"Document {self._titles[i]} belongs to topic: {lda_model.get_document_topics(bow)}")
+            print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}")
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index c94d03f..4576da5 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -63,6 +63,10 @@ def test_cluster_topics(corpus_fixture, capsys, cluster):
     captured = capsys.readouterr()
     print(captured.out)
     assert 'Document' in captured.out
+    cluster.print_topics()
+    captured = capsys.readouterr()
+    print(captured.out)
+    assert 'topic' in captured.out
 
 
 

From 03def281510c6a14fd062050c51d89e3951119a7 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 20:18:25 +0000
Subject: [PATCH 09/35] feat: add format_topics_sentences method to ClusterDocs
 for topic formatting; update tests to validate output structure

---
 src/qrmine/cluster.py | 31 +++++++++++++++++++++++++++++++
 tests/test_nlp.py     | 17 +++++++++++++----
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
index 495f40d..fb622df 100644
--- a/src/qrmine/cluster.py
+++ b/src/qrmine/cluster.py
@@ -1,6 +1,7 @@
 import spacy
 from gensim import corpora
 from gensim.models.ldamodel import LdaModel
+import pandas as pd
 from pprint import pprint
 class ClusterDocs:
 
@@ -67,6 +68,8 @@ def process(self):
         # Build the LDA (Latent Dirichlet Allocation) model
 
     def build_lda_model(self):
+        if self._lda_model is not None:
+            return
         self._lda_model = LdaModel(
             self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes
         )
@@ -84,3 +87,31 @@ def print_clusters(self):
         for i, doc in enumerate(self._processed_docs):  # Changed from get_processed_docs() to _documents
             bow = self._dictionary.doc2bow(doc)
             print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}")
+
+
+    def format_topics_sentences(self):
+        self.build_lda_model()
+        # Init output
+        sent_topics_df = pd.DataFrame()
+
+        # Get main topic in each document
+        for i, row_list in enumerate(self._lda_model[self._corpus]):
+            row = row_list[0] if self._lda_model.per_word_topics else row_list
+            # Order topics by descending probability so the dominant one comes first
+            row = sorted(row, key=lambda x: (x[1]), reverse=True)
+            # Get the Dominant topic, Perc Contribution and Keywords for each document
+            for j, (topic_num, prop_topic) in enumerate(row):
+                if j == 0:  # => dominant topic
+                    wp = self._lda_model.show_topic(topic_num)
+                    topic_keywords = ", ".join([word for word, prop in wp])
+                    new_row = pd.DataFrame([[int(topic_num), round(prop_topic, 4), topic_keywords]],
+                                           columns=["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"])
+                    sent_topics_df = pd.concat([sent_topics_df, new_row], ignore_index=True)
+                else:
+                    break
+        sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
+
+        # Add original text to the end of the output
+        contents = pd.Series(self._processed_docs)
+        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
+        return sent_topics_df
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 4576da5..9220aa9 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -1,7 +1,6 @@
 import pytest
 
 
-
 @pytest.fixture
 def corpus_fixture():
     from pkg_resources import resource_filename
@@ -67,6 +66,16 @@ def test_cluster_topics(corpus_fixture, capsys, cluster):
     captured = capsys.readouterr()
     print(captured.out)
     assert 'topic' in captured.out
-
-
-
+    # Format
+    df_topic_sents_keywords = cluster.format_topics_sentences()
+    # Format the output
+    df_dominant_topic = df_topic_sents_keywords.reset_index()
+    df_dominant_topic.columns = [
+        "Document_No",
+        "Dominant_Topic",
+        "Topic_Perc_Contrib",
+        "Keywords",
+        "Text",
+    ]
+    print(df_dominant_topic.head(10))
+    assert 'Document_No' in df_dominant_topic.columns

From 1fba6578f1570698dee2aa3dba805f6c0d0df53b Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 20:35:17 +0000
Subject: [PATCH 10/35] feat: add most_representative_docs method to
 ClusterDocs for retrieving top documents by topic; update tests for new
 functionality

---
 src/qrmine/cluster.py | 20 ++++++++++++++++++--
 tests/test_nlp.py     |  7 +++++--
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
index fb622df..34f583b 100644
--- a/src/qrmine/cluster.py
+++ b/src/qrmine/cluster.py
@@ -88,7 +88,6 @@ def print_clusters(self):
             bow = self._dictionary.doc2bow(doc)
             print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}")
 
-
     def format_topics_sentences(self):
         self.build_lda_model()
         # Init output
@@ -114,4 +113,21 @@ def format_topics_sentences(self):
         # Add original text to the end of the output
         contents = pd.Series(self._processed_docs)
         sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
-        return sent_topics_df
+        return sent_topics_df.reset_index(drop=False)
+
+    # https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
+    def most_representative_docs(self):
+        sent_topics_df = self.format_topics_sentences()
+        sent_topics_sorteddf_mallet = pd.DataFrame()
+        sent_topics_outdf_grpd = sent_topics_df.groupby("Dominant_Topic")
+
+        for i, grp in sent_topics_outdf_grpd:
+            sent_topics_sorteddf_mallet = pd.concat(
+                [
+                    sent_topics_sorteddf_mallet,
+                    grp.sort_values(["Perc_Contribution"], ascending=False).head(1),
+                ],
+                axis=0,
+            )
+
+        return sent_topics_sorteddf_mallet
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 9220aa9..c2ef347 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -67,9 +67,8 @@ def test_cluster_topics(corpus_fixture, capsys, cluster):
     print(captured.out)
     assert 'topic' in captured.out
     # Format
-    df_topic_sents_keywords = cluster.format_topics_sentences()
+    df_dominant_topic = cluster.format_topics_sentences()
     # Format the output
-    df_dominant_topic = df_topic_sents_keywords.reset_index()
     df_dominant_topic.columns = [
         "Document_No",
         "Dominant_Topic",
@@ -79,3 +78,7 @@ def test_cluster_topics(corpus_fixture, capsys, cluster):
     ]
     print(df_dominant_topic.head(10))
     assert 'Document_No' in df_dominant_topic.columns
+
+    df_sorted = cluster.most_representative_docs()
+    print(df_sorted.head(10))
+    assert 'Dominant_Topic' in df_sorted.columns
\ No newline at end of file

From 8402717c95e1f1b8934c9cfd977de0758f04c550 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 20:38:05 +0000
Subject: [PATCH 11/35] feat: add seaborn to dependencies and create
 visualize.py for data visualization

---
 pyproject.toml          | 1 +
 setup.cfg               | 1 +
 src/qrmine/visualize.py | 5 +++++
 3 files changed, 7 insertions(+)
 create mode 100644 src/qrmine/visualize.py

diff --git a/pyproject.toml b/pyproject.toml
index 58ddc98..4e87878 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
     "pypdf",
     "requests",
     "gensim",
+    "seaborn",
 ]
 dynamic = ["version"]
 
diff --git a/setup.cfg b/setup.cfg
index d832a6e..f0582c1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -68,6 +68,7 @@ install_requires =
     pypdf
     requests
     gensim
+    seaborn
 
 [options.packages.find]
 where = src
diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
new file mode 100644
index 0000000..fe24032
--- /dev/null
+++ b/src/qrmine/visualize.py
@@ -0,0 +1,5 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+

From d3f41bf208ef18206307a8402ebd29e59952584c Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 20:54:30 +0000
Subject: [PATCH 12/35] feat: enhance QRVisualize class with frequency
 distribution plotting and add corresponding tests

---
 src/qrmine/__init__.py                     |  1 +
 src/qrmine/resources/df_dominant_topic.csv | 12 +++++++++
 src/qrmine/visualize.py                    | 31 ++++++++++++++++++++++
 tests/test_visualize.py                    | 17 ++++++++++++
 4 files changed, 61 insertions(+)
 create mode 100644 src/qrmine/resources/df_dominant_topic.csv
 create mode 100644 tests/test_visualize.py

diff --git a/src/qrmine/__init__.py b/src/qrmine/__init__.py
index 22d0eb9..3549721 100644
--- a/src/qrmine/__init__.py
+++ b/src/qrmine/__init__.py
@@ -7,6 +7,7 @@
 from .sentiment import Sentiment
 from .mlqrmine import MLQRMine
 from .cluster import ClusterDocs
+from .visualize import QRVisualize
 
 if sys.version_info[:2] >= (3, 8):
     # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
diff --git a/src/qrmine/resources/df_dominant_topic.csv b/src/qrmine/resources/df_dominant_topic.csv
new file mode 100644
index 0000000..115eb63
--- /dev/null
+++ b/src/qrmine/resources/df_dominant_topic.csv
@@ -0,0 +1,12 @@
+,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
+0,0,4,0.9903,"., GT, Strauss, ,, coding, 
+, ), Theory, seminal, (","['ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']"
+1,1,1,0.7811,",, theory, ., GT, evaluation, structure, coding, 
+, ), (","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']"
+2,2,1,0.9783,",, theory, ., GT, evaluation, structure, coding, 
+, ), (","['\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n']"
+3,3,3,0.9952,"., ,, coding, category, open, QRMine, datum, researcher, code, GT","['\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n']"
+4,4,4,0.9793,"., GT, Strauss, ,, coding, 
+, ), Theory, seminal, (","['\n', 'ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n']"
+5,5,2,0.9712,"category, comparison, incident, ,, 
+, involve, refine, identify, emergence, constant","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n']"
diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
index fe24032..ecf9024 100644
--- a/src/qrmine/visualize.py
+++ b/src/qrmine/visualize.py
@@ -1,5 +1,36 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
+import numpy as np
 
+class QRVisualize:
+    def __init__(self, data: pd.DataFrame):
+        """
+        Initialize the QRVisualize class with a DataFrame.
 
+        Parameters:
+        data (pd.DataFrame): The DataFrame containing the data to visualize.
+        """
+        self.data = data
+
+    def plot_frequency_distribution_of_words(self, df, folder_path=None):
+        doc_lens = [len(d) for d in df.Text]
+
+        # Plot
+        plt.figure(figsize=(16,7), dpi=160)
+        plt.hist(doc_lens, bins = 1000, color='navy')
+        plt.text(750, 100, "Mean   : " + str(round(np.mean(doc_lens))))
+        plt.text(750,  90, "Median : " + str(round(np.median(doc_lens))))
+        plt.text(750,  80, "Stdev   : " + str(round(np.std(doc_lens))))
+        plt.text(750,  70, "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01))))
+        plt.text(750,  60, "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99))))
+
+        plt.gca().set(xlim=(0, 1000), ylabel='Number of Documents', xlabel='Document Word Count')
+        plt.tick_params(size=16)
+        plt.xticks(np.linspace(0,1000,9))
+        plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
+        plt.show()
+        # save
+        if folder_path:
+            plt.savefig(folder_path)
+            plt.close()
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
new file mode 100644
index 0000000..58982e0
--- /dev/null
+++ b/tests/test_visualize.py
@@ -0,0 +1,17 @@
+import pytest
+import pandas as pd
+from src.qrmine.visualize import QRVisualize
+
+@pytest.fixture
+def v():
+    from pkg_resources import resource_filename
+    file_path = resource_filename("src.qrmine.resources", "df_dominant_topic.csv")
+    data = pd.read_csv(file_path)
+    _v = QRVisualize(data)
+    return _v
+
+def test_frequency_distribution_of_words(v, capsys):
+    v.plot_frequency_distribution_of_words(v.data, folder_path='/tmp/frequency_distribution.png')
+    captured = capsys.readouterr()
+    print(captured.out)
+

From 38a8481f3cf505fff0854dd57fee2034daf69e40 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 20:56:14 +0000
Subject: [PATCH 13/35] feat: update QRVisualize constructor to accept optional
 DataFrame and modify plot method to use instance data if no DataFrame is
 provided

---
 src/qrmine/visualize.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
index ecf9024..e9f3769 100644
--- a/src/qrmine/visualize.py
+++ b/src/qrmine/visualize.py
@@ -4,7 +4,7 @@
 import numpy as np
 
 class QRVisualize:
-    def __init__(self, data: pd.DataFrame):
+    def __init__(self, data: pd.DataFrame = None):
         """
         Initialize the QRVisualize class with a DataFrame.
 
@@ -13,7 +13,9 @@ def __init__(self, data: pd.DataFrame):
         """
         self.data = data
 
-    def plot_frequency_distribution_of_words(self, df, folder_path=None):
+    def plot_frequency_distribution_of_words(self, df=None, folder_path=None):
+        if df is None:
+            df = self.data
         doc_lens = [len(d) for d in df.Text]
 
         # Plot

From 60572ef87706a706dedeadb7444e2dffc968a930 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 21:00:30 +0000
Subject: [PATCH 14/35] feat: add plot_distribution_by_topic method to
 QRVisualize for visualizing document word counts by dominant topic; update
 tests accordingly

---
 src/qrmine/visualize.py | 29 +++++++++++++++++++++++++++++
 tests/test_visualize.py |  4 ++++
 2 files changed, 33 insertions(+)

diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
index e9f3769..3bb0548 100644
--- a/src/qrmine/visualize.py
+++ b/src/qrmine/visualize.py
@@ -2,6 +2,7 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 import numpy as np
+import matplotlib.colors as mcolors
 
 class QRVisualize:
     def __init__(self, data: pd.DataFrame = None):
@@ -36,3 +37,31 @@ def plot_frequency_distribution_of_words(self, df=None, folder_path=None):
         if folder_path:
             plt.savefig(folder_path)
             plt.close()
+
+    def plot_distribution_by_topic(self, df=None, folder_path=None):
+        if df is None:
+            df = self.data
+        # Plot
+        cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
+
+        fig, axes = plt.subplots(2,2,figsize=(16,14), dpi=160, sharex=True, sharey=True)
+
+        for i, ax in enumerate(axes.flatten()):
+            df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :]
+            doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
+            ax.hist(doc_lens, bins = 1000, color=cols[i])
+            ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i])
+            sns.kdeplot(doc_lens, color="black", shade=False, ax=ax.twinx())
+            ax.set(xlim=(0, 1000), xlabel='Document Word Count')
+            ax.set_ylabel('Number of Documents', color=cols[i])
+            ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=cols[i]))
+
+        fig.tight_layout()
+        fig.subplots_adjust(top=0.90)
+        plt.xticks(np.linspace(0,1000,9))
+        fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
+        plt.show()
+        # save
+        if folder_path:
+            plt.savefig(folder_path)
+            plt.close()
\ No newline at end of file
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
index 58982e0..3f22d79 100644
--- a/tests/test_visualize.py
+++ b/tests/test_visualize.py
@@ -15,3 +15,7 @@ def test_frequency_distribution_of_words(v, capsys):
     captured = capsys.readouterr()
     print(captured.out)
 
+def test_distribution_by_topic(v, capsys):
+    v.plot_distribution_by_topic(v.data, folder_path='/tmp/distribution_by_topic.png')
+    captured = capsys.readouterr()
+    print(captured.out)

From 4d30f7cdaf9fe82890a34bb0b7903e67de87f6b9 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 21:01:46 +0000
Subject: [PATCH 15/35] feat: add wordcloud to dependencies for enhanced
 visualization capabilities

---
 pyproject.toml | 1 +
 setup.cfg      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 4e87878..03ef32a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
     "requests",
     "gensim",
     "seaborn",
+    "wordcloud",
 ]
 dynamic = ["version"]
 
diff --git a/setup.cfg b/setup.cfg
index f0582c1..c85767c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -69,6 +69,7 @@ install_requires =
     requests
     gensim
     seaborn
+    wordcloud
 
 [options.packages.find]
 where = src

From 28ee69bacbf15d7a235947c734a51e22a444a233 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 21:12:45 +0000
Subject: [PATCH 16/35] feat: refactor build_lda_model method to return topics
 and improve logic; add topics fixture for testing

---
 src/qrmine/cluster.py   | 12 +++----
 src/qrmine/visualize.py |  4 ++-
 tests/test_nlp.py       |  2 ++
 tests/test_visualize.py | 80 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
index 34f583b..af36476 100644
--- a/src/qrmine/cluster.py
+++ b/src/qrmine/cluster.py
@@ -68,12 +68,12 @@ def process(self):
         # Build the LDA (Latent Dirichlet Allocation) model
 
     def build_lda_model(self):
-        if self._lda_model is not None:
-            return
-        self._lda_model = LdaModel(
-            self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes
-        )
-
+        if self._lda_model is None:
+            self._lda_model = LdaModel(
+                self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes
+            )
+        return self._lda_model.show_topics(formatted=False)
+    
     def print_topics(self, num_words=5):
         if self._lda_model is None:
             self.build_lda_model()
diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
index 3bb0548..04531cb 100644
--- a/src/qrmine/visualize.py
+++ b/src/qrmine/visualize.py
@@ -3,6 +3,8 @@
 import seaborn as sns
 import numpy as np
 import matplotlib.colors as mcolors
+from wordcloud import WordCloud, STOPWORDS
+
 
 class QRVisualize:
     def __init__(self, data: pd.DataFrame = None):
@@ -64,4 +66,4 @@ def plot_distribution_by_topic(self, df=None, folder_path=None):
         # save
         if folder_path:
             plt.savefig(folder_path)
-            plt.close()
\ No newline at end of file
+            plt.close()
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index c2ef347..ad82495 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -66,6 +66,8 @@ def test_cluster_topics(corpus_fixture, capsys, cluster):
     captured = capsys.readouterr()
     print(captured.out)
     assert 'topic' in captured.out
+
+    print(cluster.build_lda_model())
     # Format
     df_dominant_topic = cluster.format_topics_sentences()
     # Format the output
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
index 3f22d79..67105ee 100644
--- a/tests/test_visualize.py
+++ b/tests/test_visualize.py
@@ -10,6 +10,86 @@ def v():
     _v = QRVisualize(data)
     return _v
 
+@pytest.fixture
+def topics():
+    return [
+        (
+            0,
+            [
+                (".", 0.095292516),
+                (",", 0.053392828),
+                ("category", 0.032462463),
+                ("coding", 0.032456465),
+                ("open", 0.032437164),
+                ("QRMine", 0.03243305),
+                ("datum", 0.021980358),
+                ("researcher", 0.021978099),
+                ("theory", 0.011536299),
+                ("GT", 0.011533132),
+            ],
+        ),
+        (
+            1,
+            [
+                (".", 0.007783216),
+                (",", 0.007773952),
+                ("open", 0.007728422),
+                ("researcher", 0.0077227736),
+                ("coding", 0.007722049),
+                ("category", 0.007721938),
+                ("datum", 0.007717547),
+                ("QRMine", 0.007716193),
+                ("dissect", 0.0077070068),
+                ("support", 0.0077060354),
+            ],
+        ),
+        (
+            2,
+            [
+                (",", 0.05126711),
+                (".", 0.05125151),
+                ("theory", 0.038604487),
+                ("category", 0.03227912),
+                ("GT", 0.032278605),
+                ("\n", 0.029119665),
+                ("comparison", 0.025947908),
+                ("coding", 0.025941858),
+                ("incident", 0.019622542),
+                (")", 0.019619444),
+            ],
+        ),
+        (
+            3,
+            [
+                (".", 0.007849805),
+                (",", 0.007837688),
+                ("theory", 0.00781459),
+                ("coding", 0.0078089647),
+                ("category", 0.0077514737),
+                ("GT", 0.0077493717),
+                ("datum", 0.007742789),
+                ("open", 0.0077355755),
+                ("\n", 0.0077245855),
+                ("researcher", 0.0077191954),
+            ],
+        ),
+        (
+            4,
+            [
+                (",", 0.007834569),
+                (".", 0.007812336),
+                ("coding", 0.0077863215),
+                ("category", 0.007759207),
+                ("theory", 0.0077459146),
+                ("GT", 0.0077370973),
+                ("code", 0.0077265715),
+                ("datum", 0.007720947),
+                ("open", 0.007720898),
+                ("comparison", 0.007720567),
+            ],
+        ),
+    ]
+
 def test_frequency_distribution_of_words(v, capsys):
     v.plot_frequency_distribution_of_words(v.data, folder_path='/tmp/frequency_distribution.png')
     captured = capsys.readouterr()

From 643d4698b233e89257e711051505ac486572b9c4 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 21:21:53 +0000
Subject: [PATCH 17/35] feat: add plot_wordcloud method to QRVisualize for
 visualizing topics; include corresponding test

---
 src/qrmine/visualize.py | 32 ++++++++++++++++++++++++++++++++
 tests/test_visualize.py |  5 +++++
 2 files changed, 37 insertions(+)

diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
index 04531cb..49dfc05 100644
--- a/src/qrmine/visualize.py
+++ b/src/qrmine/visualize.py
@@ -67,3 +67,35 @@ def plot_distribution_by_topic(self, df=None, folder_path=None):
         if folder_path:
             plt.savefig(folder_path)
             plt.close()
+
+    def plot_wordcloud(self, topics=None, folder_path=None):
+        cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
+
+        cloud = WordCloud(stopwords=STOPWORDS,
+                        background_color='white',
+                        width=2500,
+                        height=1800,
+                        max_words=10,
+                        colormap='tab10',
+                        color_func=lambda *args, **kwargs: cols[i],
+                        prefer_horizontal=1.0)
+
+        fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)
+
+        for i, ax in enumerate(axes.flatten()):
+            fig.add_subplot(ax)
+            topic_words = dict(topics[i][1])
+            cloud.generate_from_frequencies(topic_words, max_font_size=300)
+            plt.gca().imshow(cloud)
+            plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
+            plt.gca().axis('off')
+
+        plt.subplots_adjust(wspace=0, hspace=0)
+        plt.axis("off")
+        plt.margins(x=0, y=0)
+        plt.tight_layout()
+        plt.show()
+        # save
+        if folder_path:
+            plt.savefig(folder_path)
+            plt.close()
\ No newline at end of file
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
index 67105ee..ae35c39 100644
--- a/tests/test_visualize.py
+++ b/tests/test_visualize.py
@@ -99,3 +99,8 @@ def test_distribution_by_topic(v, capsys):
     v.plot_distribution_by_topic(v.data, folder_path='/tmp/distribution_by_topic.png')
     captured = capsys.readouterr()
     print(captured.out)
+
+def test_plot_wordcloud(v, topics, capsys):
+    v.plot_wordcloud(topics, folder_path='/tmp/wordcloud.png')
+    captured = capsys.readouterr()
+    print(captured.out)

From bb5699655fdfba9497c37f1b65d74aac3492e16d Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 21:25:51 +0000
Subject: [PATCH 18/35] feat: update plot_wordcloud method parameters for
 improved visualization; reduce size and max words

---
 src/qrmine/visualize.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
index 49dfc05..6daca87 100644
--- a/src/qrmine/visualize.py
+++ b/src/qrmine/visualize.py
@@ -73,9 +73,9 @@ def plot_wordcloud(self, topics=None, folder_path=None):
 
         cloud = WordCloud(stopwords=STOPWORDS,
                         background_color='white',
-                        width=2500,
-                        height=1800,
-                        max_words=10,
+                        width=250,
+                        height=180,
+                        max_words=5,
                         colormap='tab10',
                         color_func=lambda *args, **kwargs: cols[i],
                         prefer_horizontal=1.0)

From fc49b94be83c4c56b5bea6888d3f9a70cf33bcdf Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 21:46:33 +0000
Subject: [PATCH 19/35] feat: add processed_docs property and
 topics_per_document method to ClusterDocs; enhance document processing
 capabilities

---
 src/qrmine/cluster.py   |  18 +++++-
 src/qrmine/visualize.py | 134 +++++++++++++++++++++++++++++++++++++++-
 tests/test_nlp.py       |   2 +
 3 files changed, 151 insertions(+), 3 deletions(-)

diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
index af36476..4370d8a 100644
--- a/src/qrmine/cluster.py
+++ b/src/qrmine/cluster.py
@@ -34,6 +34,10 @@ def num_topics(self):
     def passes(self):
         return self._passes
 
+    @property
+    def processed_docs(self):
+        return self._processed_docs
+
     @documents.setter
     def documents(self, documents):
         self._documents = documents
@@ -73,7 +77,7 @@ def build_lda_model(self):
                 self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes
             )
         return self._lda_model.show_topics(formatted=False)
-    
+
     def print_topics(self, num_words=5):
         if self._lda_model is None:
             self.build_lda_model()
@@ -131,3 +135,15 @@ def most_representative_docs(self):
             )
 
         return sent_topics_sorteddf_mallet
+
+
+    def topics_per_document(self, start=0, end=1):
+        corpus_sel = self._corpus[start:end]
+        dominant_topics = []
+        topic_percentages = []
+        for i, corp in enumerate(corpus_sel):
+            topic_percs = self._lda_model[corp]
+            dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
+            dominant_topics.append((i, dominant_topic))
+            topic_percentages.append(topic_percs)
+        return (dominant_topics, topic_percentages)
diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
index 6daca87..bce60e5 100644
--- a/src/qrmine/visualize.py
+++ b/src/qrmine/visualize.py
@@ -4,7 +4,9 @@
 import numpy as np
 import matplotlib.colors as mcolors
 from wordcloud import WordCloud, STOPWORDS
-
+from collections import Counter
+from matplotlib.patches import Rectangle
+from sklearn.manifold import TSNE
 
 class QRVisualize:
     def __init__(self, data: pd.DataFrame = None):
@@ -98,4 +100,132 @@ def plot_wordcloud(self, topics=None, folder_path=None):
         # save
         if folder_path:
             plt.savefig(folder_path)
-            plt.close()
\ No newline at end of file
+            plt.close()
+
+    def plot_importance(self, topics=None, processed_docs=None, folder_path=None):
+        data_flat = [w for w_list in processed_docs for w in w_list]
+        counter = Counter(data_flat)
+
+        out = []
+        for i, topic in topics:
+            for word, weight in topic:
+                out.append([word, i, weight, counter[word]])
+
+        df = pd.DataFrame(out, columns=["word", "topic_id", "importance", "word_count"])
+
+        # Plot Word Count and Weights of Topic Keywords
+        fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160)
+        cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
+        for i, ax in enumerate(axes.flatten()):
+            ax.bar(
+                x="word",
+                height="word_count",
+                data=df.loc[df.topic_id == i, :],
+                color=cols[i],
+                width=0.5,
+                alpha=0.3,
+                label="Word Count",
+            )
+            ax_twin = ax.twinx()
+            ax_twin.bar(
+                x="word",
+                height="importance",
+                data=df.loc[df.topic_id == i, :],
+                color=cols[i],
+                width=0.2,
+                label="Weights",
+            )
+            ax.set_ylabel("Word Count", color=cols[i])
+            ax_twin.set_ylim(0, 0.030)
+            ax.set_ylim(0, 3500)
+            ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16)
+            ax.tick_params(axis="y", left=False)
+            ax.set_xticklabels(
+                df.loc[df.topic_id == i, "word"], rotation=30, horizontalalignment="right"
+            )
+            ax.legend(loc="upper left")
+            ax_twin.legend(loc="upper right")
+
+        fig.tight_layout(w_pad=2)
+        fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05)
+        plt.show()
+        # save
+        if folder_path:
+            plt.savefig(folder_path)
+            plt.close()
+
+
+    def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13):
+        corp = corpus[start:end]
+        mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()]
+
+        fig, axes = plt.subplots(end-start, 1, figsize=(20, (end-start)*0.95), dpi=160)
+        axes[0].axis('off')
+        for i, ax in enumerate(axes):
+            if i > 0:
+                corp_cur = corp[i-1]
+                topic_percs, wordid_topics, wordid_phivalues = lda_model[corp_cur]
+                word_dominanttopic = [(lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics]
+                ax.text(0.01, 0.5, "Doc " + str(i-1) + ": ", verticalalignment='center',
+                        fontsize=16, color='black', transform=ax.transAxes, fontweight=700)
+
+                # Draw Rectange
+                topic_percs_sorted = sorted(topic_percs, key=lambda x: (x[1]), reverse=True)
+                ax.add_patch(Rectangle((0.0, 0.05), 0.99, 0.90, fill=None, alpha=1,
+                                    color=mycolors[topic_percs_sorted[0][0]], linewidth=2))
+
+                word_pos = 0.06
+                for j, (word, topics) in enumerate(word_dominanttopic):
+                    if j < 14:
+                        ax.text(word_pos, 0.5, word,
+                                horizontalalignment='left',
+                                verticalalignment='center',
+                                fontsize=16, color=mycolors[topics],
+                                transform=ax.transAxes, fontweight=700)
+                        word_pos += .009 * len(word)  # to move the word for the next iter
+                        ax.axis('off')
+                ax.text(word_pos, 0.5, '. . .',
+                        horizontalalignment='left',
+                        verticalalignment='center',
+                        fontsize=16, color='black',
+                        transform=ax.transAxes)
+
+        plt.subplots_adjust(wspace=0, hspace=0)
+        plt.suptitle('Sentence Topic Coloring for Documents: ' + str(start) + ' to ' + str(end-2), fontsize=22, y=0.95, fontweight=700)
+        plt.tight_layout()
+        plt.show()
+
+    def cluster_chart(self, lda_model=None, corpus=None, n_topics=4, folder_path=None):
+        # Get topic weights
+        topic_weights = []
+        for i, row_list in enumerate(lda_model[corpus]):
+            topic_weights.append([w for i, w in row_list[0]])
+
+        # Array of topic weights
+        arr = pd.DataFrame(topic_weights).fillna(0).values
+
+        # Keep the well separated points (optional)
+        arr = arr[np.amax(arr, axis=1) > 0.35]
+
+        # Dominant topic number in each doc
+        topic_num = np.argmax(arr, axis=1)
+
+        # tSNE Dimension Reduction
+        tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
+        tsne_lda = tsne_model.fit_transform(arr)
+
+
+        # Plot
+        plt.figure(figsize=(16, 10), dpi=160)
+        for i in range(n_topics):
+            plt.scatter(tsne_lda[topic_num == i, 0], tsne_lda[topic_num == i, 1], label=str(i), alpha=0.5)
+        plt.title('t-SNE Clustering of Topics', fontsize=22)
+        plt.xlabel('t-SNE Dimension 1', fontsize=16)
+        plt.ylabel('t-SNE Dimension 2', fontsize=16)
+        plt.legend(title='Topic Number', loc='upper right')
+        plt.show()
+        # save
+        if folder_path:
+            plt.savefig(folder_path)
+            plt.close()
+
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index ad82495..17b4c52 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -68,6 +68,8 @@ def test_cluster_topics(corpus_fixture, capsys, cluster):
     assert 'topic' in captured.out
 
     print(cluster.build_lda_model())
+
+    print(cluster.topics_per_document())
     # Format
     df_dominant_topic = cluster.format_topics_sentences()
     # Format the output

From ddc6b96a025000ae6969a0919c44477783236669 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 21:56:32 +0000
Subject: [PATCH 20/35] feat: add copyright notice and improve code formatting
 in cluster and visualize modules; enhance test readability in test files

---
 src/qrmine/cluster.py   |  60 ++++++--
 src/qrmine/visualize.py | 297 ++++++++++++++++++++++++++++++----------
 tests/test_nlp.py       |  29 ++--
 tests/test_visualize.py |  14 +-
 4 files changed, 309 insertions(+), 91 deletions(-)

diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py
index 4370d8a..3e68ac3 100644
--- a/src/qrmine/cluster.py
+++ b/src/qrmine/cluster.py
@@ -1,8 +1,30 @@
+"""
+Copyright (C) 2025 Bell Eapen
+
+This file is part of qrmine.
+
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with qrmine.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+from pprint import pprint
+
+import pandas as pd
 import spacy
 from gensim import corpora
 from gensim.models.ldamodel import LdaModel
-import pandas as pd
-from pprint import pprint
+
+
 class ClusterDocs:
 
     def __init__(self, documents=[], titles=[]):
@@ -74,7 +96,10 @@ def process(self):
     def build_lda_model(self):
         if self._lda_model is None:
             self._lda_model = LdaModel(
-                self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes
+                self._corpus,
+                num_topics=self._num_topics,
+                id2word=self._dictionary,
+                passes=self._passes,
             )
         return self._lda_model.show_topics(formatted=False)
 
@@ -88,9 +113,13 @@ def print_clusters(self):
         if self._lda_model is None:
             self.build_lda_model()
         # Perform semantic clustering
-        for i, doc in enumerate(self._processed_docs):  # Changed from get_processed_docs() to _documents
+        for i, doc in enumerate(
+            self._processed_docs
+        ):  # Changed from get_processed_docs() to _documents
             bow = self._dictionary.doc2bow(doc)
-            print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}")
+            print(
+                f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}"
+            )
 
     def format_topics_sentences(self):
         self.build_lda_model()
@@ -107,12 +136,24 @@ def format_topics_sentences(self):
                 if j == 0:  # => dominant topic
                     wp = self._lda_model.show_topic(topic_num)
                     topic_keywords = ", ".join([word for word, prop in wp])
-                    new_row = pd.DataFrame([[int(topic_num), round(prop_topic, 4), topic_keywords]],
-                                           columns=["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"])
-                    sent_topics_df = pd.concat([sent_topics_df, new_row], ignore_index=True)
+                    new_row = pd.DataFrame(
+                        [[int(topic_num), round(prop_topic, 4), topic_keywords]],
+                        columns=[
+                            "Dominant_Topic",
+                            "Perc_Contribution",
+                            "Topic_Keywords",
+                        ],
+                    )
+                    sent_topics_df = pd.concat(
+                        [sent_topics_df, new_row], ignore_index=True
+                    )
                 else:
                     break
-        sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
+        sent_topics_df.columns = [
+            "Dominant_Topic",
+            "Perc_Contribution",
+            "Topic_Keywords",
+        ]
 
         # Add original text to the end of the output
         contents = pd.Series(self._processed_docs)
@@ -136,7 +177,6 @@ def most_representative_docs(self):
 
         return sent_topics_sorteddf_mallet
 
-
     def topics_per_document(self, start=0, end=1):
         corpus_sel = self._corpus[start:end]
         dominant_topics = []
diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py
index bce60e5..4a7fc25 100644
--- a/src/qrmine/visualize.py
+++ b/src/qrmine/visualize.py
@@ -1,12 +1,34 @@
-import pandas as pd
+"""
+Copyright (C) 2025 Bell Eapen
+
+This file is part of qrmine.
+
+qrmine is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+qrmine is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with qrmine.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+from collections import Counter
+
+import matplotlib.colors as mcolors
 import matplotlib.pyplot as plt
-import seaborn as sns
 import numpy as np
-import matplotlib.colors as mcolors
-from wordcloud import WordCloud, STOPWORDS
-from collections import Counter
+import pandas as pd
+import seaborn as sns
 from matplotlib.patches import Rectangle
+from matplotlib.ticker import FuncFormatter
 from sklearn.manifold import TSNE
+from wordcloud import STOPWORDS, WordCloud
+
 
 class QRVisualize:
     def __init__(self, data: pd.DataFrame = None):
@@ -24,18 +46,20 @@ def plot_frequency_distribution_of_words(self, df=None, folder_path=None):
         doc_lens = [len(d) for d in df.Text]
 
         # Plot
-        plt.figure(figsize=(16,7), dpi=160)
-        plt.hist(doc_lens, bins = 1000, color='navy')
+        plt.figure(figsize=(16, 7), dpi=160)
+        plt.hist(doc_lens, bins=1000, color="navy")
         plt.text(750, 100, "Mean   : " + str(round(np.mean(doc_lens))))
-        plt.text(750,  90, "Median : " + str(round(np.median(doc_lens))))
-        plt.text(750,  80, "Stdev   : " + str(round(np.std(doc_lens))))
-        plt.text(750,  70, "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01))))
-        plt.text(750,  60, "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99))))
-
-        plt.gca().set(xlim=(0, 1000), ylabel='Number of Documents', xlabel='Document Word Count')
+        plt.text(750, 90, "Median : " + str(round(np.median(doc_lens))))
+        plt.text(750, 80, "Stdev   : " + str(round(np.std(doc_lens))))
+        plt.text(750, 70, "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01))))
+        plt.text(750, 60, "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99))))
+
+        plt.gca().set(
+            xlim=(0, 1000), ylabel="Number of Documents", xlabel="Document Word Count"
+        )
         plt.tick_params(size=16)
-        plt.xticks(np.linspace(0,1000,9))
-        plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
+        plt.xticks(np.linspace(0, 1000, 9))
+        plt.title("Distribution of Document Word Counts", fontdict=dict(size=22))
         plt.show()
         # save
         if folder_path:
@@ -46,24 +70,30 @@ def plot_distribution_by_topic(self, df=None, folder_path=None):
         if df is None:
             df = self.data
         # Plot
-        cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
+        cols = [
+            color for name, color in mcolors.TABLEAU_COLORS.items()
+        ]  # more colors: 'mcolors.XKCD_COLORS'
 
-        fig, axes = plt.subplots(2,2,figsize=(16,14), dpi=160, sharex=True, sharey=True)
+        fig, axes = plt.subplots(
+            2, 2, figsize=(16, 14), dpi=160, sharex=True, sharey=True
+        )
 
         for i, ax in enumerate(axes.flatten()):
             df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :]
             doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
-            ax.hist(doc_lens, bins = 1000, color=cols[i])
-            ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i])
+            ax.hist(doc_lens, bins=1000, color=cols[i])
+            ax.tick_params(axis="y", labelcolor=cols[i], color=cols[i])
             sns.kdeplot(doc_lens, color="black", shade=False, ax=ax.twinx())
-            ax.set(xlim=(0, 1000), xlabel='Document Word Count')
-            ax.set_ylabel('Number of Documents', color=cols[i])
-            ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=cols[i]))
+            ax.set(xlim=(0, 1000), xlabel="Document Word Count")
+            ax.set_ylabel("Number of Documents", color=cols[i])
+            ax.set_title("Topic: " + str(i), fontdict=dict(size=16, color=cols[i]))
 
         fig.tight_layout()
         fig.subplots_adjust(top=0.90)
-        plt.xticks(np.linspace(0,1000,9))
-        fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
+        plt.xticks(np.linspace(0, 1000, 9))
+        fig.suptitle(
+            "Distribution of Document Word Counts by Dominant Topic", fontsize=22
+        )
         plt.show()
         # save
         if folder_path:
@@ -71,26 +101,30 @@ def plot_distribution_by_topic(self, df=None, folder_path=None):
             plt.close()
 
     def plot_wordcloud(self, topics=None, folder_path=None):
-        cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
-
-        cloud = WordCloud(stopwords=STOPWORDS,
-                        background_color='white',
-                        width=250,
-                        height=180,
-                        max_words=5,
-                        colormap='tab10',
-                        color_func=lambda *args, **kwargs: cols[i],
-                        prefer_horizontal=1.0)
-
-        fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)
+        cols = [
+            color for name, color in mcolors.TABLEAU_COLORS.items()
+        ]  # more colors: 'mcolors.XKCD_COLORS'
+
+        cloud = WordCloud(
+            stopwords=STOPWORDS,
+            background_color="white",
+            width=250,
+            height=180,
+            max_words=5,
+            colormap="tab10",
+            color_func=lambda *args, **kwargs: cols[i],
+            prefer_horizontal=1.0,
+        )
+
+        fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)
 
         for i, ax in enumerate(axes.flatten()):
             fig.add_subplot(ax)
             topic_words = dict(topics[i][1])
             cloud.generate_from_frequencies(topic_words, max_font_size=300)
             plt.gca().imshow(cloud)
-            plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
-            plt.gca().axis('off')
+            plt.gca().set_title("Topic " + str(i), fontdict=dict(size=16))
+            plt.gca().axis("off")
 
         plt.subplots_adjust(wspace=0, hspace=0)
         plt.axis("off")
@@ -141,7 +175,9 @@ def plot_importance(self, topics=None, processed_docs=None, folder_path=None):
             ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16)
             ax.tick_params(axis="y", left=False)
             ax.set_xticklabels(
-                df.loc[df.topic_id == i, "word"], rotation=30, horizontalalignment="right"
+                df.loc[df.topic_id == i, "word"],
+                rotation=30,
+                horizontalalignment="right",
             )
             ax.legend(loc="upper left")
             ax_twin.legend(loc="upper right")
@@ -154,44 +190,87 @@ def plot_importance(self, topics=None, processed_docs=None, folder_path=None):
             plt.savefig(folder_path)
             plt.close()
 
-
     def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13):
         corp = corpus[start:end]
         mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()]
 
-        fig, axes = plt.subplots(end-start, 1, figsize=(20, (end-start)*0.95), dpi=160)
-        axes[0].axis('off')
+        fig, axes = plt.subplots(
+            end - start, 1, figsize=(20, (end - start) * 0.95), dpi=160
+        )
+        axes[0].axis("off")
         for i, ax in enumerate(axes):
             if i > 0:
-                corp_cur = corp[i-1]
+                corp_cur = corp[i - 1]
                 topic_percs, wordid_topics, wordid_phivalues = lda_model[corp_cur]
-                word_dominanttopic = [(lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics]
-                ax.text(0.01, 0.5, "Doc " + str(i-1) + ": ", verticalalignment='center',
-                        fontsize=16, color='black', transform=ax.transAxes, fontweight=700)
+                word_dominanttopic = [
+                    (lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics
+                ]
+                ax.text(
+                    0.01,
+                    0.5,
+                    "Doc " + str(i - 1) + ": ",
+                    verticalalignment="center",
+                    fontsize=16,
+                    color="black",
+                    transform=ax.transAxes,
+                    fontweight=700,
+                )
 
                 # Draw Rectange
-                topic_percs_sorted = sorted(topic_percs, key=lambda x: (x[1]), reverse=True)
-                ax.add_patch(Rectangle((0.0, 0.05), 0.99, 0.90, fill=None, alpha=1,
-                                    color=mycolors[topic_percs_sorted[0][0]], linewidth=2))
+                topic_percs_sorted = sorted(
+                    topic_percs, key=lambda x: (x[1]), reverse=True
+                )
+                ax.add_patch(
+                    Rectangle(
+                        (0.0, 0.05),
+                        0.99,
+                        0.90,
+                        fill=None,
+                        alpha=1,
+                        color=mycolors[topic_percs_sorted[0][0]],
+                        linewidth=2,
+                    )
+                )
 
                 word_pos = 0.06
                 for j, (word, topics) in enumerate(word_dominanttopic):
                     if j < 14:
-                        ax.text(word_pos, 0.5, word,
-                                horizontalalignment='left',
-                                verticalalignment='center',
-                                fontsize=16, color=mycolors[topics],
-                                transform=ax.transAxes, fontweight=700)
-                        word_pos += .009 * len(word)  # to move the word for the next iter
-                        ax.axis('off')
-                ax.text(word_pos, 0.5, '. . .',
-                        horizontalalignment='left',
-                        verticalalignment='center',
-                        fontsize=16, color='black',
-                        transform=ax.transAxes)
+                        ax.text(
+                            word_pos,
+                            0.5,
+                            word,
+                            horizontalalignment="left",
+                            verticalalignment="center",
+                            fontsize=16,
+                            color=mycolors[topics],
+                            transform=ax.transAxes,
+                            fontweight=700,
+                        )
+                        word_pos += 0.009 * len(
+                            word
+                        )  # to move the word for the next iter
+                        ax.axis("off")
+                ax.text(
+                    word_pos,
+                    0.5,
+                    ". . .",
+                    horizontalalignment="left",
+                    verticalalignment="center",
+                    fontsize=16,
+                    color="black",
+                    transform=ax.transAxes,
+                )
 
         plt.subplots_adjust(wspace=0, hspace=0)
-        plt.suptitle('Sentence Topic Coloring for Documents: ' + str(start) + ' to ' + str(end-2), fontsize=22, y=0.95, fontweight=700)
+        plt.suptitle(
+            "Sentence Topic Coloring for Documents: "
+            + str(start)
+            + " to "
+            + str(end - 2),
+            fontsize=22,
+            y=0.95,
+            fontweight=700,
+        )
         plt.tight_layout()
         plt.show()
 
@@ -211,21 +290,101 @@ def cluster_chart(self, lda_model=None, corpus=None, n_topics=4, folder_path=Non
         topic_num = np.argmax(arr, axis=1)
 
         # tSNE Dimension Reduction
-        tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
+        tsne_model = TSNE(
+            n_components=2, verbose=1, random_state=0, angle=0.99, init="pca"
+        )
         tsne_lda = tsne_model.fit_transform(arr)
 
-
         # Plot
         plt.figure(figsize=(16, 10), dpi=160)
         for i in range(n_topics):
-            plt.scatter(tsne_lda[topic_num == i, 0], tsne_lda[topic_num == i, 1], label=str(i), alpha=0.5)
-        plt.title('t-SNE Clustering of Topics', fontsize=22)
-        plt.xlabel('t-SNE Dimension 1', fontsize=16)
-        plt.ylabel('t-SNE Dimension 2', fontsize=16)
-        plt.legend(title='Topic Number', loc='upper right')
+            plt.scatter(
+                tsne_lda[topic_num == i, 0],
+                tsne_lda[topic_num == i, 1],
+                label=str(i),
+                alpha=0.5,
+            )
+        plt.title("t-SNE Clustering of Topics", fontsize=22)
+        plt.xlabel("t-SNE Dimension 1", fontsize=16)
+        plt.ylabel("t-SNE Dimension 2", fontsize=16)
+        plt.legend(title="Topic Number", loc="upper right")
         plt.show()
         # save
         if folder_path:
             plt.savefig(folder_path)
             plt.close()
 
+    def most_discussed_topics(
+        self, lda_model, dominant_topics, topic_percentages, folder_path=None
+    ):
+
+        # Distribution of Dominant Topics in Each Document
+        df = pd.DataFrame(dominant_topics, columns=["Document_Id", "Dominant_Topic"])
+        dominant_topic_in_each_doc = df.groupby("Dominant_Topic").size()
+        df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(
+            name="count"
+        ).reset_index()
+
+        # Total Topic Distribution by actual weight
+        topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
+        df_topic_weightage_by_doc = (
+            topic_weightage_by_doc.sum().to_frame(name="count").reset_index()
+        )
+
+        # Top 3 Keywords for each Topic
+        topic_top3words = [
+            (i, topic)
+            for i, topics in lda_model.show_topics(formatted=False)
+            for j, (topic, wt) in enumerate(topics)
+            if j < 3
+        ]
+
+        df_top3words_stacked = pd.DataFrame(
+            topic_top3words, columns=["topic_id", "words"]
+        )
+        df_top3words = df_top3words_stacked.groupby("topic_id").agg(", \n".join)
+        df_top3words.reset_index(level=0, inplace=True)
+
+        # Plot
+        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True)
+
+        # Topic Distribution by Dominant Topics
+        ax1.bar(
+            x="Dominant_Topic",
+            height="count",
+            data=df_dominant_topic_in_each_doc,
+            width=0.5,
+            color="firebrick",
+        )
+        ax1.set_xticks(
+            range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__())
+        )
+        tick_formatter = FuncFormatter(
+            lambda x, pos: "Topic "
+            + str(x)
+            + "\n"
+            + df_top3words.loc[df_top3words.topic_id == x, "words"].values[0]
+        )
+        ax1.xaxis.set_major_formatter(tick_formatter)
+        ax1.set_title("Number of Documents by Dominant Topic", fontdict=dict(size=10))
+        ax1.set_ylabel("Number of Documents")
+        ax1.set_ylim(0, 1000)
+
+        # Topic Distribution by Topic Weights
+        ax2.bar(
+            x="index",
+            height="count",
+            data=df_topic_weightage_by_doc,
+            width=0.5,
+            color="steelblue",
+        )
+        ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__()))
+        ax2.xaxis.set_major_formatter(tick_formatter)
+        ax2.set_title("Number of Documents by Topic Weightage", fontdict=dict(size=10))
+
+        plt.show()
+
+        # save
+        if folder_path:
+            plt.savefig(folder_path)
+            plt.close()
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 17b4c52..6c922a5 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -5,33 +5,41 @@
 def corpus_fixture():
     from pkg_resources import resource_filename
     from src.qrmine import ReadData
+
     corpus = ReadData()
-    file_path = resource_filename('src.qrmine.resources', 'interview.txt')
+    file_path = resource_filename("src.qrmine.resources", "interview.txt")
     corpus.read_file(file_path)
     return corpus
 
+
 # instannce of Qrmine as fixture
 @pytest.fixture
 def q():
     from src.qrmine import Qrmine
+
     _q = Qrmine()
     return _q
 
+
 @pytest.fixture
 def cluster():
     from src.qrmine import ClusterDocs
+
     _cluster = ClusterDocs()
     return _cluster
 
+
 # Ref: https://docs.pytest.org/en/latest/capture.html
 def test_generate_dict(corpus_fixture, capsys, q):
     from src.qrmine import Content
+
     num = 10
     all_interviews = Content(corpus_fixture.content)
     q.print_dict(all_interviews, num)
     captured = capsys.readouterr()
     print(captured.out)
-    assert 'code' in captured.out
+    assert "code" in captured.out
+
 
 def test_generate_topics(corpus_fixture, capsys, q):
     q.content = corpus_fixture
@@ -39,21 +47,24 @@ def test_generate_topics(corpus_fixture, capsys, q):
     q.print_topics()
     captured = capsys.readouterr()
     print(captured.out)
-    assert 'TOPIC' in captured.out
+    assert "TOPIC" in captured.out
+
 
 def test_category_basket(corpus_fixture, capsys, q):
     q.content = corpus_fixture
     print(q.category_basket())
     captured = capsys.readouterr()
     print(captured.out)
-    assert 'theory' in captured.out
+    assert "theory" in captured.out
+
 
 def test_category_association(corpus_fixture, capsys, q):
     q.content = corpus_fixture
     print(q.category_association())
     captured = capsys.readouterr()
     print(captured.out)
-    assert 'theory' in captured.out
+    assert "theory" in captured.out
+
 
 def test_cluster_topics(corpus_fixture, capsys, cluster):
     cluster.documents = corpus_fixture.documents
@@ -61,11 +72,11 @@ def test_cluster_topics(corpus_fixture, capsys, cluster):
     cluster.print_clusters()
     captured = capsys.readouterr()
     print(captured.out)
-    assert 'Document' in captured.out
+    assert "Document" in captured.out
     cluster.print_topics()
     captured = capsys.readouterr()
     print(captured.out)
-    assert 'topic' in captured.out
+    assert "topic" in captured.out
 
     print(cluster.build_lda_model())
 
@@ -81,8 +92,8 @@ def test_cluster_topics(corpus_fixture, capsys, cluster):
         "Text",
     ]
     print(df_dominant_topic.head(10))
-    assert 'Document_No' in df_dominant_topic.columns
+    assert "Document_No" in df_dominant_topic.columns
 
     df_sorted = cluster.most_representative_docs()
     print(df_sorted.head(10))
-    assert 'Dominant_Topic' in df_sorted.columns
\ No newline at end of file
+    assert "Dominant_Topic" in df_sorted.columns
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
index ae35c39..32d5e4e 100644
--- a/tests/test_visualize.py
+++ b/tests/test_visualize.py
@@ -2,14 +2,17 @@
 import pandas as pd
 from src.qrmine.visualize import QRVisualize
 
+
 @pytest.fixture
 def v():
     from pkg_resources import resource_filename
+
     file_path = resource_filename("src.qrmine.resources", "df_dominant_topic.csv")
     data = pd.read_csv(file_path)
     _v = QRVisualize(data)
     return _v
 
+
 @pytest.fixture
 def topics():
     return [
@@ -90,17 +93,22 @@ def topics():
         ),
     ]
 
+
 def test_frequency_distribution_of_words(v, capsys):
-    v.plot_frequency_distribution_of_words(v.data, folder_path='/tmp/frequency_distribution.png')
+    v.plot_frequency_distribution_of_words(
+        v.data, folder_path="/tmp/frequency_distribution.png"
+    )
     captured = capsys.readouterr()
     print(captured.out)
 
+
 def test_distribution_by_topic(v, capsys):
-    v.plot_distribution_by_topic(v.data, folder_path='/tmp/distribution_by_topic.png')
+    v.plot_distribution_by_topic(v.data, folder_path="/tmp/distribution_by_topic.png")
     captured = capsys.readouterr()
     print(captured.out)
 
+
 def test_plot_wordcloud(v, topics, capsys):
-    v.plot_wordcloud(topics, folder_path='/tmp/wordcloud.png')
+    v.plot_wordcloud(topics, folder_path="/tmp/wordcloud.png")
     captured = capsys.readouterr()
     print(captured.out)

From 16c2af07dccc87c049cbd047e75879f794a07097 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 22:01:26 +0000
Subject: [PATCH 21/35] chore: update dependencies in requirements and
 dev-requirements files; upgrade filelock and jinja2 versions

---
 dev-requirements.txt |   6 +-
 requirements.txt     | 215 ++++++++++++++++++++++---------------------
 2 files changed, 112 insertions(+), 109 deletions(-)

diff --git a/dev-requirements.txt b/dev-requirements.txt
index f36f95c..b5cc0fc 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -42,8 +42,9 @@ docutils==0.21.2
     # via
     #   recommonmark
     #   sphinx
-filelock==3.16.1
+filelock==3.18.0
     # via
+    #   -c requirements.txt
     #   tox
     #   virtualenv
 idna==3.10
@@ -54,7 +55,7 @@ imagesize==1.4.1
     # via sphinx
 iniconfig==2.0.0
     # via pytest
-jinja2==3.1.4
+jinja2==3.1.6
     # via
     #   -c requirements.txt
     #   sphinx
@@ -141,6 +142,5 @@ virtualenv==20.27.1
     # via tox
 wheel==0.45.0
     # via
-    #   -c requirements.txt
     #   -r dev-requirements.in
     #   pip-tools
diff --git a/requirements.txt b/requirements.txt
index 260d413..8326516 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,9 @@
 # This file was autogenerated by uv via the following command:
 #    uv pip compile setup.cfg -o requirements.txt --universal
-absl-py==2.1.0
-    # via
-    #   tensorboard
-    #   tensorflow
-astunparse==1.6.3
-    # via tensorflow
 blis==0.7.11
     # via thinc
 cachetools==5.5.0
-    # via
-    #   google-auth
-    #   textacy
+    # via textacy
 catalogue==2.0.10
     # via
     #   spacy
@@ -28,7 +20,7 @@ click==8.1.7
     #   typer
 cloudpathlib==0.20.0
     # via weasel
-colorama==0.4.6 ; sys_platform == 'win32' or platform_system == 'Windows'
+colorama==0.4.6 ; sys_platform == 'win32'
     # via
     #   click
     #   tqdm
@@ -48,28 +40,16 @@ cymem==2.0.8
     #   thinc
 cytoolz==1.0.0
     # via textacy
-flatbuffers==24.3.25
-    # via tensorflow
+filelock==3.18.0
+    # via torch
 floret==0.10.5
     # via textacy
 fonttools==4.54.1
     # via matplotlib
-gast==0.4.0
-    # via tensorflow
-google-auth==2.36.0
-    # via
-    #   google-auth-oauthlib
-    #   tensorboard
-google-auth-oauthlib==1.0.0
-    # via tensorboard
-google-pasta==0.2.0
-    # via tensorflow
-grpcio==1.67.1
-    # via
-    #   tensorboard
-    #   tensorflow
-h5py==3.12.1
-    # via tensorflow
+fsspec==2025.3.2
+    # via torch
+gensim==4.3.3
+    # via qrmine (setup.cfg)
 idna==3.10
     # via requests
 imbalanced-learn==0.12.4
@@ -77,103 +57,142 @@ imbalanced-learn==0.12.4
 jellyfish==1.1.0
     # via textacy
 jinja2==3.1.6
-    # via spacy
+    # via
+    #   spacy
+    #   torch
 joblib==1.4.2
     # via
     #   imbalanced-learn
     #   mlxtend
     #   scikit-learn
     #   textacy
-keras==2.13.1
-    # via tensorflow
 kiwisolver==1.4.7
     # via matplotlib
 langcodes==3.4.1
     # via spacy
 language-data==1.2.0
     # via langcodes
-libclang==18.1.1
-    # via tensorflow
 marisa-trie==1.2.1
     # via language-data
-markdown==3.7
-    # via tensorboard
 markdown-it-py==3.0.0
     # via rich
 markupsafe==3.0.2
-    # via
-    #   jinja2
-    #   werkzeug
+    # via jinja2
 matplotlib==3.9.2
     # via
     #   qrmine (setup.cfg)
     #   mlxtend
+    #   seaborn
+    #   wordcloud
 mdurl==0.1.2
     # via markdown-it-py
 mlxtend==0.23.2
     # via qrmine (setup.cfg)
+mpmath==1.3.0
+    # via sympy
 murmurhash==1.0.10
     # via
     #   preshed
     #   spacy
     #   thinc
 networkx==3.4.2
-    # via textacy
-numpy==1.24.3
+    # via
+    #   textacy
+    #   torch
+numpy==1.24.3 ; python_full_version < '3.12'
     # via
     #   blis
     #   contourpy
     #   floret
-    #   h5py
+    #   gensim
     #   imbalanced-learn
     #   matplotlib
     #   mlxtend
     #   pandas
     #   scikit-learn
     #   scipy
+    #   seaborn
     #   spacy
-    #   tensorboard
-    #   tensorflow
     #   textacy
     #   thinc
+    #   wordcloud
     #   xgboost
-nvidia-nccl-cu12==2.23.4 ; platform_machine != 'aarch64' and platform_system == 'Linux'
-    # via xgboost
-oauthlib==3.2.2
-    # via requests-oauthlib
-opt-einsum==3.4.0
-    # via tensorflow
+numpy==1.26.4 ; python_full_version >= '3.12'
+    # via
+    #   blis
+    #   contourpy
+    #   floret
+    #   gensim
+    #   imbalanced-learn
+    #   matplotlib
+    #   mlxtend
+    #   pandas
+    #   scikit-learn
+    #   scipy
+    #   seaborn
+    #   spacy
+    #   textacy
+    #   thinc
+    #   wordcloud
+    #   xgboost
+nvidia-cublas-cu12==12.6.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.6.80 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cuda-runtime-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cudnn-cu12==9.5.1.17 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cufft-cu12==11.3.0.4 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cufile-cu12==1.11.1.6 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-curand-cu12==10.3.7.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cusolver-cu12==11.7.1.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cusparse-cu12==12.5.4.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nccl-cu12==2.26.2 ; platform_machine != 'aarch64' and sys_platform == 'linux'
+    # via
+    #   torch
+    #   xgboost
+nvidia-nvjitlink-cu12==12.6.85 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cufft-cu12
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvtx-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
 packaging==24.2
     # via
     #   matplotlib
     #   spacy
-    #   tensorflow
     #   thinc
     #   weasel
-pandas==2.1.0 ; python_full_version >= '3.12'
-    # via
-    #   qrmine (setup.cfg)
-    #   mlxtend
-pandas==2.2.3 ; python_full_version < '3.12'
+pandas==2.2.3
     # via
     #   qrmine (setup.cfg)
     #   mlxtend
+    #   seaborn
 pillow==11.0.0
-    # via matplotlib
+    # via
+    #   matplotlib
+    #   wordcloud
 preshed==3.0.9
     # via
     #   spacy
     #   thinc
-protobuf==4.25.5
-    # via
-    #   tensorboard
-    #   tensorflow
-pyasn1==0.6.1
-    # via
-    #   pyasn1-modules
-    #   rsa
-pyasn1-modules==0.4.1
-    # via google-auth
 pydantic==1.10.19
     # via
     #   confection
@@ -184,6 +203,8 @@ pygments==2.18.0
     # via rich
 pyparsing==3.2.0
     # via matplotlib
+pypdf==5.4.0
+    # via qrmine (setup.cfg)
 pyphen==0.17.0
     # via textacy
 python-dateutil==2.9.0.post0
@@ -194,48 +215,44 @@ pytz==2024.2
     # via pandas
 requests==2.32.3
     # via
-    #   requests-oauthlib
+    #   qrmine (setup.cfg)
     #   spacy
-    #   tensorboard
     #   textacy
     #   vadersentiment
     #   weasel
-requests-oauthlib==2.0.0
-    # via google-auth-oauthlib
 rich==13.9.4
     # via typer
-rsa==4.9
-    # via google-auth
 scikit-learn==1.5.2
     # via
     #   qrmine (setup.cfg)
     #   imbalanced-learn
     #   mlxtend
     #   textacy
-scipy==1.14.1
+scipy==1.13.1
     # via
+    #   gensim
     #   imbalanced-learn
     #   mlxtend
     #   scikit-learn
     #   textacy
     #   xgboost
+seaborn==0.13.2
+    # via qrmine (setup.cfg)
 setuptools==75.3.0
     # via
     #   marisa-trie
     #   spacy
-    #   tensorboard
-    #   tensorflow
     #   thinc
+    #   torch
+    #   triton
 shellingham==1.5.4
     # via typer
 six==1.16.0
-    # via
-    #   astunparse
-    #   google-pasta
-    #   python-dateutil
-    #   tensorflow
+    # via python-dateutil
 smart-open==7.0.5
-    # via weasel
+    # via
+    #   gensim
+    #   weasel
 spacy==3.7.5
     # via
     #   qrmine (setup.cfg)
@@ -250,20 +267,8 @@ srsly==2.4.8
     #   spacy
     #   thinc
     #   weasel
-tensorboard==2.13.0
-    # via tensorflow
-tensorboard-data-server==0.7.2
-    # via tensorboard
-tensorflow==2.13.1
-    # via qrmine (setup.cfg)
-tensorflow-estimator==2.13.0
-    # via tensorflow
-tensorflow-io-gcs-filesystem==0.31.0
-    # via
-    #   qrmine (setup.cfg)
-    #   tensorflow
-termcolor==2.5.0
-    # via tensorflow
+sympy==1.14.0
+    # via torch
 textacy==0.13.0
     # via qrmine (setup.cfg)
 thinc==8.2.5
@@ -274,18 +279,22 @@ threadpoolctl==3.5.0
     #   scikit-learn
 toolz==1.0.0
     # via cytoolz
+torch==2.7.0
+    # via qrmine (setup.cfg)
 tqdm==4.67.0
     # via
     #   spacy
     #   textacy
+triton==3.3.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
 typer==0.13.0
     # via
     #   spacy
     #   weasel
-typing-extensions==4.5.0
+typing-extensions==4.13.2
     # via
     #   pydantic
-    #   tensorflow
+    #   torch
     #   typer
 tzdata==2024.2
     # via pandas
@@ -300,15 +309,9 @@ wasabi==1.1.3
     #   weasel
 weasel==0.4.1
     # via spacy
-werkzeug==3.1.3
-    # via tensorboard
-wheel==0.45.0
-    # via
-    #   astunparse
-    #   tensorboard
+wordcloud==1.9.4
+    # via qrmine (setup.cfg)
 wrapt==1.16.0
-    # via
-    #   smart-open
-    #   tensorflow
+    # via smart-open
 xgboost==2.1.2
     # via qrmine (setup.cfg)

From bd7e454cde7ce4255a62627cfae97d6b033aa0e7 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 19:28:05 -0500
Subject: [PATCH 22/35] Update pr.yml

---
 .github/workflows/pr.yml | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index f742724..25df3e7 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -13,27 +13,24 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: ["3.11"]
         os: [ubuntu-latest, macos-13, windows-latest]
     runs-on: ${{ matrix.os }}
     timeout-minutes: 20
     steps:
     - uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+    - name: Install uv
+      uses: astral-sh/setup-uv@v5
+    - name: "Set up Python"
+      uses: actions/setup-python@v5
       with:
-        python-version: ${{ matrix.python-version }}
-        cache: 'pip' # caching pip dependencies
+          python-version-file: "pyproject.toml"
     - name: run on mac
       if: startsWith(matrix.os, 'mac')
       run: |
         brew install libomp
-    - name: Install dependencies
+    - name: Install the project
       run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements.txt
+        uv sync --locked --all-extras --dev
         python -m spacy download en_core_web_sm
-    - name: Test with pytest
-      run: |
-        pip install pytest
-        pytest
+    - name: Run tests
+      run: uv run pytest tests

From 20d44cb200e5aeba65f447337ca71cded3315f18 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Thu, 1 May 2025 00:31:50 +0000
Subject: [PATCH 23/35] fix: specify exact torch version and remove deprecated
 setup.cfg; update tox.ini to eliminate redundant dependencies

---
 pyproject.toml |  18 +++++-
 setup.cfg      | 159 -------------------------------------------------
 tox.ini        |   6 --
 3 files changed, 15 insertions(+), 168 deletions(-)
 delete mode 100644 setup.cfg

diff --git a/pyproject.toml b/pyproject.toml
index 03ef32a..d63ec84 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
     "mlxtend",
     "spacy",
     "textacy",
-    "torch",
+    "torch==2.2.2",
     "pypdf",
     "requests",
     "gensim",
@@ -60,8 +60,6 @@ Documentation = "https://arxiv.org/abs/2003.13519"
 # Add here additional requirements for extra features, to install with:
 # `pip install qrmine[PDF]` like:
 # PDF = ReportLab; RXP
-gpu = ["torch[gpu]==2.1.1"]
-cpu = ["torch==2.1.1"]
 # Add here test requirements (semicolon/line-separated)
 testing = [
     "setuptools",
@@ -114,6 +112,19 @@ norecursedirs = [
     ".tox",
 ]
 
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[tool.uv.sources]
+torch = [
+  { index = "pytorch-cpu" },
+]
+torchvision = [
+  { index = "pytorch-cpu" },
+]
+
 [tool.aliases]
 release = "sdist bdist_wheel upload"
 
@@ -158,3 +169,4 @@ package = "qrmine"
 # Read more about the various options under:
 # https://setuptools.pypa.io/en/latest/userguide/declarative_config.html
 # https://setuptools.pypa.io/en/latest/references/keywords.html
+
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index c85767c..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,159 +0,0 @@
-# This file is used to configure your project.
-# Read more about the various options under:
-# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html
-# https://setuptools.pypa.io/en/latest/references/keywords.html
-
-[metadata]
-name = qrmine
-description = Qualitative Research support tools in Python!
-author = beapen
-author_email = github@gulfdoctor.net
-license = GPL-3.0-only
-# license_files = LICENSE.txt
-# long_description = file: README.rst
-# long_description_content_type = text/x-rst; charset=UTF-8
-long_description = file: README.md
-long_description_content_type = text/markdown
-url = https://github.com/dermatologist/nlp-qrmine
-# Add here related links, for example:
-project_urls =
-    Documentation = https://arxiv.org/abs/2003.13519
-#    Source = https://github.com/pyscaffold/pyscaffold/
-#    Changelog = https://pyscaffold.org/en/latest/changelog.html
-#    Tracker = https://github.com/pyscaffold/pyscaffold/issues
-#    Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
-#    Download = https://pypi.org/project/PyScaffold/#files
-#    Twitter = https://twitter.com/PyScaffold
-
-# Change if running only on Windows, Mac or Linux (comma-separated)
-platforms = any
-
-# Add here all kinds of additional classifiers as defined under
-# https://pypi.org/classifiers/
-classifiers =
-    Intended Audience :: Science/Research
-    Development Status :: 4 - Beta
-    Operating System :: OS Independent
-    Programming Language :: Python :: 3.11
-    Topic :: Scientific/Engineering :: Information Analysis
-
-
-[options]
-zip_safe = False
-packages = find_namespace:
-include_package_data = True
-package_dir =
-    =src
-
-# Require a min/specific Python version (comma-separated conditions)
-# python_requires = >=3.8
-
-# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
-# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
-# new major versions. This works if the required packages follow Semantic Versioning.
-# For more information, check out https://semver.org/.
-install_requires =
-    importlib-metadata; python_version<"3.8"
-    pandas
-    matplotlib
-    click
-    scikit-learn
-    imbalanced-learn
-    vaderSentiment
-    xgboost
-    mlxtend
-    spacy
-    textacy
-    torch
-    pypdf
-    requests
-    gensim
-    seaborn
-    wordcloud
-
-[options.packages.find]
-where = src
-exclude =
-    tests
-
-[options.extras_require]
-# Add here additional requirements for extra features, to install with:
-# `pip install qrmine[PDF]` like:
-# PDF = ReportLab; RXP
-    gpu =
-        torch[gpu]==2.1.1
-    cpu =
-        torch==2.1.1
-# Add here test requirements (semicolon/line-separated)
-testing =
-    setuptools
-    pytest
-    pytest-cov
-
-[options.entry_points]
-# Add here console scripts like:
-# console_scripts =
-#     script_name = qrmine.module:function
-# For example:
-# console_scripts =
-#     fibonacci = qrmine.skeleton:run
-# And any other entry points, for example:
-# pyscaffold.cli =
-#     awesome = pyscaffoldext.awesome.extension:AwesomeExtension
-console_scripts =
-    qrmine = qrmine.main:main_routine
-
-[tool:pytest]
-# Specify command line options as you would do when invoking pytest directly.
-# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
-# in order to write a coverage file that can be read by Jenkins.
-# CAUTION: --cov flags may prohibit setting breakpoints while debugging.
-#          Comment those flags to avoid this pytest issue.
-addopts =
-    --verbose
-norecursedirs =
-    dist
-    build
-    .tox
-
-[aliases]
-release = sdist bdist_wheel upload
-
-[bdist_wheel]
-# Use this option if your package is pure-python
-universal = 1
-
-[build_sphinx]
-source_dir = docs
-build_dir = docs/_build
-
-testpaths = tests
-# Use pytest markers to select/deselect specific tests
-# markers =
-#     slow: mark tests as slow (deselect with '-m "not slow"')
-#     system: mark end-to-end system tests
-
-[devpi:upload]
-# Options for the devpi: PyPI server and packaging tool
-# VCS export must be deactivated since we are using setuptools-scm
-no_vcs = 1
-formats = bdist_wheel
-
-[flake8]
-# Some sane defaults for the code style checker flake8
-max_line_length = 88
-extend_ignore = E203, W503
-# ^  Black-compatible
-#    E203 and W503 have edge cases handled by black
-exclude =
-    .tox
-    build
-    dist
-    .eggs
-    docs/conf.py
-
-[pyscaffold]
-# PyScaffold's parameters when the project was created.
-# This will be used when updating. Do not change!
-version = 4.6
-package = qrmine
diff --git a/tox.ini b/tox.ini
index 3eb707d..dbb293d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -8,9 +8,6 @@ envlist = py311, integration
 
 [testenv]
 setenv = TOXINIDIR = {toxinidir}
-deps =
-    -rrequirements.txt
-    -rdev-requirements.txt
 commands =
     python -m spacy download en_core_web_sm
     py.test {posargs}
@@ -20,9 +17,6 @@ extras =
 
 [testenv:integration]
 setenv = TOXINIDIR = {toxinidir}
-deps =
-    -rrequirements.txt
-    -rdev-requirements.txt
 commands =
     python -m spacy download en_core_web_sm
     python qrminer.py
\ No newline at end of file

From 2c6c174c066c4e459d186405023f7f99050b2b43 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 19:35:08 -0500
Subject: [PATCH 24/35] Update pr.yml

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 25df3e7..fd1547a 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -30,7 +30,7 @@ jobs:
         brew install libomp
     - name: Install the project
       run: |
-        uv sync --locked --all-extras --dev
+        uv sync --all-extras --dev
         python -m spacy download en_core_web_sm
     - name: Run tests
       run: uv run pytest tests

From a551f368d251865112768f0b4062c887b0fbd703 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Thu, 1 May 2025 00:41:21 +0000
Subject: [PATCH 25/35] fix: update python_requires and add missing entry point
 for qrmine

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index d63ec84..e65077e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Topic :: Scientific/Engineering :: Information Analysis",
 ]
+python_requires = ">=3.11, <3.12"
 dependencies = [
     'importlib-metadata; python_version<"3.8"',
     "pandas",
@@ -77,6 +78,7 @@ testing = [
 # And any other entry points, for example:
 # pyscaffold.cli =
 # awesome = pyscaffoldext.awesome.extension:AwesomeExtension
+qrmine = "qrmine.main:main_routine"
 
 [project.scripts]
 qrmine = "qrmine.main:main_routine"

From 3efce4f360908cf410e3e38f9e184a73e88a8487 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Thu, 1 May 2025 00:43:06 +0000
Subject: [PATCH 26/35] fix: update entry point syntax for qrmine in
 pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index e65077e..a26c85b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,7 @@ testing = [
 # And any other entry points, for example:
 # pyscaffold.cli =
 # awesome = pyscaffoldext.awesome.extension:AwesomeExtension
-qrmine = "qrmine.main:main_routine"
+qrmine = qrmine.main:main_routine
 
 [project.scripts]
 qrmine = "qrmine.main:main_routine"

From dd0362258a69dfbc6f94a3f1e3df1c121b7495fe Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Thu, 1 May 2025 00:45:44 +0000
Subject: [PATCH 27/35] fix: correct entry point syntax for console_scripts in
 pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a26c85b..2a18cf0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,7 @@ testing = [
 # And any other entry points, for example:
 # pyscaffold.cli =
 # awesome = pyscaffoldext.awesome.extension:AwesomeExtension
-qrmine = qrmine.main:main_routine
+console_scripts = qrmine = qrmine.main:main_routine
 
 [project.scripts]
 qrmine = "qrmine.main:main_routine"

From 2081eeaff32749fb1a12914e5f6262e17de73112 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Thu, 1 May 2025 00:47:07 +0000
Subject: [PATCH 28/35] fix: remove incorrect console_scripts entry from
 pyproject.toml

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2a18cf0..666ac4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,6 @@ testing = [
 # And any other entry points, for example:
 # pyscaffold.cli =
 # awesome = pyscaffoldext.awesome.extension:AwesomeExtension
-console_scripts = qrmine = qrmine.main:main_routine
 
 [project.scripts]
 qrmine = "qrmine.main:main_routine"

From d4f3553eff8b3600b7fe91206da2b27717222acf Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Thu, 1 May 2025 00:48:39 +0000
Subject: [PATCH 29/35] fix: correct key name for Python version requirement in
 pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 666ac4e..fa7d5ed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Topic :: Scientific/Engineering :: Information Analysis",
 ]
-python_requires = ">=3.11, <3.12"
+requires-python = ">=3.11, <3.12"
 dependencies = [
     'importlib-metadata; python_version<"3.8"',
     "pandas",

From b56923195f8c34990a2118c8a39886714c86b53b Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 19:52:00 -0500
Subject: [PATCH 30/35] Update pr.yml

---
 .github/workflows/pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index fd1547a..b7d5f11 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -31,6 +31,6 @@ jobs:
     - name: Install the project
       run: |
         uv sync --all-extras --dev
-        python -m spacy download en_core_web_sm
+        uv run python -m spacy download en_core_web_sm
     - name: Run tests
       run: uv run pytest tests

From 97c80d9ef5fc89a726b36fa9e1c4a1609b20eae5 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Thu, 1 May 2025 00:56:02 +0000
Subject: [PATCH 31/35] fix: add missing dependencies for build system in
 pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index fa7d5ed..ffebd89 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=61.2"]
+requires = ["setuptools>=61.2", "wheel", "pip"]
 build-backend = "setuptools.build_meta"
 
 [project]

From 8100027d1b3c928c1f4c056c8a985bc57d0e0f1a Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 19:58:56 -0500
Subject: [PATCH 32/35] Update pr.yml

---
 .github/workflows/pr.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index b7d5f11..082c154 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -31,6 +31,7 @@ jobs:
     - name: Install the project
       run: |
         uv sync --all-extras --dev
+        uv pip install pip
         uv run python -m spacy download en_core_web_sm
     - name: Run tests
       run: uv run pytest tests

From 7eb9df297fe93a83bf079d33e096fb207ff2022c Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 20:03:07 -0500
Subject: [PATCH 33/35] Update pr.yml

---
 .github/workflows/pr.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 082c154..0b69550 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -20,14 +20,16 @@ jobs:
     - uses: actions/checkout@v4
     - name: Install uv
       uses: astral-sh/setup-uv@v5
+      with:
+        enable-cache: true
     - name: "Set up Python"
       uses: actions/setup-python@v5
       with:
           python-version-file: "pyproject.toml"
-    - name: run on mac
-      if: startsWith(matrix.os, 'mac')
-      run: |
-        brew install libomp
+    # - name: run on mac
+    #   if: startsWith(matrix.os, 'mac')
+    #   run: |
+    #     brew install libomp
     - name: Install the project
       run: |
         uv sync --all-extras --dev

From 3965bb3e9282a8b445b9cbd5e8dd26b254e1eae1 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Wed, 30 Apr 2025 20:06:54 -0500
Subject: [PATCH 34/35] Update pr.yml

---
 .github/workflows/pr.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 0b69550..3693dc2 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -26,10 +26,10 @@ jobs:
       uses: actions/setup-python@v5
       with:
           python-version-file: "pyproject.toml"
-    # - name: run on mac
-    #   if: startsWith(matrix.os, 'mac')
-    #   run: |
-    #     brew install libomp
+    - name: run on mac
+      if: startsWith(matrix.os, 'mac')
+      run: |
+        brew install libomp
     - name: Install the project
       run: |
         uv sync --all-extras --dev

From b811c6a023735261dfda6aaa7107127c0c6dad13 Mon Sep 17 00:00:00 2001
From: Bell Eapen <github_public@gulfdoctor.net>
Date: Thu, 1 May 2025 01:32:10 +0000
Subject: [PATCH 35/35] refactor: remove unnecessary folder_path arguments from
 visualization tests

---
 dev-requirements.in     |  11 --
 dev-requirements.txt    | 146 ------------------
 notes/new-process.md    |  34 +++++
 pyproject.toml          |  14 ++
 requirements.txt        | 317 ----------------------------------------
 tests/test_visualize.py |   6 +-
 6 files changed, 51 insertions(+), 477 deletions(-)
 delete mode 100644 dev-requirements.in
 delete mode 100644 dev-requirements.txt
 create mode 100644 notes/new-process.md
 delete mode 100644 requirements.txt

diff --git a/dev-requirements.in b/dev-requirements.in
deleted file mode 100644
index 2b56355..0000000
--- a/dev-requirements.in
+++ /dev/null
@@ -1,11 +0,0 @@
-# dev-requirements.in
--c requirements.txt
-pytest-cov
-pytest
-recommonmark
-sphinx>=3.2.1
-setuptools
-setuptools_scm
-wheel>=0.37.0 # conflicts with dependency of tensorflow
-tox
-pip-tools
\ No newline at end of file
diff --git a/dev-requirements.txt b/dev-requirements.txt
deleted file mode 100644
index b5cc0fc..0000000
--- a/dev-requirements.txt
+++ /dev/null
@@ -1,146 +0,0 @@
-# This file was autogenerated by uv via the following command:
-#    uv pip compile dev-requirements.in -o dev-requirements.txt --universal
-alabaster==1.0.0
-    # via sphinx
-babel==2.16.0
-    # via sphinx
-build==1.2.2.post1
-    # via pip-tools
-cachetools==5.5.0
-    # via
-    #   -c requirements.txt
-    #   tox
-certifi==2024.8.30
-    # via
-    #   -c requirements.txt
-    #   requests
-chardet==5.2.0
-    # via tox
-charset-normalizer==3.4.0
-    # via
-    #   -c requirements.txt
-    #   requests
-click==8.1.7
-    # via
-    #   -c requirements.txt
-    #   pip-tools
-colorama==0.4.6
-    # via
-    #   -c requirements.txt
-    #   build
-    #   click
-    #   pytest
-    #   sphinx
-    #   tox
-commonmark==0.9.1
-    # via recommonmark
-coverage==7.6.4
-    # via pytest-cov
-distlib==0.3.9
-    # via virtualenv
-docutils==0.21.2
-    # via
-    #   recommonmark
-    #   sphinx
-filelock==3.18.0
-    # via
-    #   -c requirements.txt
-    #   tox
-    #   virtualenv
-idna==3.10
-    # via
-    #   -c requirements.txt
-    #   requests
-imagesize==1.4.1
-    # via sphinx
-iniconfig==2.0.0
-    # via pytest
-jinja2==3.1.6
-    # via
-    #   -c requirements.txt
-    #   sphinx
-markupsafe==3.0.2
-    # via
-    #   -c requirements.txt
-    #   jinja2
-packaging==24.2
-    # via
-    #   -c requirements.txt
-    #   build
-    #   pyproject-api
-    #   pytest
-    #   setuptools-scm
-    #   sphinx
-    #   tox
-pip==24.3.1
-    # via pip-tools
-pip-tools==7.4.1
-    # via -r dev-requirements.in
-platformdirs==4.3.6
-    # via
-    #   tox
-    #   virtualenv
-pluggy==1.5.0
-    # via
-    #   pytest
-    #   tox
-pygments==2.18.0
-    # via
-    #   -c requirements.txt
-    #   sphinx
-pyproject-api==1.8.0
-    # via tox
-pyproject-hooks==1.2.0
-    # via
-    #   build
-    #   pip-tools
-pytest==8.3.3
-    # via
-    #   -r dev-requirements.in
-    #   pytest-cov
-pytest-cov==6.0.0
-    # via -r dev-requirements.in
-recommonmark==0.7.1
-    # via -r dev-requirements.in
-requests==2.32.3
-    # via
-    #   -c requirements.txt
-    #   sphinx
-setuptools==75.3.0
-    # via
-    #   -c requirements.txt
-    #   -r dev-requirements.in
-    #   pip-tools
-    #   setuptools-scm
-setuptools-scm==8.1.0
-    # via -r dev-requirements.in
-snowballstemmer==2.2.0
-    # via sphinx
-sphinx==8.1.3
-    # via
-    #   -r dev-requirements.in
-    #   recommonmark
-sphinxcontrib-applehelp==2.0.0
-    # via sphinx
-sphinxcontrib-devhelp==2.0.0
-    # via sphinx
-sphinxcontrib-htmlhelp==2.1.0
-    # via sphinx
-sphinxcontrib-jsmath==1.0.1
-    # via sphinx
-sphinxcontrib-qthelp==2.0.0
-    # via sphinx
-sphinxcontrib-serializinghtml==2.0.0
-    # via sphinx
-tox==4.23.2
-    # via -r dev-requirements.in
-urllib3==2.2.3
-    # via
-    #   -c requirements.txt
-    #   requests
-virtualenv==20.27.1
-    # via tox
-wheel==0.45.0
-    # via
-    #   -r dev-requirements.in
-    #   pip-tools
diff --git a/notes/new-process.md b/notes/new-process.md
new file mode 100644
index 0000000..1ead749
--- /dev/null
+++ b/notes/new-process.md
@@ -0,0 +1,34 @@
+conda install conda-forge::uv
+uv pip install ini2toml
+ini2toml setup.cfg -o pyproject.toml
+
+delete setup.cfg
+delete requirements.txt, dev-requirements.txt, dev-requirements.in
+remove deps from tox.ini
+
+uv pip install -e .
+see pr.yml for GitHub actions
+see pyproject.toml for pytorch cpu install
+uv pip install -e .
+
+uv sync --all-extras --dev
+uv pip install pip
+uv run python -m spacy download en_core_web_sm
+
+pyproject.toml
+requires = ["setuptools>=61.2", "wheel", "pip"]
+
+dev = [
+    "setuptools",
+    "setuptools_scm",
+    "pytest",
+    "pytest-cov",
+    "tox",
+    "black",
+    "recommonmark",
+    "sphinx",
+    "wheel",
+    "twine",
+    "tox",
+]
+
diff --git a/pyproject.toml b/pyproject.toml
index ffebd89..9fc3688 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,6 +68,20 @@ testing = [
     "pytest-cov",
 ]
 
+dev = [
+    "setuptools",
+    "setuptools_scm",
+    "pytest",
+    "pytest-cov",
+    "tox",
+    "black",
+    "recommonmark",
+    "sphinx",
+    "wheel",
+    "twine",
+    "tox",
+]
+
 [project.entry-points]
 # Add here console scripts like:
 # console_scripts =
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 8326516..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,317 +0,0 @@
-# This file was autogenerated by uv via the following command:
-#    uv pip compile setup.cfg -o requirements.txt --universal
-blis==0.7.11
-    # via thinc
-cachetools==5.5.0
-    # via textacy
-catalogue==2.0.10
-    # via
-    #   spacy
-    #   srsly
-    #   textacy
-    #   thinc
-certifi==2024.8.30
-    # via requests
-charset-normalizer==3.4.0
-    # via requests
-click==8.1.7
-    # via
-    #   qrmine (setup.cfg)
-    #   typer
-cloudpathlib==0.20.0
-    # via weasel
-colorama==0.4.6 ; sys_platform == 'win32'
-    # via
-    #   click
-    #   tqdm
-    #   wasabi
-confection==0.1.5
-    # via
-    #   thinc
-    #   weasel
-contourpy==1.3.0
-    # via matplotlib
-cycler==0.12.1
-    # via matplotlib
-cymem==2.0.8
-    # via
-    #   preshed
-    #   spacy
-    #   thinc
-cytoolz==1.0.0
-    # via textacy
-filelock==3.18.0
-    # via torch
-floret==0.10.5
-    # via textacy
-fonttools==4.54.1
-    # via matplotlib
-fsspec==2025.3.2
-    # via torch
-gensim==4.3.3
-    # via qrmine (setup.cfg)
-idna==3.10
-    # via requests
-imbalanced-learn==0.12.4
-    # via qrmine (setup.cfg)
-jellyfish==1.1.0
-    # via textacy
-jinja2==3.1.6
-    # via
-    #   spacy
-    #   torch
-joblib==1.4.2
-    # via
-    #   imbalanced-learn
-    #   mlxtend
-    #   scikit-learn
-    #   textacy
-kiwisolver==1.4.7
-    # via matplotlib
-langcodes==3.4.1
-    # via spacy
-language-data==1.2.0
-    # via langcodes
-marisa-trie==1.2.1
-    # via language-data
-markdown-it-py==3.0.0
-    # via rich
-markupsafe==3.0.2
-    # via jinja2
-matplotlib==3.9.2
-    # via
-    #   qrmine (setup.cfg)
-    #   mlxtend
-    #   seaborn
-    #   wordcloud
-mdurl==0.1.2
-    # via markdown-it-py
-mlxtend==0.23.2
-    # via qrmine (setup.cfg)
-mpmath==1.3.0
-    # via sympy
-murmurhash==1.0.10
-    # via
-    #   preshed
-    #   spacy
-    #   thinc
-networkx==3.4.2
-    # via
-    #   textacy
-    #   torch
-numpy==1.24.3 ; python_full_version < '3.12'
-    # via
-    #   blis
-    #   contourpy
-    #   floret
-    #   gensim
-    #   imbalanced-learn
-    #   matplotlib
-    #   mlxtend
-    #   pandas
-    #   scikit-learn
-    #   scipy
-    #   seaborn
-    #   spacy
-    #   textacy
-    #   thinc
-    #   wordcloud
-    #   xgboost
-numpy==1.26.4 ; python_full_version >= '3.12'
-    # via
-    #   blis
-    #   contourpy
-    #   floret
-    #   gensim
-    #   imbalanced-learn
-    #   matplotlib
-    #   mlxtend
-    #   pandas
-    #   scikit-learn
-    #   scipy
-    #   seaborn
-    #   spacy
-    #   textacy
-    #   thinc
-    #   wordcloud
-    #   xgboost
-nvidia-cublas-cu12==12.6.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cudnn-cu12
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-cuda-cupti-cu12==12.6.80 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cuda-nvrtc-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cuda-runtime-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cudnn-cu12==9.5.1.17 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cufft-cu12==11.3.0.4 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cufile-cu12==1.11.1.6 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-curand-cu12==10.3.7.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cusolver-cu12==11.7.1.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cusparse-cu12==12.5.4.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-nccl-cu12==2.26.2 ; platform_machine != 'aarch64' and sys_platform == 'linux'
-    # via
-    #   torch
-    #   xgboost
-nvidia-nvjitlink-cu12==12.6.85 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cufft-cu12
-    #   nvidia-cusolver-cu12
-    #   nvidia-cusparse-cu12
-    #   torch
-nvidia-nvtx-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-packaging==24.2
-    # via
-    #   matplotlib
-    #   spacy
-    #   thinc
-    #   weasel
-pandas==2.2.3
-    # via
-    #   qrmine (setup.cfg)
-    #   mlxtend
-    #   seaborn
-pillow==11.0.0
-    # via
-    #   matplotlib
-    #   wordcloud
-preshed==3.0.9
-    # via
-    #   spacy
-    #   thinc
-pydantic==1.10.19
-    # via
-    #   confection
-    #   spacy
-    #   thinc
-    #   weasel
-pygments==2.18.0
-    # via rich
-pyparsing==3.2.0
-    # via matplotlib
-pypdf==5.4.0
-    # via qrmine (setup.cfg)
-pyphen==0.17.0
-    # via textacy
-python-dateutil==2.9.0.post0
-    # via
-    #   matplotlib
-    #   pandas
-pytz==2024.2
-    # via pandas
-requests==2.32.3
-    # via
-    #   qrmine (setup.cfg)
-    #   spacy
-    #   textacy
-    #   vadersentiment
-    #   weasel
-rich==13.9.4
-    # via typer
-scikit-learn==1.5.2
-    # via
-    #   qrmine (setup.cfg)
-    #   imbalanced-learn
-    #   mlxtend
-    #   textacy
-scipy==1.13.1
-    # via
-    #   gensim
-    #   imbalanced-learn
-    #   mlxtend
-    #   scikit-learn
-    #   textacy
-    #   xgboost
-seaborn==0.13.2
-    # via qrmine (setup.cfg)
-setuptools==75.3.0
-    # via
-    #   marisa-trie
-    #   spacy
-    #   thinc
-    #   torch
-    #   triton
-shellingham==1.5.4
-    # via typer
-six==1.16.0
-    # via python-dateutil
-smart-open==7.0.5
-    # via
-    #   gensim
-    #   weasel
-spacy==3.7.5
-    # via
-    #   qrmine (setup.cfg)
-    #   textacy
-spacy-legacy==3.0.12
-    # via spacy
-spacy-loggers==1.0.5
-    # via spacy
-srsly==2.4.8
-    # via
-    #   confection
-    #   spacy
-    #   thinc
-    #   weasel
-sympy==1.14.0
-    # via torch
-textacy==0.13.0
-    # via qrmine (setup.cfg)
-thinc==8.2.5
-    # via spacy
-threadpoolctl==3.5.0
-    # via
-    #   imbalanced-learn
-    #   scikit-learn
-toolz==1.0.0
-    # via cytoolz
-torch==2.7.0
-    # via qrmine (setup.cfg)
-tqdm==4.67.0
-    # via
-    #   spacy
-    #   textacy
-triton==3.3.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-typer==0.13.0
-    # via
-    #   spacy
-    #   weasel
-typing-extensions==4.13.2
-    # via
-    #   pydantic
-    #   torch
-    #   typer
-tzdata==2024.2
-    # via pandas
-urllib3==2.2.3
-    # via requests
-vadersentiment==3.3.2
-    # via qrmine (setup.cfg)
-wasabi==1.1.3
-    # via
-    #   spacy
-    #   thinc
-    #   weasel
-weasel==0.4.1
-    # via spacy
-wordcloud==1.9.4
-    # via qrmine (setup.cfg)
-wrapt==1.16.0
-    # via smart-open
-xgboost==2.1.2
-    # via qrmine (setup.cfg)
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
index 32d5e4e..41f7145 100644
--- a/tests/test_visualize.py
+++ b/tests/test_visualize.py
@@ -96,19 +96,19 @@ def topics():
 
 def test_frequency_distribution_of_words(v, capsys):
     v.plot_frequency_distribution_of_words(
-        v.data, folder_path="/tmp/frequency_distribution.png"
+        v.data
     )
     captured = capsys.readouterr()
     print(captured.out)
 
 
 def test_distribution_by_topic(v, capsys):
-    v.plot_distribution_by_topic(v.data, folder_path="/tmp/distribution_by_topic.png")
+    v.plot_distribution_by_topic(v.data)
     captured = capsys.readouterr()
     print(captured.out)
 
 
 def test_plot_wordcloud(v, topics, capsys):
-    v.plot_wordcloud(topics, folder_path="/tmp/wordcloud.png")
+    v.plot_wordcloud(topics)
     captured = capsys.readouterr()
     print(captured.out)