From d907d5315204818bb41e28b6abf0e14df0597228 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 15:20:40 +0000 Subject: [PATCH 01/35] chore: update .gitignore and add conda setup instructions --- .gitignore | 1 + notes/conda.md | 8 ++++ pyproject.toml | 104 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 107 insertions(+), 6 deletions(-) create mode 100644 notes/conda.md diff --git a/.gitignore b/.gitignore index 64049e7..c29a2a9 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ __pycache__/* .idea .venv conda +uv.lock # Package files *.egg diff --git a/notes/conda.md b/notes/conda.md new file mode 100644 index 0000000..49d024a --- /dev/null +++ b/notes/conda.md @@ -0,0 +1,8 @@ +conda create --name qrmine python=3.11 +conda activate qrmine + +conda install conda-forge::uv +uv pip install ini2toml +ini2toml setup.cfg -o pyproject.toml +uv pip install pandas matplotlib click scikit-learn imbalanced-learn vaderSentiment xgboost mlxtend spacy textacy tensorflow==2.13.1 tensorflow-io-gcs-filesystem==0.31.0 pytest tox +python -m spacy download en_core_web_sm \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 89a5bed..dcf5188 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,101 @@ [build-system] -# AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD! -requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"] +requires = ["setuptools>=61.2"] build-backend = "setuptools.build_meta" -[tool.setuptools_scm] -# For smarter version schemes and other configuration options, -# check out https://github.com/pypa/setuptools_scm -version_scheme = "no-guess-dev" +[project] +name = "qrmine" +description = "Qualitative Research support tools in Python!" +authors = [{name = "beapen", email = "github@gulfdoctor.net"}] +license = {text = "GPL-3.0-only"} +classifiers = [ + "Intended Audience :: Science/Research", + "Development Status :: 4 - Beta", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Information Analysis", +] +dependencies = [ + 'importlib-metadata; python_version<"3.8"', + "pandas", + "matplotlib", + "click", + "scikit-learn", + "imbalanced-learn", + "vadersentiment", + "xgboost", + "mlxtend", + "spacy", + "textacy", + "tensorflow==2.13.1", + "tensorflow-io-gcs-filesystem==0.31.0", + "pytest>=8.3.5", + "tox>=4.25.0", +] +dynamic = ["version"] + +[project.readme] +file = "README.md" +content-type = "text/markdown" + +[project.urls] +Homepage = "https://github.com/dermatologist/nlp-qrmine" +Documentation = "https://arxiv.org/abs/2003.13519" + +[project.optional-dependencies] +testing = [ + "setuptools", + "pytest", + "pytest-cov", +] + +[project.scripts] +qrmine = "qrmine.main:main_routine" + +[tool.setuptools] +zip-safe = false +include-package-data = true +package-dir = {"" = "src"} +platforms = ["any"] + +[tool.setuptools.packages.find] +where = ["src"] +exclude = ["tests"] +namespaces = true + +[tool.pytest.ini_options] +addopts = """ +--verbose""" +norecursedirs = [ + "dist", + "build", + ".tox", +] + +[tool.aliases] +release = "sdist bdist_wheel upload" + +[tool.distutils.bdist_wheel] +universal = 1 + +[tool.build_sphinx] +source_dir = "docs" +build_dir = "docs/_build" +testpaths = "tests" + +[tool.devpi.upload] +no_vcs = "1" +formats = "bdist_wheel" + +[tool.flake8] +max_line_length = "88" +extend_ignore = "E203, W503" +exclude = """ +.tox +build +dist +.eggs +docs/conf.py""" + +[tool.pyscaffold] +version = "4.6" +package = "qrmine" From 3238e5d9c43cfef7f64b12411796fd39f8b5c85d Mon Sep 17 00:00:00 2001 From: dermatologist Date: Wed, 30 Apr 2025 11:02:05 -0500 Subject: [PATCH 02/35] feat: implement neural network model using PyTorch and update tests for accuracy output --- notes/conda.md | 6 ++- setup.cfg | 3 +- src/qrmine/mlqrmine.py | 88 +++++++++++++++++++++++++++++++++--------- tests/test_num.py | 4 +- 4 files changed, 77 insertions(+), 24 deletions(-) diff --git a/notes/conda.md b/notes/conda.md index 49d024a..c0af730 100644 --- a/notes/conda.md +++ b/notes/conda.md @@ -5,4 +5,8 @@ conda install conda-forge::uv uv pip install ini2toml ini2toml setup.cfg -o pyproject.toml uv pip install pandas matplotlib click scikit-learn imbalanced-learn vaderSentiment xgboost mlxtend spacy textacy tensorflow==2.13.1 tensorflow-io-gcs-filesystem==0.31.0 pytest tox -python -m spacy download en_core_web_sm \ No newline at end of file +python -m spacy download en_core_web_sm + + + +pip3 install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index e6953b9..7655d02 100644 --- a/setup.cfg +++ b/setup.cfg @@ -64,8 +64,7 @@ install_requires = mlxtend spacy textacy - tensorflow<=2.13.1 - tensorflow-io-gcs-filesystem<=0.31.0 + torch [options.packages.find] where = src diff --git a/src/qrmine/mlqrmine.py b/src/qrmine/mlqrmine.py index 12b75a3..fcfac7a 100644 --- a/src/qrmine/mlqrmine.py +++ b/src/qrmine/mlqrmine.py @@ -1,13 +1,10 @@ import numpy from imblearn.over_sampling import RandomOverSampler -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Dense -from numpy import random, argsort, sqrt, array, ones from pandas import read_csv from sklearn.cluster import KMeans from sklearn.metrics import confusion_matrix from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder, OneHotEncoder +from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC from sklearn.neighbors import KDTree @@ -17,6 +14,25 @@ from mlxtend.frequent_patterns import apriori from mlxtend.frequent_patterns import association_rules +import torch.nn as nn +import torch.optim as optim +import torch +from torch.utils.data import DataLoader, TensorDataset +class NeuralNet(nn.Module): + def __init__(self, input_dim): + super(NeuralNet, self).__init__() + self.fc1 = nn.Linear(input_dim, 12) + self.fc2 = nn.Linear(12, 8) + self.fc3 = nn.Linear(8, 1) + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.sigmoid(self.fc3(x)) + return x + class MLQRMine(object): @@ -24,13 +40,13 @@ def __init__(self): self._seed = randint(1, 9) self._csvfile = "" self._titles = None + self._model = None self._dataset = None self._X = None self._y = None self._X_original = None self._y_original = None self._dataset_original = None - self._model = Sequential() self._sc = StandardScaler() self._vnum = 0 # Number of variables self._classifier = XGBClassifier() @@ -147,22 +163,57 @@ def prepare_data(self, oversample=False): self.oversample() def get_nnet_predictions(self): - self._model.add(Dense(12, input_dim=self._vnum, kernel_initializer='uniform', activation='relu')) - self._model.add(Dense(8, kernel_initializer='uniform', activation='relu')) - self._model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid')) - # Compile model - self._model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) - # Fit the model - self._model.fit(self._X, self._y, epochs=self._epochs, batch_size=10, verbose=2) - - # calculate predictions - predictions = self._model.predict(self._X_original) - # round predictions - rounded = [round(x[0]) for x in predictions] + + self._model = NeuralNet(self._vnum) + criterion = nn.BCELoss() + optimizer = optim.Adam(self._model.parameters(), lr=0.001) + + # Convert data to PyTorch tensors + X_tensor = torch.tensor(self._X, dtype=torch.float32) + y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1) + + # Create a dataset and data loader + dataset = TensorDataset(X_tensor, y_tensor) + dataloader = DataLoader(dataset, batch_size=10, shuffle=True) + + # Train the model + for epoch in range(self._epochs): + for batch_X, batch_y in dataloader: + optimizer.zero_grad() + outputs = self._model(batch_X) + loss = criterion(outputs, batch_y) + loss.backward() + optimizer.step() + + # Calculate predictions + with torch.no_grad(): + predictions = self._model(torch.tensor(self._X_original, dtype=torch.float32)) + rounded = [round(x.item()) for x in predictions] + # print("Predictions: ", rounded) + # Calculate accuracy + correct = sum([1 for i in range(len(rounded)) if rounded[i] == self._y_original[i]]) + total = len(rounded) + accuracy = correct / total + print(f'Accuracy: {accuracy * 100:.2f}%') return rounded def get_nnet_scores(self): - return self._model.evaluate(self._X, self._y) + # evalute the pytorch model + self._model.eval() + X_tensor = torch.tensor(self._X, dtype=torch.float32) + y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1) + dataset = TensorDataset(X_tensor, y_tensor) + dataloader = DataLoader(dataset, batch_size=10, shuffle=True) + correct = 0 + total = 0 + with torch.no_grad(): + for batch_X, batch_y in dataloader: + outputs = self._model(batch_X) + predicted = (outputs > 0.5).float() + total += batch_y.size(0) + correct += (predicted == batch_y).sum().item() + accuracy = correct / total + print(f'Accuracy: {accuracy * 100:.2f}%') def svm_confusion_matrix(self): """Generate confusion matrix for SVM @@ -211,7 +262,6 @@ def get_centroids(self, c=1): print("Mean") print(self._dataset.iloc[cluster_list, :].mean(axis=0)) - """ TODO: This is not working yet. use the ColumnTransformer instead of categorical_features diff --git a/tests/test_num.py b/tests/test_num.py index f0c53cd..ac7a139 100644 --- a/tests/test_num.py +++ b/tests/test_num.py @@ -9,7 +9,7 @@ def ml_fixture(): ml = MLQRMine() file_path = resource_filename('src.qrmine.resources', 'numeric.csv') ml.csvfile = file_path - return ml + return ml @@ -19,7 +19,7 @@ def test_nn(ml_fixture, capsys): ml_fixture.prepare_data(True) ml_fixture.get_nnet_predictions() captured = capsys.readouterr() - assert 'accuracy' in captured.out + assert 'Accuracy' in captured.out def test_svm(ml_fixture, capsys): ml_fixture.prepare_data(True) From 7ea669e14bf21c0c433db21f2628aa541214e883 Mon Sep 17 00:00:00 2001 From: dermatologist Date: Wed, 30 Apr 2025 11:09:05 -0500 Subject: [PATCH 03/35] chore: update pyproject.toml and setup.cfg to organize dependencies and improve structure --- pyproject.toml | 103 ++++++++++++++++++------------------------------- setup.cfg | 5 ++- 2 files changed, 42 insertions(+), 66 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dcf5188..f80f403 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,37 +1,20 @@ [build-system] -requires = ["setuptools>=61.2"] +requires = [ "setuptools>=61.2",] build-backend = "setuptools.build_meta" [project] name = "qrmine" description = "Qualitative Research support tools in Python!" -authors = [{name = "beapen", email = "github@gulfdoctor.net"}] -license = {text = "GPL-3.0-only"} -classifiers = [ - "Intended Audience :: Science/Research", - "Development Status :: 4 - Beta", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.11", - "Topic :: Scientific/Engineering :: Information Analysis", -] -dependencies = [ - 'importlib-metadata; python_version<"3.8"', - "pandas", - "matplotlib", - "click", - "scikit-learn", - "imbalanced-learn", - "vadersentiment", - "xgboost", - "mlxtend", - "spacy", - "textacy", - "tensorflow==2.13.1", - "tensorflow-io-gcs-filesystem==0.31.0", - "pytest>=8.3.5", - "tox>=4.25.0", -] -dynamic = ["version"] +classifiers = [ "Intended Audience :: Science/Research", "Development Status :: 4 - Beta", "Operating System :: OS Independent", "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Information Analysis",] +dependencies = [ "importlib-metadata; python_version<\"3.8\"", "pandas", "matplotlib", "click", "scikit-learn", "imbalanced-learn", "vaderSentiment", "xgboost", "mlxtend", "spacy", "textacy", "torch",] +dynamic = [ "version",] + +[[project.authors]] +name = "beapen" +email = "github@gulfdoctor.net" + +[project.license] +text = "GPL-3.0-only" [project.readme] file = "README.md" @@ -42,11 +25,9 @@ Homepage = "https://github.com/dermatologist/nlp-qrmine" Documentation = "https://arxiv.org/abs/2003.13519" [project.optional-dependencies] -testing = [ - "setuptools", - "pytest", - "pytest-cov", -] +gpu = [ "torch[gpu]==2.1.1",] +cpu = [ "torch==2.1.1",] +testing = [ "setuptools", "pytest", "pytest-cov",] [project.scripts] qrmine = "qrmine.main:main_routine" @@ -54,48 +35,40 @@ qrmine = "qrmine.main:main_routine" [tool.setuptools] zip-safe = false include-package-data = true -package-dir = {"" = "src"} -platforms = ["any"] - -[tool.setuptools.packages.find] -where = ["src"] -exclude = ["tests"] -namespaces = true - -[tool.pytest.ini_options] -addopts = """ ---verbose""" -norecursedirs = [ - "dist", - "build", - ".tox", -] +platforms = [ "any",] [tool.aliases] release = "sdist bdist_wheel upload" +[tool.flake8] +max_line_length = "88" +extend_ignore = "E203, W503" +exclude = "\n.tox\nbuild\ndist\n.eggs\ndocs/conf.py" + +[tool.pyscaffold] +version = "4.6" +package = "qrmine" + +[tool.setuptools.package-dir] +"" = "src" + +[tool.pytest.ini_options] +addopts = "\n--verbose" +norecursedirs = [ "dist", "build", ".tox",] + [tool.distutils.bdist_wheel] universal = 1 -[tool.build_sphinx] -source_dir = "docs" -build_dir = "docs/_build" +[tool.distutils.build_sphinx] +source-dir = "docs" +build-dir = "docs/_build" testpaths = "tests" [tool.devpi.upload] no_vcs = "1" formats = "bdist_wheel" -[tool.flake8] -max_line_length = "88" -extend_ignore = "E203, W503" -exclude = """ -.tox -build -dist -.eggs -docs/conf.py""" - -[tool.pyscaffold] -version = "4.6" -package = "qrmine" +[tool.setuptools.packages.find] +where = [ "src",] +exclude = [ "tests",] +namespaces = true diff --git a/setup.cfg b/setup.cfg index 7655d02..dba42c2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -75,7 +75,10 @@ exclude = # Add here additional requirements for extra features, to install with: # `pip install qrmine[PDF]` like: # PDF = ReportLab; RXP - + gpu = + torch[gpu]==2.1.1 + cpu = + torch==2.1.1 # Add here test requirements (semicolon/line-separated) testing = setuptools From 0ada4d695c32811ba5219b22f70634710b293b5e Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 16:26:59 +0000 Subject: [PATCH 04/35] chore: update conda.md to simplify package installation command --- notes/conda.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notes/conda.md b/notes/conda.md index c0af730..79eb6c8 100644 --- a/notes/conda.md +++ b/notes/conda.md @@ -4,7 +4,7 @@ conda activate qrmine conda install conda-forge::uv uv pip install ini2toml ini2toml setup.cfg -o pyproject.toml -uv pip install pandas matplotlib click scikit-learn imbalanced-learn vaderSentiment xgboost mlxtend spacy textacy tensorflow==2.13.1 tensorflow-io-gcs-filesystem==0.31.0 pytest tox +uv pip install -e . python -m spacy download en_core_web_sm From 6f0fe38c05da30a1ec1f35d68136963ccee947f6 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 16:57:19 +0000 Subject: [PATCH 05/35] feat: enhance ReadData class to support reading from URLs and folders, update tests for consistency --- notes/pip-tools.md | 5 +- pyproject.toml | 159 ++++++++++++++++++++++++++++++---------- setup.cfg | 2 + src/qrmine/readfiles.py | 92 +++++++++++++---------- tests/test_nlp.py | 2 +- tests/test_readfiles.py | 4 +- 6 files changed, 185 insertions(+), 79 deletions(-) diff --git a/notes/pip-tools.md b/notes/pip-tools.md index da4baa4..c504a1e 100644 --- a/notes/pip-tools.md +++ b/notes/pip-tools.md @@ -21,4 +21,7 @@ OR * pip install uv * uv pip compile setup.cfg -o requirements.txt --universal -* uv pip compile dev-requirements.in -o dev-requirements.txt --universal \ No newline at end of file +* uv pip compile dev-requirements.in -o dev-requirements.txt --universal + +uv pip install ini2toml +ini2toml setup.cfg -o pyproject.toml \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index f80f403..abba09a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,33 +1,81 @@ [build-system] -requires = [ "setuptools>=61.2",] +requires = ["setuptools>=61.2"] build-backend = "setuptools.build_meta" [project] name = "qrmine" description = "Qualitative Research support tools in Python!" -classifiers = [ "Intended Audience :: Science/Research", "Development Status :: 4 - Beta", "Operating System :: OS Independent", "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Information Analysis",] -dependencies = [ "importlib-metadata; python_version<\"3.8\"", "pandas", "matplotlib", "click", "scikit-learn", "imbalanced-learn", "vaderSentiment", "xgboost", "mlxtend", "spacy", "textacy", "torch",] -dynamic = [ "version",] - -[[project.authors]] -name = "beapen" -email = "github@gulfdoctor.net" - -[project.license] -text = "GPL-3.0-only" +authors = [{name = "beapen", email = "github@gulfdoctor.net"}] +license = {text = "GPL-3.0-only"} +# license_files = LICENSE.txt +# long_description = file: README.rst +# long_description_content_type = text/x-rst; charset=UTF-8 +classifiers = [ + "Intended Audience :: Science/Research", + "Development Status :: 4 - Beta", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Information Analysis", +] +dependencies = [ + 'importlib-metadata; python_version<"3.8"', + "pandas", + "matplotlib", + "click", + "scikit-learn", + "imbalanced-learn", + "vaderSentiment", + "xgboost", + "mlxtend", + "spacy", + "textacy", + "torch", + "pypdf", + "requests", +] +dynamic = ["version"] [project.readme] file = "README.md" content-type = "text/markdown" +# Add here related links, for example: [project.urls] Homepage = "https://github.com/dermatologist/nlp-qrmine" Documentation = "https://arxiv.org/abs/2003.13519" +# Source = https://github.com/pyscaffold/pyscaffold/ +# Changelog = https://pyscaffold.org/en/latest/changelog.html +# Tracker = https://github.com/pyscaffold/pyscaffold/issues +# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold +# Download = https://pypi.org/project/PyScaffold/#files +# Twitter = https://twitter.com/PyScaffold +# Change if running only on Windows, Mac or Linux (comma-separated) +# Add here all kinds of additional classifiers as defined under +# https://pypi.org/classifiers/ [project.optional-dependencies] -gpu = [ "torch[gpu]==2.1.1",] -cpu = [ "torch==2.1.1",] -testing = [ "setuptools", "pytest", "pytest-cov",] +# Add here additional requirements for extra features, to install with: +# `pip install qrmine[PDF]` like: +# PDF = ReportLab; RXP +gpu = ["torch[gpu]==2.1.1"] +cpu = ["torch==2.1.1"] +# Add here test requirements (semicolon/line-separated) +testing = [ + "setuptools", + "pytest", + "pytest-cov", +] + +[project.entry-points] +# Add here console scripts like: +# console_scripts = +# script_name = qrmine.module:function +# For example: +# console_scripts = +# fibonacci = qrmine.skeleton:run +# And any other entry points, for example: +# pyscaffold.cli = +# awesome = pyscaffoldext.awesome.extension:AwesomeExtension [project.scripts] qrmine = "qrmine.main:main_routine" @@ -35,40 +83,75 @@ qrmine = "qrmine.main:main_routine" [tool.setuptools] zip-safe = false include-package-data = true -platforms = [ "any",] - -[tool.aliases] -release = "sdist bdist_wheel upload" - -[tool.flake8] -max_line_length = "88" -extend_ignore = "E203, W503" -exclude = "\n.tox\nbuild\ndist\n.eggs\ndocs/conf.py" +package-dir = {"" = "src"} +# Require a min/specific Python version (comma-separated conditions) +# python_requires = >=3.8 +# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. +# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in +# new major versions. This works if the required packages follow Semantic Versioning. +# For more information, check out https://semver.org/. +platforms = ["any"] -[tool.pyscaffold] -version = "4.6" -package = "qrmine" - -[tool.setuptools.package-dir] -"" = "src" +[tool.setuptools.packages.find] +where = ["src"] +exclude = ["tests"] +namespaces = true [tool.pytest.ini_options] -addopts = "\n--verbose" -norecursedirs = [ "dist", "build", ".tox",] +# Specify command line options as you would do when invoking pytest directly. +# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml +# in order to write a coverage file that can be read by Jenkins. +# CAUTION: --cov flags may prohibit setting breakpoints while debugging. +# Comment those flags to avoid this pytest issue. +addopts = """ +--verbose""" +norecursedirs = [ + "dist", + "build", + ".tox", +] + +[tool.aliases] +release = "sdist bdist_wheel upload" [tool.distutils.bdist_wheel] +# Use this option if your package is pure-python universal = 1 -[tool.distutils.build_sphinx] -source-dir = "docs" -build-dir = "docs/_build" +[tool.build_sphinx] +source_dir = "docs" +build_dir = "docs/_build" testpaths = "tests" +# Use pytest markers to select/deselect specific tests +# markers = +# slow: mark tests as slow (deselect with '-m "not slow"') +# system: mark end-to-end system tests [tool.devpi.upload] +# Options for the devpi: PyPI server and packaging tool +# VCS export must be deactivated since we are using setuptools-scm no_vcs = "1" formats = "bdist_wheel" -[tool.setuptools.packages.find] -where = [ "src",] -exclude = [ "tests",] -namespaces = true +[tool.flake8] +# Some sane defaults for the code style checker flake8 +max_line_length = "88" +extend_ignore = "E203, W503" +# ^ Black-compatible +# E203 and W503 have edge cases handled by black +exclude = """ +.tox +build +dist +.eggs +docs/conf.py""" + +[tool.pyscaffold] +# PyScaffold's parameters when the project was created. +# This will be used when updating. Do not change! +version = "4.6" +package = "qrmine" +# This file is used to configure your project. +# Read more about the various options under: +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html +# https://setuptools.pypa.io/en/latest/references/keywords.html diff --git a/setup.cfg b/setup.cfg index dba42c2..82c523d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,8 @@ install_requires = spacy textacy torch + pypdf + requests [options.packages.find] where = src diff --git a/src/qrmine/readfiles.py b/src/qrmine/readfiles.py index a460795..522eaaa 100644 --- a/src/qrmine/readfiles.py +++ b/src/qrmine/readfiles.py @@ -1,5 +1,6 @@ import re - +import requests +from pypdf import PdfReader class ReadData(object): def __init__(self): @@ -37,22 +38,10 @@ def append(self, title, document): self._documents.append(document) self._content += document - def read_file(self, file_names): - if len(file_names) > 1: - for file_name in file_names: - with open(file_name, 'r') as f: - read_from_file = f.read() - self._content = re.sub('<[^<]+?>', '', read_from_file) - self._documents = re.split('.*?', read_from_file) - # Delete the last blank record - del self._documents[-1] - pattern = r"(.*?)" - _title = re.findall(pattern, read_from_file, flags=re.DOTALL)[0] - self._titles.append(_title) - f.close() - else: - file_name = file_names[0] - with open(file_name, 'r') as f: + def read_file(self, input): + # if input is a file name + if isinstance(input, str): + with open(input, 'r') as f: read_from_file = f.read() self._content = re.sub('<[^<]+?>', '', read_from_file) self._documents = re.split('.*?', read_from_file) @@ -60,25 +49,54 @@ def read_file(self, file_names): del self._documents[-1] pattern = r"(.*?)" self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL) + # if input is a folder name + elif isinstance(input, str): + import os + for file_name in os.listdir(input): + if file_name.endswith('.txt'): + with open(os.path.join(input, file_name), 'r') as f: + read_from_file = f.read() + self._content += read_from_file + self._documents.append(read_from_file) + self.titles.append(file_name) + if file_name.endswith('.pdf'): + with open(os.path.join(input, file_name), 'rb') as f: + reader = PdfReader(f) + read_from_file = "" + for page in reader.pages: + read_from_file += page.extract_text() + self._content += read_from_file + self._documents.append(read_from_file) + self.titles.append(file_name) + # if input is a url + elif isinstance(input, str): + response = requests.get(input) + if response.status_code == 200: + read_from_file = response.text + self._content = re.sub('<[^<]+?>', '', read_from_file) + self._documents = re.split('.*?', read_from_file) + # Delete the last blank record + del self._documents[-1] + pattern = r"(.*?)" + self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL) + else: + raise ValueError("Input must be a file name, folder name or url.") - """ - Combine duplicate topics using Dict - Currently supported only for single file. - """ - - doc_dict = {} - ct3 = 0 - for t in self._titles: - doc = doc_dict.get(t) - if doc: - doc_dict[t] = doc + self._documents[ct3] - else: - doc_dict[t] = self._documents[ct3] - ct3 += 1 - self._titles.clear() - self._documents.clear() - for t in doc_dict.keys(): - self._documents.append(doc_dict.get(t)) - self._titles.append(t) + """ + Combine duplicate topics using Dict + """ - f.close() + doc_dict = {} + ct3 = 0 + for t in self._titles: + doc = doc_dict.get(t) + if doc: + doc_dict[t] = doc + self._documents[ct3] + else: + doc_dict[t] = self._documents[ct3] + ct3 += 1 + self._titles.clear() + self._documents.clear() + for t in doc_dict.keys(): + self._documents.append(doc_dict.get(t)) + self._titles.append(t) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 4ad331d..4a07298 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -8,7 +8,7 @@ def corpus_fixture(): from src.qrmine import ReadData corpus = ReadData() file_path = resource_filename('src.qrmine.resources', 'interview.txt') - corpus.read_file([file_path]) + corpus.read_file(file_path) return corpus # instannce of Qrmine as fixture diff --git a/tests/test_readfiles.py b/tests/test_readfiles.py index aff3a5d..963ed90 100644 --- a/tests/test_readfiles.py +++ b/tests/test_readfiles.py @@ -8,8 +8,8 @@ def corpus_fixture(): from src.qrmine import ReadData corpus = ReadData() file_path = resource_filename('src.qrmine.resources', 'interview.txt') - corpus.read_file([file_path]) - return corpus + corpus.read_file(file_path) + return corpus def test_content(corpus_fixture): From da92557311b3dcd0c514bc8ff0f94112ca44de41 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 16:59:29 +0000 Subject: [PATCH 06/35] feat: update ReadData class to handle URL input by storing content and appending to documents --- src/qrmine/readfiles.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/qrmine/readfiles.py b/src/qrmine/readfiles.py index 522eaaa..a213ff7 100644 --- a/src/qrmine/readfiles.py +++ b/src/qrmine/readfiles.py @@ -73,12 +73,9 @@ def read_file(self, input): response = requests.get(input) if response.status_code == 200: read_from_file = response.text - self._content = re.sub('<[^<]+?>', '', read_from_file) - self._documents = re.split('.*?', read_from_file) - # Delete the last blank record - del self._documents[-1] - pattern = r"(.*?)" - self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL) + self._content = read_from_file + self._documents.append(read_from_file) + self.titles.append(input) else: raise ValueError("Input must be a file name, folder name or url.") From 9e2d7fb367c3466ae444c0302f8776c44b282f38 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 19:43:12 +0000 Subject: [PATCH 07/35] feat: add ClusterDocs class for semantic clustering and update tests for functionality --- pyproject.toml | 1 + setup.cfg | 1 + src/qrmine/__init__.py | 1 + src/qrmine/cluster.py | 60 ++++++++++++++++++++++++++++++++++++++++++ src/qrmine/content.py | 4 +++ test.py | 33 +++++++++++++++++++++++ tests/test_nlp.py | 13 +++++++++ 7 files changed, 113 insertions(+) create mode 100644 src/qrmine/cluster.py create mode 100644 test.py diff --git a/pyproject.toml b/pyproject.toml index abba09a..58ddc98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "torch", "pypdf", "requests", + "gensim", ] dynamic = ["version"] diff --git a/setup.cfg b/setup.cfg index 82c523d..d832a6e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -67,6 +67,7 @@ install_requires = torch pypdf requests + gensim [options.packages.find] where = src diff --git a/src/qrmine/__init__.py b/src/qrmine/__init__.py index 09a4e35..22d0eb9 100644 --- a/src/qrmine/__init__.py +++ b/src/qrmine/__init__.py @@ -6,6 +6,7 @@ from .readfiles import ReadData from .sentiment import Sentiment from .mlqrmine import MLQRMine +from .cluster import ClusterDocs if sys.version_info[:2] >= (3, 8): # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8` diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py new file mode 100644 index 0000000..e16266d --- /dev/null +++ b/src/qrmine/cluster.py @@ -0,0 +1,60 @@ +import spacy +from gensim import corpora +from gensim.models.ldamodel import LdaModel + +class ClusterDocs: + + def __init__(self, documents=[], titles=[]): + self._nlp = spacy.load("en_core_web_sm") + self._documents = documents + self._titles = titles + self._dictionary = None + self._corpus = None + # Apply preprocessing to each document + self._processed_docs = [self.preprocess(doc) for doc in documents] + self.process() + + @property + def documents(self): + return self._documents + + @property + def titles(self): + return self._titles + + @documents.setter + def documents(self, documents): + self._documents = documents + self._processed_docs = [self.preprocess(doc) for doc in documents] + self.process() + + @titles.setter + def titles(self, titles): + self._titles = titles + + # Preprocess the documents using spaCy + def preprocess(self, doc): + # Tokenize and preprocess each document + doc = self._nlp(doc) + # Lemmatize and remove stop words + tokens = [token.lemma_ for token in doc if not token.is_stop] + return tokens + + def process(self): + # Create a dictionary representation of the documents + self._dictionary = corpora.Dictionary(self._processed_docs) + # Create a bag-of-words representation of the documents + self._corpus = [self._dictionary.doc2bow(doc) for doc in self._processed_docs] + + def print_topics(self, num_topics=5, passes=15): + # Build the LDA (Latent Dirichlet Allocation) model + lda_model = LdaModel(self._corpus, num_topics=num_topics, id2word=self._dictionary, passes=passes) + # Print the topics and their corresponding words + print(lda_model.print_topics(num_words=5)) + + def print_clusters(self, num_topics=5, passes=15): + # Perform semantic clustering + lda_model = LdaModel(self._corpus, num_topics=num_topics, id2word=self._dictionary, passes=passes) + for i, doc in enumerate(self._processed_docs): # Changed from get_processed_docs() to _documents + bow = self._dictionary.doc2bow(doc) + print(f"Document {self._titles[i]} belongs to topic: {lda_model.get_document_topics(bow)}") diff --git a/src/qrmine/content.py b/src/qrmine/content.py index 3344a80..f9e6b0e 100644 --- a/src/qrmine/content.py +++ b/src/qrmine/content.py @@ -87,6 +87,10 @@ def idx(self, token): def doc(self): return self._processed + @property + def tokens(self): + return [token for token in self._processed if not token.is_stop and not token.is_punct and not token.is_space] + def process(self): for token in self._processed: if token.is_stop or token.is_digit or token.is_punct or token.is_space: diff --git a/test.py b/test.py new file mode 100644 index 0000000..a5c4b31 --- /dev/null +++ b/test.py @@ -0,0 +1,33 @@ +import spacy + +# Load spaCy model +nlp = spacy.load("en_core_web_sm") + +# Sample documents +documents = [ + "Natural language processing is a field of AI.", + "Topic modeling helps in uncovering the main themes in a collection of documents.", + "Semantic clustering groups similar documents together based on meaning.", + "SpaCy is a popular NLP library.", + "Gensim is commonly used for topic modeling.", +] + + +# Preprocess the documents using spaCy +def preprocess(doc): + # Tokenize and preprocess each document + doc = nlp(doc) + print(f"Original Document: {doc}") + # Lemmatize and remove stop words + tokens = [token.lemma_ for token in doc if not token.is_stop] + print(f"Processed Tokens: {tokens}") + return tokens + + +# Apply preprocessing to each document +processed_docs = [preprocess(doc) for doc in documents] + + +# Print the processed documents +for i, doc in enumerate(processed_docs): + print(f"Document {i + 1}: {doc}") \ No newline at end of file diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 4a07298..c94d03f 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -18,6 +18,12 @@ def q(): _q = Qrmine() return _q +@pytest.fixture +def cluster(): + from src.qrmine import ClusterDocs + _cluster = ClusterDocs() + return _cluster + # Ref: https://docs.pytest.org/en/latest/capture.html def test_generate_dict(corpus_fixture, capsys, q): from src.qrmine import Content @@ -50,6 +56,13 @@ def test_category_association(corpus_fixture, capsys, q): print(captured.out) assert 'theory' in captured.out +def test_cluster_topics(corpus_fixture, capsys, cluster): + cluster.documents = corpus_fixture.documents + cluster.titles = corpus_fixture.titles + cluster.print_clusters() + captured = capsys.readouterr() + print(captured.out) + assert 'Document' in captured.out From 829214c4daf8b9fa87060830431be36b46a40485 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 20:01:18 +0000 Subject: [PATCH 08/35] feat: enhance ClusterDocs class with num_topics and passes properties, update LDA model methods; add tests for topic printing --- src/qrmine/cluster.py | 42 ++++++++++++++++++++++++++++++++++-------- tests/test_nlp.py | 4 ++++ 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py index e16266d..495f40d 100644 --- a/src/qrmine/cluster.py +++ b/src/qrmine/cluster.py @@ -1,15 +1,18 @@ import spacy from gensim import corpora from gensim.models.ldamodel import LdaModel - +from pprint import pprint class ClusterDocs: def __init__(self, documents=[], titles=[]): self._nlp = spacy.load("en_core_web_sm") self._documents = documents self._titles = titles + self._num_topics = 5 + self._passes = 15 self._dictionary = None self._corpus = None + self._lda_model = None # Apply preprocessing to each document self._processed_docs = [self.preprocess(doc) for doc in documents] self.process() @@ -22,6 +25,14 @@ def documents(self): def titles(self): return self._titles + @property + def num_topics(self): + return self._num_topics + + @property + def passes(self): + return self._passes + @documents.setter def documents(self, documents): self._documents = documents @@ -32,6 +43,14 @@ def documents(self, documents): def titles(self, titles): self._titles = titles + @num_topics.setter + def num_topics(self, num_topics): + self._num_topics = num_topics + + @passes.setter + def passes(self, passes): + self._passes = passes + # Preprocess the documents using spaCy def preprocess(self, doc): # Tokenize and preprocess each document @@ -45,16 +64,23 @@ def process(self): self._dictionary = corpora.Dictionary(self._processed_docs) # Create a bag-of-words representation of the documents self._corpus = [self._dictionary.doc2bow(doc) for doc in self._processed_docs] - - def print_topics(self, num_topics=5, passes=15): # Build the LDA (Latent Dirichlet Allocation) model - lda_model = LdaModel(self._corpus, num_topics=num_topics, id2word=self._dictionary, passes=passes) + + def build_lda_model(self): + self._lda_model = LdaModel( + self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes + ) + + def print_topics(self, num_words=5): + if self._lda_model is None: + self.build_lda_model() # Print the topics and their corresponding words - print(lda_model.print_topics(num_words=5)) + pprint(self._lda_model.print_topics(num_words=num_words)) - def print_clusters(self, num_topics=5, passes=15): + def print_clusters(self): + if self._lda_model is None: + self.build_lda_model() # Perform semantic clustering - lda_model = LdaModel(self._corpus, num_topics=num_topics, id2word=self._dictionary, passes=passes) for i, doc in enumerate(self._processed_docs): # Changed from get_processed_docs() to _documents bow = self._dictionary.doc2bow(doc) - print(f"Document {self._titles[i]} belongs to topic: {lda_model.get_document_topics(bow)}") + print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}") diff --git a/tests/test_nlp.py b/tests/test_nlp.py index c94d03f..4576da5 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -63,6 +63,10 @@ def test_cluster_topics(corpus_fixture, capsys, cluster): captured = capsys.readouterr() print(captured.out) assert 'Document' in captured.out + cluster.print_topics() + captured = capsys.readouterr() + print(captured.out) + assert 'topic' in captured.out From 03def281510c6a14fd062050c51d89e3951119a7 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 20:18:25 +0000 Subject: [PATCH 09/35] feat: add format_topics_sentences method to ClusterDocs for topic formatting; update tests to validate output structure --- src/qrmine/cluster.py | 31 +++++++++++++++++++++++++++++++ tests/test_nlp.py | 17 +++++++++++++---- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py index 495f40d..fb622df 100644 --- a/src/qrmine/cluster.py +++ b/src/qrmine/cluster.py @@ -1,6 +1,7 @@ import spacy from gensim import corpora from gensim.models.ldamodel import LdaModel +import pandas as pd from pprint import pprint class ClusterDocs: @@ -67,6 +68,8 @@ def process(self): # Build the LDA (Latent Dirichlet Allocation) model def build_lda_model(self): + if self._lda_model is not None: + return self._lda_model = LdaModel( self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes ) @@ -84,3 +87,31 @@ def print_clusters(self): for i, doc in enumerate(self._processed_docs): # Changed from get_processed_docs() to _documents bow = self._dictionary.doc2bow(doc) print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}") + + + def format_topics_sentences(self): + self.build_lda_model() + # Init output + sent_topics_df = pd.DataFrame() + + # Get main topic in each document + for i, row_list in enumerate(self._lda_model[self._corpus]): + row = row_list[0] if self._lda_model.per_word_topics else row_list + # print(row) + row = sorted(row, key=lambda x: (x[1]), reverse=True) + # Get the Dominant topic, Perc Contribution and Keywords for each document + for j, (topic_num, prop_topic) in enumerate(row): + if j == 0: # => dominant topic + wp = self._lda_model.show_topic(topic_num) + topic_keywords = ", ".join([word for word, prop in wp]) + new_row = pd.DataFrame([[int(topic_num), round(prop_topic, 4), topic_keywords]], + columns=["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]) + sent_topics_df = pd.concat([sent_topics_df, new_row], ignore_index=True) + else: + break + sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"] + + # Add original text to the end of the output + contents = pd.Series(self._processed_docs) + sent_topics_df = pd.concat([sent_topics_df, contents], axis=1) + return sent_topics_df diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 4576da5..9220aa9 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -1,7 +1,6 @@ import pytest - @pytest.fixture def corpus_fixture(): from pkg_resources import resource_filename @@ -67,6 +66,16 @@ def test_cluster_topics(corpus_fixture, capsys, cluster): captured = capsys.readouterr() print(captured.out) assert 'topic' in captured.out - - - + # Format + df_topic_sents_keywords = cluster.format_topics_sentences() + # Format the output + df_dominant_topic = df_topic_sents_keywords.reset_index() + df_dominant_topic.columns = [ + "Document_No", + "Dominant_Topic", + "Topic_Perc_Contrib", + "Keywords", + "Text", + ] + print(df_dominant_topic.head(10)) + assert 'Document_No' in df_dominant_topic.columns From 1fba6578f1570698dee2aa3dba805f6c0d0df53b Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 20:35:17 +0000 Subject: [PATCH 10/35] feat: add most_representative_docs method to ClusterDocs for retrieving top documents by topic; update tests for new functionality --- src/qrmine/cluster.py | 20 ++++++++++++++++++-- tests/test_nlp.py | 7 +++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py index fb622df..34f583b 100644 --- a/src/qrmine/cluster.py +++ b/src/qrmine/cluster.py @@ -88,7 +88,6 @@ def print_clusters(self): bow = self._dictionary.doc2bow(doc) print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}") - def format_topics_sentences(self): self.build_lda_model() # Init output @@ -114,4 +113,21 @@ def format_topics_sentences(self): # Add original text to the end of the output contents = pd.Series(self._processed_docs) sent_topics_df = pd.concat([sent_topics_df, contents], axis=1) - return sent_topics_df + return sent_topics_df.reset_index(drop=False) + + # https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/ + def most_representative_docs(self): + sent_topics_df = self.format_topics_sentences() + sent_topics_sorteddf_mallet = pd.DataFrame() + sent_topics_outdf_grpd = sent_topics_df.groupby("Dominant_Topic") + + for i, grp in sent_topics_outdf_grpd: + sent_topics_sorteddf_mallet = pd.concat( + [ + sent_topics_sorteddf_mallet, + grp.sort_values(["Perc_Contribution"], ascending=False).head(1), + ], + axis=0, + ) + + return sent_topics_sorteddf_mallet diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 9220aa9..c2ef347 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -67,9 +67,8 @@ def test_cluster_topics(corpus_fixture, capsys, cluster): print(captured.out) assert 'topic' in captured.out # Format - df_topic_sents_keywords = cluster.format_topics_sentences() + df_dominant_topic = cluster.format_topics_sentences() # Format the output - df_dominant_topic = df_topic_sents_keywords.reset_index() df_dominant_topic.columns = [ "Document_No", "Dominant_Topic", @@ -79,3 +78,7 @@ def test_cluster_topics(corpus_fixture, capsys, cluster): ] print(df_dominant_topic.head(10)) assert 'Document_No' in df_dominant_topic.columns + + df_sorted = cluster.most_representative_docs() + print(df_sorted.head(10)) + assert 'Dominant_Topic' in df_sorted.columns \ No newline at end of file From 8402717c95e1f1b8934c9cfd977de0758f04c550 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 20:38:05 +0000 Subject: [PATCH 11/35] feat: add seaborn to dependencies and create visualize.py for data visualization --- pyproject.toml | 1 + setup.cfg | 1 + src/qrmine/visualize.py | 5 +++++ 3 files changed, 7 insertions(+) create mode 100644 src/qrmine/visualize.py diff --git a/pyproject.toml b/pyproject.toml index 58ddc98..4e87878 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "pypdf", "requests", "gensim", + "seaborn", ] dynamic = ["version"] diff --git a/setup.cfg b/setup.cfg index d832a6e..f0582c1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -68,6 +68,7 @@ install_requires = pypdf requests gensim + seaborn [options.packages.find] where = src diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py new file mode 100644 index 0000000..fe24032 --- /dev/null +++ b/src/qrmine/visualize.py @@ -0,0 +1,5 @@ +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + + From d3f41bf208ef18206307a8402ebd29e59952584c Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 20:54:30 +0000 Subject: [PATCH 12/35] feat: enhance QRVisualize class with frequency distribution plotting and add corresponding tests --- src/qrmine/__init__.py | 1 + src/qrmine/resources/df_dominant_topic.csv | 12 +++++++++ src/qrmine/visualize.py | 31 ++++++++++++++++++++++ tests/test_visualize.py | 17 ++++++++++++ 4 files changed, 61 insertions(+) create mode 100644 src/qrmine/resources/df_dominant_topic.csv create mode 100644 tests/test_visualize.py diff --git a/src/qrmine/__init__.py b/src/qrmine/__init__.py index 22d0eb9..3549721 100644 --- a/src/qrmine/__init__.py +++ b/src/qrmine/__init__.py @@ -7,6 +7,7 @@ from .sentiment import Sentiment from .mlqrmine import MLQRMine from .cluster import ClusterDocs +from .visualize import QRVisualize if sys.version_info[:2] >= (3, 8): # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8` diff --git a/src/qrmine/resources/df_dominant_topic.csv b/src/qrmine/resources/df_dominant_topic.csv new file mode 100644 index 0000000..115eb63 --- /dev/null +++ b/src/qrmine/resources/df_dominant_topic.csv @@ -0,0 +1,12 @@ +,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text +0,0,4,0.9903,"., GT, Strauss, ,, coding, +, ), Theory, seminal, (","['ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']" +1,1,1,0.7811,",, theory, ., GT, evaluation, structure, coding, +, ), (","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']" +2,2,1,0.9783,",, theory, ., GT, evaluation, structure, coding, +, ), (","['\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n']" +3,3,3,0.9952,"., ,, coding, category, open, QRMine, datum, researcher, code, GT","['\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n']" +4,4,4,0.9793,"., GT, Strauss, ,, coding, +, ), Theory, seminal, (","['\n', 'ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n']" +5,5,2,0.9712,"category, comparison, incident, ,, +, involve, refine, identify, emergence, constant","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n']" diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index fe24032..ecf9024 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -1,5 +1,36 @@ import pandas as pd import matplotlib.pyplot as plt import seaborn as sns +import numpy as np +class QRVisualize: + def __init__(self, data: pd.DataFrame): + """ + Initialize the QRVisualize class with a DataFrame. + Parameters: + data (pd.DataFrame): The DataFrame containing the data to visualize. + """ + self.data = data + + def plot_frequency_distribution_of_words(self, df, folder_path=None): + doc_lens = [len(d) for d in df.Text] + + # Plot + plt.figure(figsize=(16,7), dpi=160) + plt.hist(doc_lens, bins = 1000, color='navy') + plt.text(750, 100, "Mean : " + str(round(np.mean(doc_lens)))) + plt.text(750, 90, "Median : " + str(round(np.median(doc_lens)))) + plt.text(750, 80, "Stdev : " + str(round(np.std(doc_lens)))) + plt.text(750, 70, "1%ile : " + str(round(np.quantile(doc_lens, q=0.01)))) + plt.text(750, 60, "99%ile : " + str(round(np.quantile(doc_lens, q=0.99)))) + + plt.gca().set(xlim=(0, 1000), ylabel='Number of Documents', xlabel='Document Word Count') + plt.tick_params(size=16) + plt.xticks(np.linspace(0,1000,9)) + plt.title('Distribution of Document Word Counts', fontdict=dict(size=22)) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() diff --git a/tests/test_visualize.py b/tests/test_visualize.py new file mode 100644 index 0000000..58982e0 --- /dev/null +++ b/tests/test_visualize.py @@ -0,0 +1,17 @@ +import pytest +import pandas as pd +from src.qrmine.visualize import QRVisualize + +@pytest.fixture +def v(): + from pkg_resources import resource_filename + file_path = resource_filename("src.qrmine.resources", "df_dominant_topic.csv") + data = pd.read_csv(file_path) + _v = QRVisualize(data) + return _v + +def test_frequency_distribution_of_words(v, capsys): + v.plot_frequency_distribution_of_words(v.data, folder_path='/tmp/frequency_distribution.png') + captured = capsys.readouterr() + print(captured.out) + From 38a8481f3cf505fff0854dd57fee2034daf69e40 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 20:56:14 +0000 Subject: [PATCH 13/35] feat: update QRVisualize constructor to accept optional DataFrame and modify plot method to use instance data if no DataFrame is provided --- src/qrmine/visualize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index ecf9024..e9f3769 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -4,7 +4,7 @@ import numpy as np class QRVisualize: - def __init__(self, data: pd.DataFrame): + def __init__(self, data: pd.DataFrame = None): """ Initialize the QRVisualize class with a DataFrame. @@ -13,7 +13,9 @@ def __init__(self, data: pd.DataFrame): """ self.data = data - def plot_frequency_distribution_of_words(self, df, folder_path=None): + def plot_frequency_distribution_of_words(self, df=None, folder_path=None): + if df is None: + df = self.data doc_lens = [len(d) for d in df.Text] # Plot From 60572ef87706a706dedeadb7444e2dffc968a930 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 21:00:30 +0000 Subject: [PATCH 14/35] feat: add plot_distribution_by_topic method to QRVisualize for visualizing document word counts by dominant topic; update tests accordingly --- src/qrmine/visualize.py | 29 +++++++++++++++++++++++++++++ tests/test_visualize.py | 4 ++++ 2 files changed, 33 insertions(+) diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index e9f3769..3bb0548 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -2,6 +2,7 @@ import matplotlib.pyplot as plt import seaborn as sns import numpy as np +import matplotlib.colors as mcolors class QRVisualize: def __init__(self, data: pd.DataFrame = None): @@ -36,3 +37,31 @@ def plot_frequency_distribution_of_words(self, df=None, folder_path=None): if folder_path: plt.savefig(folder_path) plt.close() + + def plot_distribution_by_topic(self, df=None, folder_path=None): + if df is None: + df = self.data + # Plot + cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS' + + fig, axes = plt.subplots(2,2,figsize=(16,14), dpi=160, sharex=True, sharey=True) + + for i, ax in enumerate(axes.flatten()): + df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :] + doc_lens = [len(d) for d in df_dominant_topic_sub.Text] + ax.hist(doc_lens, bins = 1000, color=cols[i]) + ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i]) + sns.kdeplot(doc_lens, color="black", shade=False, ax=ax.twinx()) + ax.set(xlim=(0, 1000), xlabel='Document Word Count') + ax.set_ylabel('Number of Documents', color=cols[i]) + ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=cols[i])) + + fig.tight_layout() + fig.subplots_adjust(top=0.90) + plt.xticks(np.linspace(0,1000,9)) + fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() \ No newline at end of file diff --git a/tests/test_visualize.py b/tests/test_visualize.py index 58982e0..3f22d79 100644 --- a/tests/test_visualize.py +++ b/tests/test_visualize.py @@ -15,3 +15,7 @@ def test_frequency_distribution_of_words(v, capsys): captured = capsys.readouterr() print(captured.out) +def test_distribution_by_topic(v, capsys): + v.plot_distribution_by_topic(v.data, folder_path='/tmp/distribution_by_topic.png') + captured = capsys.readouterr() + print(captured.out) From 4d30f7cdaf9fe82890a34bb0b7903e67de87f6b9 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 21:01:46 +0000 Subject: [PATCH 15/35] feat: add wordcloud to dependencies for enhanced visualization capabilities --- pyproject.toml | 1 + setup.cfg | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 4e87878..03ef32a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "requests", "gensim", "seaborn", + "wordcloud", ] dynamic = ["version"] diff --git a/setup.cfg b/setup.cfg index f0582c1..c85767c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -69,6 +69,7 @@ install_requires = requests gensim seaborn + wordcloud [options.packages.find] where = src From 28ee69bacbf15d7a235947c734a51e22a444a233 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 21:12:45 +0000 Subject: [PATCH 16/35] feat: refactor build_lda_model method to return topics and improve logic; add topics fixture for testing --- src/qrmine/cluster.py | 12 +++---- src/qrmine/visualize.py | 4 ++- tests/test_nlp.py | 2 ++ tests/test_visualize.py | 80 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py index 34f583b..af36476 100644 --- a/src/qrmine/cluster.py +++ b/src/qrmine/cluster.py @@ -68,12 +68,12 @@ def process(self): # Build the LDA (Latent Dirichlet Allocation) model def build_lda_model(self): - if self._lda_model is not None: - return - self._lda_model = LdaModel( - self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes - ) - + if self._lda_model is None: + self._lda_model = LdaModel( + self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes + ) + return self._lda_model.show_topics(formatted=False) + def print_topics(self, num_words=5): if self._lda_model is None: self.build_lda_model() diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index 3bb0548..04531cb 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -3,6 +3,8 @@ import seaborn as sns import numpy as np import matplotlib.colors as mcolors +from wordcloud import WordCloud, STOPWORDS + class QRVisualize: def __init__(self, data: pd.DataFrame = None): @@ -64,4 +66,4 @@ def plot_distribution_by_topic(self, df=None, folder_path=None): # save if folder_path: plt.savefig(folder_path) - plt.close() \ No newline at end of file + plt.close() diff --git a/tests/test_nlp.py b/tests/test_nlp.py index c2ef347..ad82495 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -66,6 +66,8 @@ def test_cluster_topics(corpus_fixture, capsys, cluster): captured = capsys.readouterr() print(captured.out) assert 'topic' in captured.out + + print(cluster.build_lda_model()) # Format df_dominant_topic = cluster.format_topics_sentences() # Format the output diff --git a/tests/test_visualize.py b/tests/test_visualize.py index 3f22d79..67105ee 100644 --- a/tests/test_visualize.py +++ b/tests/test_visualize.py @@ -10,6 +10,86 @@ def v(): _v = QRVisualize(data) return _v +@pytest.fixture +def topics(): + return [ + ( + 0, + [ + (".", 0.095292516), + (",", 0.053392828), + ("category", 0.032462463), + ("coding", 0.032456465), + ("open", 0.032437164), + ("QRMine", 0.03243305), + ("datum", 0.021980358), + ("researcher", 0.021978099), + ("theory", 0.011536299), + ("GT", 0.011533132), + ], + ), + ( + 1, + [ + (".", 0.007783216), + (",", 0.007773952), + ("open", 0.007728422), + ("researcher", 0.0077227736), + ("coding", 0.007722049), + ("category", 0.007721938), + ("datum", 0.007717547), + ("QRMine", 0.007716193), + ("dissect", 0.0077070068), + ("support", 0.0077060354), + ], + ), + ( + 2, + [ + (",", 0.05126711), + (".", 0.05125151), + ("theory", 0.038604487), + ("category", 0.03227912), + ("GT", 0.032278605), + ("\n", 0.029119665), + ("comparison", 0.025947908), + ("coding", 0.025941858), + ("incident", 0.019622542), + (")", 0.019619444), + ], + ), + ( + 3, + [ + (".", 0.007849805), + (",", 0.007837688), + ("theory", 0.00781459), + ("coding", 0.0078089647), + ("category", 0.0077514737), + ("GT", 0.0077493717), + ("datum", 0.007742789), + ("open", 0.0077355755), + ("\n", 0.0077245855), + ("researcher", 0.0077191954), + ], + ), + ( + 4, + [ + (",", 0.007834569), + (".", 0.007812336), + ("coding", 0.0077863215), + ("category", 0.007759207), + ("theory", 0.0077459146), + ("GT", 0.0077370973), + ("code", 0.0077265715), + ("datum", 0.007720947), + ("open", 0.007720898), + ("comparison", 0.007720567), + ], + ), + ] + def test_frequency_distribution_of_words(v, capsys): v.plot_frequency_distribution_of_words(v.data, folder_path='/tmp/frequency_distribution.png') captured = capsys.readouterr() From 643d4698b233e89257e711051505ac486572b9c4 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 21:21:53 +0000 Subject: [PATCH 17/35] feat: add plot_wordcloud method to QRVisualize for visualizing topics; include corresponding test --- src/qrmine/visualize.py | 32 ++++++++++++++++++++++++++++++++ tests/test_visualize.py | 5 +++++ 2 files changed, 37 insertions(+) diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index 04531cb..49dfc05 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -67,3 +67,35 @@ def plot_distribution_by_topic(self, df=None, folder_path=None): if folder_path: plt.savefig(folder_path) plt.close() + + def plot_wordcloud(self, topics=None, folder_path=None): + cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS' + + cloud = WordCloud(stopwords=STOPWORDS, + background_color='white', + width=2500, + height=1800, + max_words=10, + colormap='tab10', + color_func=lambda *args, **kwargs: cols[i], + prefer_horizontal=1.0) + + fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True) + + for i, ax in enumerate(axes.flatten()): + fig.add_subplot(ax) + topic_words = dict(topics[i][1]) + cloud.generate_from_frequencies(topic_words, max_font_size=300) + plt.gca().imshow(cloud) + plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16)) + plt.gca().axis('off') + + plt.subplots_adjust(wspace=0, hspace=0) + plt.axis("off") + plt.margins(x=0, y=0) + plt.tight_layout() + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() \ No newline at end of file diff --git a/tests/test_visualize.py b/tests/test_visualize.py index 67105ee..ae35c39 100644 --- a/tests/test_visualize.py +++ b/tests/test_visualize.py @@ -99,3 +99,8 @@ def test_distribution_by_topic(v, capsys): v.plot_distribution_by_topic(v.data, folder_path='/tmp/distribution_by_topic.png') captured = capsys.readouterr() print(captured.out) + +def test_plot_wordcloud(v, topics, capsys): + v.plot_wordcloud(topics, folder_path='/tmp/wordcloud.png') + captured = capsys.readouterr() + print(captured.out) From bb5699655fdfba9497c37f1b65d74aac3492e16d Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 21:25:51 +0000 Subject: [PATCH 18/35] feat: update plot_wordcloud method parameters for improved visualization; reduce size and max words --- src/qrmine/visualize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index 49dfc05..6daca87 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -73,9 +73,9 @@ def plot_wordcloud(self, topics=None, folder_path=None): cloud = WordCloud(stopwords=STOPWORDS, background_color='white', - width=2500, - height=1800, - max_words=10, + width=250, + height=180, + max_words=5, colormap='tab10', color_func=lambda *args, **kwargs: cols[i], prefer_horizontal=1.0) From fc49b94be83c4c56b5bea6888d3f9a70cf33bcdf Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 21:46:33 +0000 Subject: [PATCH 19/35] feat: add processed_docs property and topics_per_document method to ClusterDocs; enhance document processing capabilities --- src/qrmine/cluster.py | 18 +++++- src/qrmine/visualize.py | 134 +++++++++++++++++++++++++++++++++++++++- tests/test_nlp.py | 2 + 3 files changed, 151 insertions(+), 3 deletions(-) diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py index af36476..4370d8a 100644 --- a/src/qrmine/cluster.py +++ b/src/qrmine/cluster.py @@ -34,6 +34,10 @@ def num_topics(self): def passes(self): return self._passes + @property + def processed_docs(self): + return self._processed_docs + @documents.setter def documents(self, documents): self._documents = documents @@ -73,7 +77,7 @@ def build_lda_model(self): self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes ) return self._lda_model.show_topics(formatted=False) - + def print_topics(self, num_words=5): if self._lda_model is None: self.build_lda_model() @@ -131,3 +135,15 @@ def most_representative_docs(self): ) return sent_topics_sorteddf_mallet + + + def topics_per_document(self, start=0, end=1): + corpus_sel = self._corpus[start:end] + dominant_topics = [] + topic_percentages = [] + for i, corp in enumerate(corpus_sel): + topic_percs = self._lda_model[corp] + dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0] + dominant_topics.append((i, dominant_topic)) + topic_percentages.append(topic_percs) + return (dominant_topics, topic_percentages) diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index 6daca87..bce60e5 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -4,7 +4,9 @@ import numpy as np import matplotlib.colors as mcolors from wordcloud import WordCloud, STOPWORDS - +from collections import Counter +from matplotlib.patches import Rectangle +from sklearn.manifold import TSNE class QRVisualize: def __init__(self, data: pd.DataFrame = None): @@ -98,4 +100,132 @@ def plot_wordcloud(self, topics=None, folder_path=None): # save if folder_path: plt.savefig(folder_path) - plt.close() \ No newline at end of file + plt.close() + + def plot_importance(self, topics=None, processed_docs=None, folder_path=None): + data_flat = [w for w_list in processed_docs for w in w_list] + counter = Counter(data_flat) + + out = [] + for i, topic in topics: + for word, weight in topic: + out.append([word, i, weight, counter[word]]) + + df = pd.DataFrame(out, columns=["word", "topic_id", "importance", "word_count"]) + + # Plot Word Count and Weights of Topic Keywords + fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160) + cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] + for i, ax in enumerate(axes.flatten()): + ax.bar( + x="word", + height="word_count", + data=df.loc[df.topic_id == i, :], + color=cols[i], + width=0.5, + alpha=0.3, + label="Word Count", + ) + ax_twin = ax.twinx() + ax_twin.bar( + x="word", + height="importance", + data=df.loc[df.topic_id == i, :], + color=cols[i], + width=0.2, + label="Weights", + ) + ax.set_ylabel("Word Count", color=cols[i]) + ax_twin.set_ylim(0, 0.030) + ax.set_ylim(0, 3500) + ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16) + ax.tick_params(axis="y", left=False) + ax.set_xticklabels( + df.loc[df.topic_id == i, "word"], rotation=30, horizontalalignment="right" + ) + ax.legend(loc="upper left") + ax_twin.legend(loc="upper right") + + fig.tight_layout(w_pad=2) + fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + + def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13): + corp = corpus[start:end] + mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()] + + fig, axes = plt.subplots(end-start, 1, figsize=(20, (end-start)*0.95), dpi=160) + axes[0].axis('off') + for i, ax in enumerate(axes): + if i > 0: + corp_cur = corp[i-1] + topic_percs, wordid_topics, wordid_phivalues = lda_model[corp_cur] + word_dominanttopic = [(lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics] + ax.text(0.01, 0.5, "Doc " + str(i-1) + ": ", verticalalignment='center', + fontsize=16, color='black', transform=ax.transAxes, fontweight=700) + + # Draw Rectange + topic_percs_sorted = sorted(topic_percs, key=lambda x: (x[1]), reverse=True) + ax.add_patch(Rectangle((0.0, 0.05), 0.99, 0.90, fill=None, alpha=1, + color=mycolors[topic_percs_sorted[0][0]], linewidth=2)) + + word_pos = 0.06 + for j, (word, topics) in enumerate(word_dominanttopic): + if j < 14: + ax.text(word_pos, 0.5, word, + horizontalalignment='left', + verticalalignment='center', + fontsize=16, color=mycolors[topics], + transform=ax.transAxes, fontweight=700) + word_pos += .009 * len(word) # to move the word for the next iter + ax.axis('off') + ax.text(word_pos, 0.5, '. . .', + horizontalalignment='left', + verticalalignment='center', + fontsize=16, color='black', + transform=ax.transAxes) + + plt.subplots_adjust(wspace=0, hspace=0) + plt.suptitle('Sentence Topic Coloring for Documents: ' + str(start) + ' to ' + str(end-2), fontsize=22, y=0.95, fontweight=700) + plt.tight_layout() + plt.show() + + def cluster_chart(self, lda_model=None, corpus=None, n_topics=4, folder_path=None): + # Get topic weights + topic_weights = [] + for i, row_list in enumerate(lda_model[corpus]): + topic_weights.append([w for i, w in row_list[0]]) + + # Array of topic weights + arr = pd.DataFrame(topic_weights).fillna(0).values + + # Keep the well separated points (optional) + arr = arr[np.amax(arr, axis=1) > 0.35] + + # Dominant topic number in each doc + topic_num = np.argmax(arr, axis=1) + + # tSNE Dimension Reduction + tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca') + tsne_lda = tsne_model.fit_transform(arr) + + + # Plot + plt.figure(figsize=(16, 10), dpi=160) + for i in range(n_topics): + plt.scatter(tsne_lda[topic_num == i, 0], tsne_lda[topic_num == i, 1], label=str(i), alpha=0.5) + plt.title('t-SNE Clustering of Topics', fontsize=22) + plt.xlabel('t-SNE Dimension 1', fontsize=16) + plt.ylabel('t-SNE Dimension 2', fontsize=16) + plt.legend(title='Topic Number', loc='upper right') + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + diff --git a/tests/test_nlp.py b/tests/test_nlp.py index ad82495..17b4c52 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -68,6 +68,8 @@ def test_cluster_topics(corpus_fixture, capsys, cluster): assert 'topic' in captured.out print(cluster.build_lda_model()) + + print(cluster.topics_per_document()) # Format df_dominant_topic = cluster.format_topics_sentences() # Format the output From ddc6b96a025000ae6969a0919c44477783236669 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 21:56:32 +0000 Subject: [PATCH 20/35] feat: add copyright notice and improve code formatting in cluster and visualize modules; enhance test readability in test files --- src/qrmine/cluster.py | 60 ++++++-- src/qrmine/visualize.py | 297 ++++++++++++++++++++++++++++++---------- tests/test_nlp.py | 29 ++-- tests/test_visualize.py | 14 +- 4 files changed, 309 insertions(+), 91 deletions(-) diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py index 4370d8a..3e68ac3 100644 --- a/src/qrmine/cluster.py +++ b/src/qrmine/cluster.py @@ -1,8 +1,30 @@ +""" +Copyright (C) 2025 Bell Eapen + +This file is part of qrmine. + +qrmine is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +qrmine is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with qrmine. If not, see . +""" + +from pprint import pprint + +import pandas as pd import spacy from gensim import corpora from gensim.models.ldamodel import LdaModel -import pandas as pd -from pprint import pprint + + class ClusterDocs: def __init__(self, documents=[], titles=[]): @@ -74,7 +96,10 @@ def process(self): def build_lda_model(self): if self._lda_model is None: self._lda_model = LdaModel( - self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes + self._corpus, + num_topics=self._num_topics, + id2word=self._dictionary, + passes=self._passes, ) return self._lda_model.show_topics(formatted=False) @@ -88,9 +113,13 @@ def print_clusters(self): if self._lda_model is None: self.build_lda_model() # Perform semantic clustering - for i, doc in enumerate(self._processed_docs): # Changed from get_processed_docs() to _documents + for i, doc in enumerate( + self._processed_docs + ): # Changed from get_processed_docs() to _documents bow = self._dictionary.doc2bow(doc) - print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}") + print( + f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}" + ) def format_topics_sentences(self): self.build_lda_model() @@ -107,12 +136,24 @@ def format_topics_sentences(self): if j == 0: # => dominant topic wp = self._lda_model.show_topic(topic_num) topic_keywords = ", ".join([word for word, prop in wp]) - new_row = pd.DataFrame([[int(topic_num), round(prop_topic, 4), topic_keywords]], - columns=["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]) - sent_topics_df = pd.concat([sent_topics_df, new_row], ignore_index=True) + new_row = pd.DataFrame( + [[int(topic_num), round(prop_topic, 4), topic_keywords]], + columns=[ + "Dominant_Topic", + "Perc_Contribution", + "Topic_Keywords", + ], + ) + sent_topics_df = pd.concat( + [sent_topics_df, new_row], ignore_index=True + ) else: break - sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"] + sent_topics_df.columns = [ + "Dominant_Topic", + "Perc_Contribution", + "Topic_Keywords", + ] # Add original text to the end of the output contents = pd.Series(self._processed_docs) @@ -136,7 +177,6 @@ def most_representative_docs(self): return sent_topics_sorteddf_mallet - def topics_per_document(self, start=0, end=1): corpus_sel = self._corpus[start:end] dominant_topics = [] diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index bce60e5..4a7fc25 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -1,12 +1,34 @@ -import pandas as pd +""" +Copyright (C) 2025 Bell Eapen + +This file is part of qrmine. + +qrmine is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +qrmine is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with qrmine. If not, see . +""" + +from collections import Counter + +import matplotlib.colors as mcolors import matplotlib.pyplot as plt -import seaborn as sns import numpy as np -import matplotlib.colors as mcolors -from wordcloud import WordCloud, STOPWORDS -from collections import Counter +import pandas as pd +import seaborn as sns from matplotlib.patches import Rectangle +from matplotlib.ticker import FuncFormatter from sklearn.manifold import TSNE +from wordcloud import STOPWORDS, WordCloud + class QRVisualize: def __init__(self, data: pd.DataFrame = None): @@ -24,18 +46,20 @@ def plot_frequency_distribution_of_words(self, df=None, folder_path=None): doc_lens = [len(d) for d in df.Text] # Plot - plt.figure(figsize=(16,7), dpi=160) - plt.hist(doc_lens, bins = 1000, color='navy') + plt.figure(figsize=(16, 7), dpi=160) + plt.hist(doc_lens, bins=1000, color="navy") plt.text(750, 100, "Mean : " + str(round(np.mean(doc_lens)))) - plt.text(750, 90, "Median : " + str(round(np.median(doc_lens)))) - plt.text(750, 80, "Stdev : " + str(round(np.std(doc_lens)))) - plt.text(750, 70, "1%ile : " + str(round(np.quantile(doc_lens, q=0.01)))) - plt.text(750, 60, "99%ile : " + str(round(np.quantile(doc_lens, q=0.99)))) - - plt.gca().set(xlim=(0, 1000), ylabel='Number of Documents', xlabel='Document Word Count') + plt.text(750, 90, "Median : " + str(round(np.median(doc_lens)))) + plt.text(750, 80, "Stdev : " + str(round(np.std(doc_lens)))) + plt.text(750, 70, "1%ile : " + str(round(np.quantile(doc_lens, q=0.01)))) + plt.text(750, 60, "99%ile : " + str(round(np.quantile(doc_lens, q=0.99)))) + + plt.gca().set( + xlim=(0, 1000), ylabel="Number of Documents", xlabel="Document Word Count" + ) plt.tick_params(size=16) - plt.xticks(np.linspace(0,1000,9)) - plt.title('Distribution of Document Word Counts', fontdict=dict(size=22)) + plt.xticks(np.linspace(0, 1000, 9)) + plt.title("Distribution of Document Word Counts", fontdict=dict(size=22)) plt.show() # save if folder_path: @@ -46,24 +70,30 @@ def plot_distribution_by_topic(self, df=None, folder_path=None): if df is None: df = self.data # Plot - cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS' + cols = [ + color for name, color in mcolors.TABLEAU_COLORS.items() + ] # more colors: 'mcolors.XKCD_COLORS' - fig, axes = plt.subplots(2,2,figsize=(16,14), dpi=160, sharex=True, sharey=True) + fig, axes = plt.subplots( + 2, 2, figsize=(16, 14), dpi=160, sharex=True, sharey=True + ) for i, ax in enumerate(axes.flatten()): df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :] doc_lens = [len(d) for d in df_dominant_topic_sub.Text] - ax.hist(doc_lens, bins = 1000, color=cols[i]) - ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i]) + ax.hist(doc_lens, bins=1000, color=cols[i]) + ax.tick_params(axis="y", labelcolor=cols[i], color=cols[i]) sns.kdeplot(doc_lens, color="black", shade=False, ax=ax.twinx()) - ax.set(xlim=(0, 1000), xlabel='Document Word Count') - ax.set_ylabel('Number of Documents', color=cols[i]) - ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=cols[i])) + ax.set(xlim=(0, 1000), xlabel="Document Word Count") + ax.set_ylabel("Number of Documents", color=cols[i]) + ax.set_title("Topic: " + str(i), fontdict=dict(size=16, color=cols[i])) fig.tight_layout() fig.subplots_adjust(top=0.90) - plt.xticks(np.linspace(0,1000,9)) - fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22) + plt.xticks(np.linspace(0, 1000, 9)) + fig.suptitle( + "Distribution of Document Word Counts by Dominant Topic", fontsize=22 + ) plt.show() # save if folder_path: @@ -71,26 +101,30 @@ def plot_distribution_by_topic(self, df=None, folder_path=None): plt.close() def plot_wordcloud(self, topics=None, folder_path=None): - cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS' - - cloud = WordCloud(stopwords=STOPWORDS, - background_color='white', - width=250, - height=180, - max_words=5, - colormap='tab10', - color_func=lambda *args, **kwargs: cols[i], - prefer_horizontal=1.0) - - fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True) + cols = [ + color for name, color in mcolors.TABLEAU_COLORS.items() + ] # more colors: 'mcolors.XKCD_COLORS' + + cloud = WordCloud( + stopwords=STOPWORDS, + background_color="white", + width=250, + height=180, + max_words=5, + colormap="tab10", + color_func=lambda *args, **kwargs: cols[i], + prefer_horizontal=1.0, + ) + + fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True) for i, ax in enumerate(axes.flatten()): fig.add_subplot(ax) topic_words = dict(topics[i][1]) cloud.generate_from_frequencies(topic_words, max_font_size=300) plt.gca().imshow(cloud) - plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16)) - plt.gca().axis('off') + plt.gca().set_title("Topic " + str(i), fontdict=dict(size=16)) + plt.gca().axis("off") plt.subplots_adjust(wspace=0, hspace=0) plt.axis("off") @@ -141,7 +175,9 @@ def plot_importance(self, topics=None, processed_docs=None, folder_path=None): ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16) ax.tick_params(axis="y", left=False) ax.set_xticklabels( - df.loc[df.topic_id == i, "word"], rotation=30, horizontalalignment="right" + df.loc[df.topic_id == i, "word"], + rotation=30, + horizontalalignment="right", ) ax.legend(loc="upper left") ax_twin.legend(loc="upper right") @@ -154,44 +190,87 @@ def plot_importance(self, topics=None, processed_docs=None, folder_path=None): plt.savefig(folder_path) plt.close() - def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13): corp = corpus[start:end] mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()] - fig, axes = plt.subplots(end-start, 1, figsize=(20, (end-start)*0.95), dpi=160) - axes[0].axis('off') + fig, axes = plt.subplots( + end - start, 1, figsize=(20, (end - start) * 0.95), dpi=160 + ) + axes[0].axis("off") for i, ax in enumerate(axes): if i > 0: - corp_cur = corp[i-1] + corp_cur = corp[i - 1] topic_percs, wordid_topics, wordid_phivalues = lda_model[corp_cur] - word_dominanttopic = [(lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics] - ax.text(0.01, 0.5, "Doc " + str(i-1) + ": ", verticalalignment='center', - fontsize=16, color='black', transform=ax.transAxes, fontweight=700) + word_dominanttopic = [ + (lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics + ] + ax.text( + 0.01, + 0.5, + "Doc " + str(i - 1) + ": ", + verticalalignment="center", + fontsize=16, + color="black", + transform=ax.transAxes, + fontweight=700, + ) # Draw Rectange - topic_percs_sorted = sorted(topic_percs, key=lambda x: (x[1]), reverse=True) - ax.add_patch(Rectangle((0.0, 0.05), 0.99, 0.90, fill=None, alpha=1, - color=mycolors[topic_percs_sorted[0][0]], linewidth=2)) + topic_percs_sorted = sorted( + topic_percs, key=lambda x: (x[1]), reverse=True + ) + ax.add_patch( + Rectangle( + (0.0, 0.05), + 0.99, + 0.90, + fill=None, + alpha=1, + color=mycolors[topic_percs_sorted[0][0]], + linewidth=2, + ) + ) word_pos = 0.06 for j, (word, topics) in enumerate(word_dominanttopic): if j < 14: - ax.text(word_pos, 0.5, word, - horizontalalignment='left', - verticalalignment='center', - fontsize=16, color=mycolors[topics], - transform=ax.transAxes, fontweight=700) - word_pos += .009 * len(word) # to move the word for the next iter - ax.axis('off') - ax.text(word_pos, 0.5, '. . .', - horizontalalignment='left', - verticalalignment='center', - fontsize=16, color='black', - transform=ax.transAxes) + ax.text( + word_pos, + 0.5, + word, + horizontalalignment="left", + verticalalignment="center", + fontsize=16, + color=mycolors[topics], + transform=ax.transAxes, + fontweight=700, + ) + word_pos += 0.009 * len( + word + ) # to move the word for the next iter + ax.axis("off") + ax.text( + word_pos, + 0.5, + ". . .", + horizontalalignment="left", + verticalalignment="center", + fontsize=16, + color="black", + transform=ax.transAxes, + ) plt.subplots_adjust(wspace=0, hspace=0) - plt.suptitle('Sentence Topic Coloring for Documents: ' + str(start) + ' to ' + str(end-2), fontsize=22, y=0.95, fontweight=700) + plt.suptitle( + "Sentence Topic Coloring for Documents: " + + str(start) + + " to " + + str(end - 2), + fontsize=22, + y=0.95, + fontweight=700, + ) plt.tight_layout() plt.show() @@ -211,21 +290,101 @@ def cluster_chart(self, lda_model=None, corpus=None, n_topics=4, folder_path=Non topic_num = np.argmax(arr, axis=1) # tSNE Dimension Reduction - tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca') + tsne_model = TSNE( + n_components=2, verbose=1, random_state=0, angle=0.99, init="pca" + ) tsne_lda = tsne_model.fit_transform(arr) - # Plot plt.figure(figsize=(16, 10), dpi=160) for i in range(n_topics): - plt.scatter(tsne_lda[topic_num == i, 0], tsne_lda[topic_num == i, 1], label=str(i), alpha=0.5) - plt.title('t-SNE Clustering of Topics', fontsize=22) - plt.xlabel('t-SNE Dimension 1', fontsize=16) - plt.ylabel('t-SNE Dimension 2', fontsize=16) - plt.legend(title='Topic Number', loc='upper right') + plt.scatter( + tsne_lda[topic_num == i, 0], + tsne_lda[topic_num == i, 1], + label=str(i), + alpha=0.5, + ) + plt.title("t-SNE Clustering of Topics", fontsize=22) + plt.xlabel("t-SNE Dimension 1", fontsize=16) + plt.ylabel("t-SNE Dimension 2", fontsize=16) + plt.legend(title="Topic Number", loc="upper right") plt.show() # save if folder_path: plt.savefig(folder_path) plt.close() + def most_discussed_topics( + self, lda_model, dominant_topics, topic_percentages, folder_path=None + ): + + # Distribution of Dominant Topics in Each Document + df = pd.DataFrame(dominant_topics, columns=["Document_Id", "Dominant_Topic"]) + dominant_topic_in_each_doc = df.groupby("Dominant_Topic").size() + df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame( + name="count" + ).reset_index() + + # Total Topic Distribution by actual weight + topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages]) + df_topic_weightage_by_doc = ( + topic_weightage_by_doc.sum().to_frame(name="count").reset_index() + ) + + # Top 3 Keywords for each Topic + topic_top3words = [ + (i, topic) + for i, topics in lda_model.show_topics(formatted=False) + for j, (topic, wt) in enumerate(topics) + if j < 3 + ] + + df_top3words_stacked = pd.DataFrame( + topic_top3words, columns=["topic_id", "words"] + ) + df_top3words = df_top3words_stacked.groupby("topic_id").agg(", \n".join) + df_top3words.reset_index(level=0, inplace=True) + + # Plot + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True) + + # Topic Distribution by Dominant Topics + ax1.bar( + x="Dominant_Topic", + height="count", + data=df_dominant_topic_in_each_doc, + width=0.5, + color="firebrick", + ) + ax1.set_xticks( + range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__()) + ) + tick_formatter = FuncFormatter( + lambda x, pos: "Topic " + + str(x) + + "\n" + + df_top3words.loc[df_top3words.topic_id == x, "words"].values[0] + ) + ax1.xaxis.set_major_formatter(tick_formatter) + ax1.set_title("Number of Documents by Dominant Topic", fontdict=dict(size=10)) + ax1.set_ylabel("Number of Documents") + ax1.set_ylim(0, 1000) + + # Topic Distribution by Topic Weights + ax2.bar( + x="index", + height="count", + data=df_topic_weightage_by_doc, + width=0.5, + color="steelblue", + ) + ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__())) + ax2.xaxis.set_major_formatter(tick_formatter) + ax2.set_title("Number of Documents by Topic Weightage", fontdict=dict(size=10)) + + plt.show() + + # save + if folder_path: + plt.savefig(folder_path) + plt.close() diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 17b4c52..6c922a5 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -5,33 +5,41 @@ def corpus_fixture(): from pkg_resources import resource_filename from src.qrmine import ReadData + corpus = ReadData() - file_path = resource_filename('src.qrmine.resources', 'interview.txt') + file_path = resource_filename("src.qrmine.resources", "interview.txt") corpus.read_file(file_path) return corpus + # instannce of Qrmine as fixture @pytest.fixture def q(): from src.qrmine import Qrmine + _q = Qrmine() return _q + @pytest.fixture def cluster(): from src.qrmine import ClusterDocs + _cluster = ClusterDocs() return _cluster + # Ref: https://docs.pytest.org/en/latest/capture.html def test_generate_dict(corpus_fixture, capsys, q): from src.qrmine import Content + num = 10 all_interviews = Content(corpus_fixture.content) q.print_dict(all_interviews, num) captured = capsys.readouterr() print(captured.out) - assert 'code' in captured.out + assert "code" in captured.out + def test_generate_topics(corpus_fixture, capsys, q): q.content = corpus_fixture @@ -39,21 +47,24 @@ def test_generate_topics(corpus_fixture, capsys, q): q.print_topics() captured = capsys.readouterr() print(captured.out) - assert 'TOPIC' in captured.out + assert "TOPIC" in captured.out + def test_category_basket(corpus_fixture, capsys, q): q.content = corpus_fixture print(q.category_basket()) captured = capsys.readouterr() print(captured.out) - assert 'theory' in captured.out + assert "theory" in captured.out + def test_category_association(corpus_fixture, capsys, q): q.content = corpus_fixture print(q.category_association()) captured = capsys.readouterr() print(captured.out) - assert 'theory' in captured.out + assert "theory" in captured.out + def test_cluster_topics(corpus_fixture, capsys, cluster): cluster.documents = corpus_fixture.documents @@ -61,11 +72,11 @@ def test_cluster_topics(corpus_fixture, capsys, cluster): cluster.print_clusters() captured = capsys.readouterr() print(captured.out) - assert 'Document' in captured.out + assert "Document" in captured.out cluster.print_topics() captured = capsys.readouterr() print(captured.out) - assert 'topic' in captured.out + assert "topic" in captured.out print(cluster.build_lda_model()) @@ -81,8 +92,8 @@ def test_cluster_topics(corpus_fixture, capsys, cluster): "Text", ] print(df_dominant_topic.head(10)) - assert 'Document_No' in df_dominant_topic.columns + assert "Document_No" in df_dominant_topic.columns df_sorted = cluster.most_representative_docs() print(df_sorted.head(10)) - assert 'Dominant_Topic' in df_sorted.columns \ No newline at end of file + assert "Dominant_Topic" in df_sorted.columns diff --git a/tests/test_visualize.py b/tests/test_visualize.py index ae35c39..32d5e4e 100644 --- a/tests/test_visualize.py +++ b/tests/test_visualize.py @@ -2,14 +2,17 @@ import pandas as pd from src.qrmine.visualize import QRVisualize + @pytest.fixture def v(): from pkg_resources import resource_filename + file_path = resource_filename("src.qrmine.resources", "df_dominant_topic.csv") data = pd.read_csv(file_path) _v = QRVisualize(data) return _v + @pytest.fixture def topics(): return [ @@ -90,17 +93,22 @@ def topics(): ), ] + def test_frequency_distribution_of_words(v, capsys): - v.plot_frequency_distribution_of_words(v.data, folder_path='/tmp/frequency_distribution.png') + v.plot_frequency_distribution_of_words( + v.data, folder_path="/tmp/frequency_distribution.png" + ) captured = capsys.readouterr() print(captured.out) + def test_distribution_by_topic(v, capsys): - v.plot_distribution_by_topic(v.data, folder_path='/tmp/distribution_by_topic.png') + v.plot_distribution_by_topic(v.data, folder_path="/tmp/distribution_by_topic.png") captured = capsys.readouterr() print(captured.out) + def test_plot_wordcloud(v, topics, capsys): - v.plot_wordcloud(topics, folder_path='/tmp/wordcloud.png') + v.plot_wordcloud(topics, folder_path="/tmp/wordcloud.png") captured = capsys.readouterr() print(captured.out) From 16c2af07dccc87c049cbd047e75879f794a07097 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 22:01:26 +0000 Subject: [PATCH 21/35] chore: update dependencies in requirements and dev-requirements files; upgrade filelock and jinja2 versions --- dev-requirements.txt | 6 +- requirements.txt | 215 ++++++++++++++++++++++--------------------- 2 files changed, 112 insertions(+), 109 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index f36f95c..b5cc0fc 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -42,8 +42,9 @@ docutils==0.21.2 # via # recommonmark # sphinx -filelock==3.16.1 +filelock==3.18.0 # via + # -c requirements.txt # tox # virtualenv idna==3.10 @@ -54,7 +55,7 @@ imagesize==1.4.1 # via sphinx iniconfig==2.0.0 # via pytest -jinja2==3.1.4 +jinja2==3.1.6 # via # -c requirements.txt # sphinx @@ -141,6 +142,5 @@ virtualenv==20.27.1 # via tox wheel==0.45.0 # via - # -c requirements.txt # -r dev-requirements.in # pip-tools diff --git a/requirements.txt b/requirements.txt index 260d413..8326516 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,9 @@ # This file was autogenerated by uv via the following command: # uv pip compile setup.cfg -o requirements.txt --universal -absl-py==2.1.0 - # via - # tensorboard - # tensorflow -astunparse==1.6.3 - # via tensorflow blis==0.7.11 # via thinc cachetools==5.5.0 - # via - # google-auth - # textacy + # via textacy catalogue==2.0.10 # via # spacy @@ -28,7 +20,7 @@ click==8.1.7 # typer cloudpathlib==0.20.0 # via weasel -colorama==0.4.6 ; sys_platform == 'win32' or platform_system == 'Windows' +colorama==0.4.6 ; sys_platform == 'win32' # via # click # tqdm @@ -48,28 +40,16 @@ cymem==2.0.8 # thinc cytoolz==1.0.0 # via textacy -flatbuffers==24.3.25 - # via tensorflow +filelock==3.18.0 + # via torch floret==0.10.5 # via textacy fonttools==4.54.1 # via matplotlib -gast==0.4.0 - # via tensorflow -google-auth==2.36.0 - # via - # google-auth-oauthlib - # tensorboard -google-auth-oauthlib==1.0.0 - # via tensorboard -google-pasta==0.2.0 - # via tensorflow -grpcio==1.67.1 - # via - # tensorboard - # tensorflow -h5py==3.12.1 - # via tensorflow +fsspec==2025.3.2 + # via torch +gensim==4.3.3 + # via qrmine (setup.cfg) idna==3.10 # via requests imbalanced-learn==0.12.4 @@ -77,103 +57,142 @@ imbalanced-learn==0.12.4 jellyfish==1.1.0 # via textacy jinja2==3.1.6 - # via spacy + # via + # spacy + # torch joblib==1.4.2 # via # imbalanced-learn # mlxtend # scikit-learn # textacy -keras==2.13.1 - # via tensorflow kiwisolver==1.4.7 # via matplotlib langcodes==3.4.1 # via spacy language-data==1.2.0 # via langcodes -libclang==18.1.1 - # via tensorflow marisa-trie==1.2.1 # via language-data -markdown==3.7 - # via tensorboard markdown-it-py==3.0.0 # via rich markupsafe==3.0.2 - # via - # jinja2 - # werkzeug + # via jinja2 matplotlib==3.9.2 # via # qrmine (setup.cfg) # mlxtend + # seaborn + # wordcloud mdurl==0.1.2 # via markdown-it-py mlxtend==0.23.2 # via qrmine (setup.cfg) +mpmath==1.3.0 + # via sympy murmurhash==1.0.10 # via # preshed # spacy # thinc networkx==3.4.2 - # via textacy -numpy==1.24.3 + # via + # textacy + # torch +numpy==1.24.3 ; python_full_version < '3.12' # via # blis # contourpy # floret - # h5py + # gensim # imbalanced-learn # matplotlib # mlxtend # pandas # scikit-learn # scipy + # seaborn # spacy - # tensorboard - # tensorflow # textacy # thinc + # wordcloud # xgboost -nvidia-nccl-cu12==2.23.4 ; platform_machine != 'aarch64' and platform_system == 'Linux' - # via xgboost -oauthlib==3.2.2 - # via requests-oauthlib -opt-einsum==3.4.0 - # via tensorflow +numpy==1.26.4 ; python_full_version >= '3.12' + # via + # blis + # contourpy + # floret + # gensim + # imbalanced-learn + # matplotlib + # mlxtend + # pandas + # scikit-learn + # scipy + # seaborn + # spacy + # textacy + # thinc + # wordcloud + # xgboost +nvidia-cublas-cu12==12.6.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.6.80 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-cuda-nvrtc-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-cuda-runtime-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-cudnn-cu12==9.5.1.17 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-cufft-cu12==11.3.0.4 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-cufile-cu12==1.11.1.6 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-curand-cu12==10.3.7.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-cusolver-cu12==11.7.1.2 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-cusparse-cu12==12.5.4.2 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch +nvidia-nccl-cu12==2.26.2 ; platform_machine != 'aarch64' and sys_platform == 'linux' + # via + # torch + # xgboost +nvidia-nvjitlink-cu12==12.6.85 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch packaging==24.2 # via # matplotlib # spacy - # tensorflow # thinc # weasel -pandas==2.1.0 ; python_full_version >= '3.12' - # via - # qrmine (setup.cfg) - # mlxtend -pandas==2.2.3 ; python_full_version < '3.12' +pandas==2.2.3 # via # qrmine (setup.cfg) # mlxtend + # seaborn pillow==11.0.0 - # via matplotlib + # via + # matplotlib + # wordcloud preshed==3.0.9 # via # spacy # thinc -protobuf==4.25.5 - # via - # tensorboard - # tensorflow -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth pydantic==1.10.19 # via # confection @@ -184,6 +203,8 @@ pygments==2.18.0 # via rich pyparsing==3.2.0 # via matplotlib +pypdf==5.4.0 + # via qrmine (setup.cfg) pyphen==0.17.0 # via textacy python-dateutil==2.9.0.post0 @@ -194,48 +215,44 @@ pytz==2024.2 # via pandas requests==2.32.3 # via - # requests-oauthlib + # qrmine (setup.cfg) # spacy - # tensorboard # textacy # vadersentiment # weasel -requests-oauthlib==2.0.0 - # via google-auth-oauthlib rich==13.9.4 # via typer -rsa==4.9 - # via google-auth scikit-learn==1.5.2 # via # qrmine (setup.cfg) # imbalanced-learn # mlxtend # textacy -scipy==1.14.1 +scipy==1.13.1 # via + # gensim # imbalanced-learn # mlxtend # scikit-learn # textacy # xgboost +seaborn==0.13.2 + # via qrmine (setup.cfg) setuptools==75.3.0 # via # marisa-trie # spacy - # tensorboard - # tensorflow # thinc + # torch + # triton shellingham==1.5.4 # via typer six==1.16.0 - # via - # astunparse - # google-pasta - # python-dateutil - # tensorflow + # via python-dateutil smart-open==7.0.5 - # via weasel + # via + # gensim + # weasel spacy==3.7.5 # via # qrmine (setup.cfg) @@ -250,20 +267,8 @@ srsly==2.4.8 # spacy # thinc # weasel -tensorboard==2.13.0 - # via tensorflow -tensorboard-data-server==0.7.2 - # via tensorboard -tensorflow==2.13.1 - # via qrmine (setup.cfg) -tensorflow-estimator==2.13.0 - # via tensorflow -tensorflow-io-gcs-filesystem==0.31.0 - # via - # qrmine (setup.cfg) - # tensorflow -termcolor==2.5.0 - # via tensorflow +sympy==1.14.0 + # via torch textacy==0.13.0 # via qrmine (setup.cfg) thinc==8.2.5 @@ -274,18 +279,22 @@ threadpoolctl==3.5.0 # scikit-learn toolz==1.0.0 # via cytoolz +torch==2.7.0 + # via qrmine (setup.cfg) tqdm==4.67.0 # via # spacy # textacy +triton==3.3.0 ; platform_machine == 'x86_64' and sys_platform == 'linux' + # via torch typer==0.13.0 # via # spacy # weasel -typing-extensions==4.5.0 +typing-extensions==4.13.2 # via # pydantic - # tensorflow + # torch # typer tzdata==2024.2 # via pandas @@ -300,15 +309,9 @@ wasabi==1.1.3 # weasel weasel==0.4.1 # via spacy -werkzeug==3.1.3 - # via tensorboard -wheel==0.45.0 - # via - # astunparse - # tensorboard +wordcloud==1.9.4 + # via qrmine (setup.cfg) wrapt==1.16.0 - # via - # smart-open - # tensorflow + # via smart-open xgboost==2.1.2 # via qrmine (setup.cfg) From bd7e454cde7ce4255a62627cfae97d6b033aa0e7 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 19:28:05 -0500 Subject: [PATCH 22/35] Update pr.yml --- .github/workflows/pr.yml | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index f742724..25df3e7 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -13,27 +13,24 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: ["3.11"] os: [ubuntu-latest, macos-13, windows-latest] runs-on: ${{ matrix.os }} timeout-minutes: 20 steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + - name: Install uv + uses: astral-sh/setup-uv@v5 + - name: "Set up Python" + uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} - cache: 'pip' # caching pip dependencies + python-version-file: "pyproject.toml" - name: run on mac if: startsWith(matrix.os, 'mac') run: | brew install libomp - - name: Install dependencies + - name: Install the project run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + uv sync --locked --all-extras --dev python -m spacy download en_core_web_sm - - name: Test with pytest - run: | - pip install pytest - pytest + - name: Run tests + run: uv run pytest tests From 20d44cb200e5aeba65f447337ca71cded3315f18 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Thu, 1 May 2025 00:31:50 +0000 Subject: [PATCH 23/35] fix: specify exact torch version and remove deprecated setup.cfg; update tox.ini to eliminate redundant dependencies --- pyproject.toml | 18 +++++- setup.cfg | 159 ------------------------------------------------- tox.ini | 6 -- 3 files changed, 15 insertions(+), 168 deletions(-) delete mode 100644 setup.cfg diff --git a/pyproject.toml b/pyproject.toml index 03ef32a..d63ec84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "mlxtend", "spacy", "textacy", - "torch", + "torch==2.2.2", "pypdf", "requests", "gensim", @@ -60,8 +60,6 @@ Documentation = "https://arxiv.org/abs/2003.13519" # Add here additional requirements for extra features, to install with: # `pip install qrmine[PDF]` like: # PDF = ReportLab; RXP -gpu = ["torch[gpu]==2.1.1"] -cpu = ["torch==2.1.1"] # Add here test requirements (semicolon/line-separated) testing = [ "setuptools", @@ -114,6 +112,19 @@ norecursedirs = [ ".tox", ] +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu" }, +] +torchvision = [ + { index = "pytorch-cpu" }, +] + [tool.aliases] release = "sdist bdist_wheel upload" @@ -158,3 +169,4 @@ package = "qrmine" # Read more about the various options under: # https://setuptools.pypa.io/en/latest/userguide/declarative_config.html # https://setuptools.pypa.io/en/latest/references/keywords.html + diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index c85767c..0000000 --- a/setup.cfg +++ /dev/null @@ -1,159 +0,0 @@ -# This file is used to configure your project. -# Read more about the various options under: -# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html -# https://setuptools.pypa.io/en/latest/references/keywords.html - -[metadata] -name = qrmine -description = Qualitative Research support tools in Python! -author = beapen -author_email = github@gulfdoctor.net -license = GPL-3.0-only -# license_files = LICENSE.txt -# long_description = file: README.rst -# long_description_content_type = text/x-rst; charset=UTF-8 -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/dermatologist/nlp-qrmine -# Add here related links, for example: -project_urls = - Documentation = https://arxiv.org/abs/2003.13519 -# Source = https://github.com/pyscaffold/pyscaffold/ -# Changelog = https://pyscaffold.org/en/latest/changelog.html -# Tracker = https://github.com/pyscaffold/pyscaffold/issues -# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold -# Download = https://pypi.org/project/PyScaffold/#files -# Twitter = https://twitter.com/PyScaffold - -# Change if running only on Windows, Mac or Linux (comma-separated) -platforms = any - -# Add here all kinds of additional classifiers as defined under -# https://pypi.org/classifiers/ -classifiers = - Intended Audience :: Science/Research - Development Status :: 4 - Beta - Operating System :: OS Independent - Programming Language :: Python :: 3.11 - Topic :: Scientific/Engineering :: Information Analysis - - -[options] -zip_safe = False -packages = find_namespace: -include_package_data = True -package_dir = - =src - -# Require a min/specific Python version (comma-separated conditions) -# python_requires = >=3.8 - -# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. -# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in -# new major versions. This works if the required packages follow Semantic Versioning. -# For more information, check out https://semver.org/. -install_requires = - importlib-metadata; python_version<"3.8" - pandas - matplotlib - click - scikit-learn - imbalanced-learn - vaderSentiment - xgboost - mlxtend - spacy - textacy - torch - pypdf - requests - gensim - seaborn - wordcloud - -[options.packages.find] -where = src -exclude = - tests - -[options.extras_require] -# Add here additional requirements for extra features, to install with: -# `pip install qrmine[PDF]` like: -# PDF = ReportLab; RXP - gpu = - torch[gpu]==2.1.1 - cpu = - torch==2.1.1 -# Add here test requirements (semicolon/line-separated) -testing = - setuptools - pytest - pytest-cov - -[options.entry_points] -# Add here console scripts like: -# console_scripts = -# script_name = qrmine.module:function -# For example: -# console_scripts = -# fibonacci = qrmine.skeleton:run -# And any other entry points, for example: -# pyscaffold.cli = -# awesome = pyscaffoldext.awesome.extension:AwesomeExtension -console_scripts = - qrmine = qrmine.main:main_routine - -[tool:pytest] -# Specify command line options as you would do when invoking pytest directly. -# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml -# in order to write a coverage file that can be read by Jenkins. -# CAUTION: --cov flags may prohibit setting breakpoints while debugging. -# Comment those flags to avoid this pytest issue. -addopts = - --verbose -norecursedirs = - dist - build - .tox - -[aliases] -release = sdist bdist_wheel upload - -[bdist_wheel] -# Use this option if your package is pure-python -universal = 1 - -[build_sphinx] -source_dir = docs -build_dir = docs/_build - -testpaths = tests -# Use pytest markers to select/deselect specific tests -# markers = -# slow: mark tests as slow (deselect with '-m "not slow"') -# system: mark end-to-end system tests - -[devpi:upload] -# Options for the devpi: PyPI server and packaging tool -# VCS export must be deactivated since we are using setuptools-scm -no_vcs = 1 -formats = bdist_wheel - -[flake8] -# Some sane defaults for the code style checker flake8 -max_line_length = 88 -extend_ignore = E203, W503 -# ^ Black-compatible -# E203 and W503 have edge cases handled by black -exclude = - .tox - build - dist - .eggs - docs/conf.py - -[pyscaffold] -# PyScaffold's parameters when the project was created. -# This will be used when updating. Do not change! -version = 4.6 -package = qrmine diff --git a/tox.ini b/tox.ini index 3eb707d..dbb293d 100644 --- a/tox.ini +++ b/tox.ini @@ -8,9 +8,6 @@ envlist = py311, integration [testenv] setenv = TOXINIDIR = {toxinidir} -deps = - -rrequirements.txt - -rdev-requirements.txt commands = python -m spacy download en_core_web_sm py.test {posargs} @@ -20,9 +17,6 @@ extras = [testenv:integration] setenv = TOXINIDIR = {toxinidir} -deps = - -rrequirements.txt - -rdev-requirements.txt commands = python -m spacy download en_core_web_sm python qrminer.py \ No newline at end of file From 2c6c174c066c4e459d186405023f7f99050b2b43 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 19:35:08 -0500 Subject: [PATCH 24/35] Update pr.yml --- .github/workflows/pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 25df3e7..fd1547a 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -30,7 +30,7 @@ jobs: brew install libomp - name: Install the project run: | - uv sync --locked --all-extras --dev + uv sync --all-extras --dev python -m spacy download en_core_web_sm - name: Run tests run: uv run pytest tests From a551f368d251865112768f0b4062c887b0fbd703 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Thu, 1 May 2025 00:41:21 +0000 Subject: [PATCH 25/35] fix: update python_requires and add missing entry point for qrmine --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d63ec84..e65077e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Information Analysis", ] +python_requires = ">=3.11, <3.12" dependencies = [ 'importlib-metadata; python_version<"3.8"', "pandas", @@ -77,6 +78,7 @@ testing = [ # And any other entry points, for example: # pyscaffold.cli = # awesome = pyscaffoldext.awesome.extension:AwesomeExtension +qrmine = "qrmine.main:main_routine" [project.scripts] qrmine = "qrmine.main:main_routine" From 3efce4f360908cf410e3e38f9e184a73e88a8487 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Thu, 1 May 2025 00:43:06 +0000 Subject: [PATCH 26/35] fix: update entry point syntax for qrmine in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e65077e..a26c85b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ testing = [ # And any other entry points, for example: # pyscaffold.cli = # awesome = pyscaffoldext.awesome.extension:AwesomeExtension -qrmine = "qrmine.main:main_routine" +qrmine = qrmine.main:main_routine [project.scripts] qrmine = "qrmine.main:main_routine" From dd0362258a69dfbc6f94a3f1e3df1c121b7495fe Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Thu, 1 May 2025 00:45:44 +0000 Subject: [PATCH 27/35] fix: correct entry point syntax for console_scripts in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a26c85b..2a18cf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ testing = [ # And any other entry points, for example: # pyscaffold.cli = # awesome = pyscaffoldext.awesome.extension:AwesomeExtension -qrmine = qrmine.main:main_routine +console_scripts = qrmine = qrmine.main:main_routine [project.scripts] qrmine = "qrmine.main:main_routine" From 2081eeaff32749fb1a12914e5f6262e17de73112 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Thu, 1 May 2025 00:47:07 +0000 Subject: [PATCH 28/35] fix: remove incorrect console_scripts entry from pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2a18cf0..666ac4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,6 @@ testing = [ # And any other entry points, for example: # pyscaffold.cli = # awesome = pyscaffoldext.awesome.extension:AwesomeExtension -console_scripts = qrmine = qrmine.main:main_routine [project.scripts] qrmine = "qrmine.main:main_routine" From d4f3553eff8b3600b7fe91206da2b27717222acf Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Thu, 1 May 2025 00:48:39 +0000 Subject: [PATCH 29/35] fix: correct key name for Python version requirement in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 666ac4e..fa7d5ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Information Analysis", ] -python_requires = ">=3.11, <3.12" +requires-python = ">=3.11, <3.12" dependencies = [ 'importlib-metadata; python_version<"3.8"', "pandas", From b56923195f8c34990a2118c8a39886714c86b53b Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 19:52:00 -0500 Subject: [PATCH 30/35] Update pr.yml --- .github/workflows/pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index fd1547a..b7d5f11 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -31,6 +31,6 @@ jobs: - name: Install the project run: | uv sync --all-extras --dev - python -m spacy download en_core_web_sm + uv run python -m spacy download en_core_web_sm - name: Run tests run: uv run pytest tests From 97c80d9ef5fc89a726b36fa9e1c4a1609b20eae5 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Thu, 1 May 2025 00:56:02 +0000 Subject: [PATCH 31/35] fix: add missing dependencies for build system in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fa7d5ed..ffebd89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.2"] +requires = ["setuptools>=61.2", "wheel", "pip"] build-backend = "setuptools.build_meta" [project] From 8100027d1b3c928c1f4c056c8a985bc57d0e0f1a Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 19:58:56 -0500 Subject: [PATCH 32/35] Update pr.yml --- .github/workflows/pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index b7d5f11..082c154 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -31,6 +31,7 @@ jobs: - name: Install the project run: | uv sync --all-extras --dev + uv pip install pip uv run python -m spacy download en_core_web_sm - name: Run tests run: uv run pytest tests From 7eb9df297fe93a83bf079d33e096fb207ff2022c Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 20:03:07 -0500 Subject: [PATCH 33/35] Update pr.yml --- .github/workflows/pr.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 082c154..0b69550 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -20,14 +20,16 @@ jobs: - uses: actions/checkout@v4 - name: Install uv uses: astral-sh/setup-uv@v5 + with: + enable-cache: true - name: "Set up Python" uses: actions/setup-python@v5 with: python-version-file: "pyproject.toml" - - name: run on mac - if: startsWith(matrix.os, 'mac') - run: | - brew install libomp + # - name: run on mac + # if: startsWith(matrix.os, 'mac') + # run: | + # brew install libomp - name: Install the project run: | uv sync --all-extras --dev From 3965bb3e9282a8b445b9cbd5e8dd26b254e1eae1 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Wed, 30 Apr 2025 20:06:54 -0500 Subject: [PATCH 34/35] Update pr.yml --- .github/workflows/pr.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 0b69550..3693dc2 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -26,10 +26,10 @@ jobs: uses: actions/setup-python@v5 with: python-version-file: "pyproject.toml" - # - name: run on mac - # if: startsWith(matrix.os, 'mac') - # run: | - # brew install libomp + - name: run on mac + if: startsWith(matrix.os, 'mac') + run: | + brew install libomp - name: Install the project run: | uv sync --all-extras --dev From b811c6a023735261dfda6aaa7107127c0c6dad13 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Thu, 1 May 2025 01:32:10 +0000 Subject: [PATCH 35/35] refactor: remove unnecessary folder_path arguments from visualization tests --- dev-requirements.in | 11 -- dev-requirements.txt | 146 ------------------ notes/new-process.md | 34 +++++ pyproject.toml | 14 ++ requirements.txt | 317 ---------------------------------------- tests/test_visualize.py | 6 +- 6 files changed, 51 insertions(+), 477 deletions(-) delete mode 100644 dev-requirements.in delete mode 100644 dev-requirements.txt create mode 100644 notes/new-process.md delete mode 100644 requirements.txt diff --git a/dev-requirements.in b/dev-requirements.in deleted file mode 100644 index 2b56355..0000000 --- a/dev-requirements.in +++ /dev/null @@ -1,11 +0,0 @@ -# dev-requirements.in --c requirements.txt -pytest-cov -pytest -recommonmark -sphinx>=3.2.1 -setuptools -setuptools_scm -wheel>=0.37.0 # conflicts with dependency of tensorflow -tox -pip-tools \ No newline at end of file diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index b5cc0fc..0000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,146 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile dev-requirements.in -o dev-requirements.txt --universal -alabaster==1.0.0 - # via sphinx -babel==2.16.0 - # via sphinx -build==1.2.2.post1 - # via pip-tools -cachetools==5.5.0 - # via - # -c requirements.txt - # tox -certifi==2024.8.30 - # via - # -c requirements.txt - # requests -chardet==5.2.0 - # via tox -charset-normalizer==3.4.0 - # via - # -c requirements.txt - # requests -click==8.1.7 - # via - # -c requirements.txt - # pip-tools -colorama==0.4.6 - # via - # -c requirements.txt - # build - # click - # pytest - # sphinx - # tox -commonmark==0.9.1 - # via recommonmark -coverage==7.6.4 - # via pytest-cov -distlib==0.3.9 - # via virtualenv -docutils==0.21.2 - # via - # recommonmark - # sphinx -filelock==3.18.0 - # via - # -c requirements.txt - # tox - # virtualenv -idna==3.10 - # via - # -c requirements.txt - # requests -imagesize==1.4.1 - # via sphinx -iniconfig==2.0.0 - # via pytest -jinja2==3.1.6 - # via - # -c requirements.txt - # sphinx -markupsafe==3.0.2 - # via - # -c requirements.txt - # jinja2 -packaging==24.2 - # via - # -c requirements.txt - # build - # pyproject-api - # pytest - # setuptools-scm - # sphinx - # tox -pip==24.3.1 - # via pip-tools -pip-tools==7.4.1 - # via -r dev-requirements.in -platformdirs==4.3.6 - # via - # tox - # virtualenv -pluggy==1.5.0 - # via - # pytest - # tox -pygments==2.18.0 - # via - # -c requirements.txt - # sphinx -pyproject-api==1.8.0 - # via tox -pyproject-hooks==1.2.0 - # via - # build - # pip-tools -pytest==8.3.3 - # via - # -r dev-requirements.in - # pytest-cov -pytest-cov==6.0.0 - # via -r dev-requirements.in -recommonmark==0.7.1 - # via -r dev-requirements.in -requests==2.32.3 - # via - # -c requirements.txt - # sphinx -setuptools==75.3.0 - # via - # -c requirements.txt - # -r dev-requirements.in - # pip-tools - # setuptools-scm -setuptools-scm==8.1.0 - # via -r dev-requirements.in -snowballstemmer==2.2.0 - # via sphinx -sphinx==8.1.3 - # via - # -r dev-requirements.in - # recommonmark -sphinxcontrib-applehelp==2.0.0 - # via sphinx -sphinxcontrib-devhelp==2.0.0 - # via sphinx -sphinxcontrib-htmlhelp==2.1.0 - # via sphinx -sphinxcontrib-jsmath==1.0.1 - # via sphinx -sphinxcontrib-qthelp==2.0.0 - # via sphinx -sphinxcontrib-serializinghtml==2.0.0 - # via sphinx -tox==4.23.2 - # via -r dev-requirements.in -urllib3==2.2.3 - # via - # -c requirements.txt - # requests -virtualenv==20.27.1 - # via tox -wheel==0.45.0 - # via - # -r dev-requirements.in - # pip-tools diff --git a/notes/new-process.md b/notes/new-process.md new file mode 100644 index 0000000..1ead749 --- /dev/null +++ b/notes/new-process.md @@ -0,0 +1,34 @@ +conda install conda-forge::uv +uv pip install ini2toml +ini2toml setup.cfg -o pyproject.toml + +delete setup.cpg +delete requirements.txt, dev-requirements.txt, dev-requirements.in +remove deps from tox.ini + +uv pip install -e . +see pr.yml for GitHub actions +see pyproject.toml for pytorch cpu install +uv pip install -e . + +uv sync --all-extras --dev +uv pip install pip +uv run python -m spacy download en_core_web_sm + +pyproject.toml +requires = ["setuptools>=61.2", "wheel", "pip"] + +dev = [ + "setuptools", + "setuptools_scm", + "pytest", + "pytest-cov", + "tox", + "black", + "recommonmark", + "sphinx", + "wheel", + "twine", + "tox", +] + diff --git a/pyproject.toml b/pyproject.toml index ffebd89..9fc3688 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,20 @@ testing = [ "pytest-cov", ] +dev = [ + "setuptools", + "setuptools_scm", + "pytest", + "pytest-cov", + "tox", + "black", + "recommonmark", + "sphinx", + "wheel", + "twine", + "tox", +] + [project.entry-points] # Add here console scripts like: # console_scripts = diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 8326516..0000000 --- a/requirements.txt +++ /dev/null @@ -1,317 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile setup.cfg -o requirements.txt --universal -blis==0.7.11 - # via thinc -cachetools==5.5.0 - # via textacy -catalogue==2.0.10 - # via - # spacy - # srsly - # textacy - # thinc -certifi==2024.8.30 - # via requests -charset-normalizer==3.4.0 - # via requests -click==8.1.7 - # via - # qrmine (setup.cfg) - # typer -cloudpathlib==0.20.0 - # via weasel -colorama==0.4.6 ; sys_platform == 'win32' - # via - # click - # tqdm - # wasabi -confection==0.1.5 - # via - # thinc - # weasel -contourpy==1.3.0 - # via matplotlib -cycler==0.12.1 - # via matplotlib -cymem==2.0.8 - # via - # preshed - # spacy - # thinc -cytoolz==1.0.0 - # via textacy -filelock==3.18.0 - # via torch -floret==0.10.5 - # via textacy -fonttools==4.54.1 - # via matplotlib -fsspec==2025.3.2 - # via torch -gensim==4.3.3 - # via qrmine (setup.cfg) -idna==3.10 - # via requests -imbalanced-learn==0.12.4 - # via qrmine (setup.cfg) -jellyfish==1.1.0 - # via textacy -jinja2==3.1.6 - # via - # spacy - # torch -joblib==1.4.2 - # via - # imbalanced-learn - # mlxtend - # scikit-learn - # textacy -kiwisolver==1.4.7 - # via matplotlib -langcodes==3.4.1 - # via spacy -language-data==1.2.0 - # via langcodes -marisa-trie==1.2.1 - # via language-data -markdown-it-py==3.0.0 - # via rich -markupsafe==3.0.2 - # via jinja2 -matplotlib==3.9.2 - # via - # qrmine (setup.cfg) - # mlxtend - # seaborn - # wordcloud -mdurl==0.1.2 - # via markdown-it-py -mlxtend==0.23.2 - # via qrmine (setup.cfg) -mpmath==1.3.0 - # via sympy -murmurhash==1.0.10 - # via - # preshed - # spacy - # thinc -networkx==3.4.2 - # via - # textacy - # torch -numpy==1.24.3 ; python_full_version < '3.12' - # via - # blis - # contourpy - # floret - # gensim - # imbalanced-learn - # matplotlib - # mlxtend - # pandas - # scikit-learn - # scipy - # seaborn - # spacy - # textacy - # thinc - # wordcloud - # xgboost -numpy==1.26.4 ; python_full_version >= '3.12' - # via - # blis - # contourpy - # floret - # gensim - # imbalanced-learn - # matplotlib - # mlxtend - # pandas - # scikit-learn - # scipy - # seaborn - # spacy - # textacy - # thinc - # wordcloud - # xgboost -nvidia-cublas-cu12==12.6.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 - # torch -nvidia-cuda-cupti-cu12==12.6.80 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-cuda-nvrtc-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-cuda-runtime-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-cudnn-cu12==9.5.1.17 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-cufft-cu12==11.3.0.4 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-cufile-cu12==1.11.1.6 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-curand-cu12==10.3.7.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-cusolver-cu12==11.7.1.2 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-cusparse-cu12==12.5.4.2 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via - # nvidia-cusolver-cu12 - # torch -nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -nvidia-nccl-cu12==2.26.2 ; platform_machine != 'aarch64' and sys_platform == 'linux' - # via - # torch - # xgboost -nvidia-nvjitlink-cu12==12.6.85 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 - # torch -nvidia-nvtx-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -packaging==24.2 - # via - # matplotlib - # spacy - # thinc - # weasel -pandas==2.2.3 - # via - # qrmine (setup.cfg) - # mlxtend - # seaborn -pillow==11.0.0 - # via - # matplotlib - # wordcloud -preshed==3.0.9 - # via - # spacy - # thinc -pydantic==1.10.19 - # via - # confection - # spacy - # thinc - # weasel -pygments==2.18.0 - # via rich -pyparsing==3.2.0 - # via matplotlib -pypdf==5.4.0 - # via qrmine (setup.cfg) -pyphen==0.17.0 - # via textacy -python-dateutil==2.9.0.post0 - # via - # matplotlib - # pandas -pytz==2024.2 - # via pandas -requests==2.32.3 - # via - # qrmine (setup.cfg) - # spacy - # textacy - # vadersentiment - # weasel -rich==13.9.4 - # via typer -scikit-learn==1.5.2 - # via - # qrmine (setup.cfg) - # imbalanced-learn - # mlxtend - # textacy -scipy==1.13.1 - # via - # gensim - # imbalanced-learn - # mlxtend - # scikit-learn - # textacy - # xgboost -seaborn==0.13.2 - # via qrmine (setup.cfg) -setuptools==75.3.0 - # via - # marisa-trie - # spacy - # thinc - # torch - # triton -shellingham==1.5.4 - # via typer -six==1.16.0 - # via python-dateutil -smart-open==7.0.5 - # via - # gensim - # weasel -spacy==3.7.5 - # via - # qrmine (setup.cfg) - # textacy -spacy-legacy==3.0.12 - # via spacy -spacy-loggers==1.0.5 - # via spacy -srsly==2.4.8 - # via - # confection - # spacy - # thinc - # weasel -sympy==1.14.0 - # via torch -textacy==0.13.0 - # via qrmine (setup.cfg) -thinc==8.2.5 - # via spacy -threadpoolctl==3.5.0 - # via - # imbalanced-learn - # scikit-learn -toolz==1.0.0 - # via cytoolz -torch==2.7.0 - # via qrmine (setup.cfg) -tqdm==4.67.0 - # via - # spacy - # textacy -triton==3.3.0 ; platform_machine == 'x86_64' and sys_platform == 'linux' - # via torch -typer==0.13.0 - # via - # spacy - # weasel -typing-extensions==4.13.2 - # via - # pydantic - # torch - # typer -tzdata==2024.2 - # via pandas -urllib3==2.2.3 - # via requests -vadersentiment==3.3.2 - # via qrmine (setup.cfg) -wasabi==1.1.3 - # via - # spacy - # thinc - # weasel -weasel==0.4.1 - # via spacy -wordcloud==1.9.4 - # via qrmine (setup.cfg) -wrapt==1.16.0 - # via smart-open -xgboost==2.1.2 - # via qrmine (setup.cfg) diff --git a/tests/test_visualize.py b/tests/test_visualize.py index 32d5e4e..41f7145 100644 --- a/tests/test_visualize.py +++ b/tests/test_visualize.py @@ -96,19 +96,19 @@ def topics(): def test_frequency_distribution_of_words(v, capsys): v.plot_frequency_distribution_of_words( - v.data, folder_path="/tmp/frequency_distribution.png" + v.data ) captured = capsys.readouterr() print(captured.out) def test_distribution_by_topic(v, capsys): - v.plot_distribution_by_topic(v.data, folder_path="/tmp/distribution_by_topic.png") + v.plot_distribution_by_topic(v.data) captured = capsys.readouterr() print(captured.out) def test_plot_wordcloud(v, topics, capsys): - v.plot_wordcloud(topics, folder_path="/tmp/wordcloud.png") + v.plot_wordcloud(topics) captured = capsys.readouterr() print(captured.out)