From ca70b313b7fb0b914bfcd09ea16726081e2cf49d Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Sun, 4 May 2025 20:09:15 -0500 Subject: [PATCH 1/5] Merge branch 'feature/cluster-2' --- .github/workflows/docs.yml | 25 +- .github/workflows/pr.yml | 28 +- .github/workflows/publish.yml | 23 +- .github/workflows/tox.yml | 21 +- .gitignore | 1 + README.md | 58 ++- dev-requirements.in | 11 - dev-requirements.txt | 146 ------- notes/conda.md | 12 + notes/new-process.md | 35 ++ notes/pip-tools.md | 5 +- pyproject.toml | 195 ++++++++- requirements.txt | 314 -------------- setup.cfg | 152 ------- src/qrmine/__init__.py | 2 + src/qrmine/cluster.py | 273 +++++++++++++ src/qrmine/content.py | 92 +++-- src/qrmine/main.py | 282 +++++++++---- src/qrmine/mlqrmine.py | 119 ++++-- src/qrmine/nlp_qrmine.py | 23 +- src/qrmine/readfiles.py | 124 ++++-- src/qrmine/resources/df_dominant_topic.csv | 12 + src/qrmine/utils.py | 40 ++ src/qrmine/visualize.py | 450 +++++++++++++++++++++ test.py | 33 ++ tests/test_nlp.py | 71 +++- tests/test_num.py | 4 +- tests/test_readfiles.py | 4 +- tests/test_visualize.py | 114 ++++++ tox.ini | 71 +++- 30 files changed, 1844 insertions(+), 896 deletions(-) delete mode 100644 dev-requirements.in delete mode 100644 dev-requirements.txt create mode 100644 notes/conda.md create mode 100644 notes/new-process.md delete mode 100644 requirements.txt delete mode 100644 setup.cfg create mode 100644 src/qrmine/cluster.py create mode 100644 src/qrmine/resources/df_dominant_topic.csv create mode 100644 src/qrmine/utils.py create mode 100644 src/qrmine/visualize.py create mode 100644 test.py create mode 100644 tests/test_visualize.py diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 7b25612..be1ae17 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -11,21 +11,26 @@ jobs: timeout-minutes: 15 steps: - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v4 + - name: Install uv + uses: astral-sh/setup-uv@v5 with: - python-version: '3.11' - - name: Install dependencies + enable-cache: true + - name: "Set up Python" + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install the project run: | - python -m pip install --upgrade pip - pip install -r requirements.txt -r dev-requirements.txt - python -m spacy download en_core_web_sm + uv sync --all-extras --dev + uv pip install pip + uv run python -m spacy download en_core_web_sm - name: Create docs run: | - make -C docs/ html + uv run python -m sphinx -b html docs/ docs/_build/html cp docs/_config.yml docs/_build/html/_config.yml - name: Deploy Docs ๐Ÿš€ - uses: JamesIves/github-pages-deploy-action@v4.2.5 + uses: JamesIves/github-pages-deploy-action@v4 with: branch: gh-pages # The branch the action should deploy to. - folder: docs/_build/html # The folder the action should deploy. \ No newline at end of file + folder: docs/_build/html # The folder the action should deploy. 
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index f742724..0a96d34 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -1,4 +1,4 @@ -name: Pytest on PR +name: Pytest using UV on PR on: push: branches: @@ -13,27 +13,27 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: ["3.11"] os: [ubuntu-latest, macos-13, windows-latest] runs-on: ${{ matrix.os }} timeout-minutes: 20 steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + - name: Install uv + uses: astral-sh/setup-uv@v5 with: - python-version: ${{ matrix.python-version }} - cache: 'pip' # caching pip dependencies + enable-cache: true + - name: "Set up Python" + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" - name: run on mac if: startsWith(matrix.os, 'mac') run: | brew install libomp - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - python -m spacy download en_core_web_sm - - name: Test with pytest + - name: Install the project run: | - pip install pytest - pytest + uv sync --all-extras --dev + uv pip install pip + uv run python -m spacy download en_core_web_sm + - name: Run tests + run: uv run pytest tests diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 9018711..6a2859f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -11,20 +11,25 @@ jobs: timeout-minutes: 20 steps: - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5.1.1 + - name: Install uv + uses: astral-sh/setup-uv@v5 with: - python-version: '3.11' - - name: Install dependencies + enable-cache: true + - name: "Set up Python" + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + - name: Install the project run: | - python -m pip install --upgrade pip - pip install -r dev-requirements.txt + uv sync --all-extras --dev + uv pip install pip + uv run python -m spacy download en_core_web_sm - name: Build and publish run: | - python setup.py bdist_wheel + uv run python setup.py bdist_wheel - name: Publish distribution ๐Ÿ“ฆ to PyPI if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml index 2b436ed..cb188de 100644 --- a/.github/workflows/tox.yml +++ b/.github/workflows/tox.yml @@ -17,15 +17,20 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5.1.1 + - name: Install uv + uses: astral-sh/setup-uv@v5 with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies + enable-cache: true + - name: "Set up Python" + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install the project run: | - python -m pip install --upgrade pip - pip install -r dev-requirements.txt -r requirements.txt - python -m spacy download en_core_web_sm + uv sync --all-extras --dev + uv pip install pip + uv run python -m spacy download en_core_web_sm - name: Test with tox run: | - tox \ No newline at end of file + uv run tox diff --git a/.gitignore b/.gitignore index 64049e7..c29a2a9 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ __pycache__/* .idea .venv conda +uv.lock # 
Package files *.egg diff --git a/README.md b/README.md index 62e9d85..95e703a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# :flashlight: QRMine +# ๐Ÿ” QRMine */หˆkรคrmฤซn/* [![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/)[![PyPI download total](https://img.shields.io/pypi/dm/qrmine.svg)](https://pypi.python.org/pypi/qrmine/) @@ -6,11 +6,17 @@ ![GitHub tag (latest by date)](https://img.shields.io/github/v/tag/dermatologist/nlp-qrmine) [![Documentation](https://badgen.net/badge/icon/documentation?icon=libraries&label)](https://dermatologist.github.io/nlp-qrmine/) -QRMine is a suite of qualitative research (QR) data mining tools in Python using Natural Language Processing (NLP) and Machine Learning (ML). QRMine is work in progress. [Read More..](https://nuchange.ca/2017/09/grounded-theory-qualitative-research-python.html) +Qualitative research involves the collection and analysis of textual data, such as interview transcripts, open-ended survey responses, and field notes. It is often used in social sciences, humanities, and health research to explore complex phenomena and understand human experiences. In addition to textual data, qualitative researchers may also collect quantitative data, such as survey responses or demographic information, to complement their qualitative findings. -## What it does +Qualitative research is often characterized by its inductive approach, where researchers aim to generate theories or concepts from the data rather than testing pre-existing hypotheses. This process is known as Grounded Theory, which emphasizes the importance of data-driven analysis and theory development. -### NLP +QRMine is a Python package for qualitative research and triangulation of textual and numeric data in Grounded Theory. It provides tools for Natural Language Processing (NLP) and Machine Learning (ML) to analyze qualitative data, such as interview transcripts, and quantitative data, such as survey responses for theorizing. + +Version 4.0 is a major update with new features and bug fixes. It moves some of the ML dependencies to an optional install. Version 4.0 is a prelude to version 5.0 that will introduce large language models (LLMs) for qualitative research. + +## โœจ Features + +### ๐Ÿ”ง NLP * Lists common categories for open coding. * Create a coding dictionary with categories, properties and dimensions. * Topic modelling. @@ -18,9 +24,11 @@ QRMine is a suite of qualitative research (QR) data mining tools in Python using * Compare two documents/interviews. * Select documents/interviews by sentiment, category or title for further analysis. * Sentiment analysis +* Clusters documents and creates visualizations. +* Generate (non LLM) summary of documents/interviews. 
-### ML +### ๐Ÿง  ML * Accuracy of a neural network model trained using the data * Confusion matrix from an support vector machine classifier * K nearest neighbours of a given record @@ -28,25 +36,29 @@ QRMine is a suite of qualitative research (QR) data mining tools in Python using * Principal Component Analysis (PCA) * Association rules -## How to install +## ๐Ÿ› ๏ธ How to install -* Requires Python 3.11 and a CPU that support AVX instructions +* Requires Python 3.11 ```text -pip install uv -uv pip install qrmine +pip install qrmine python -m spacy download en_core_web_sm ``` +* For ML functions (neural networks & SVM), install the optional packages +```text +pip install qrmine[ml] +``` + ### Mac users * Mac users, please install *libomp* for XGBoost ``` brew install libomp ``` -## How to Use +## ๐Ÿš€ How to Use -* input files are transcripts as txt files and a single csv file with numeric data. The output txt file can be specified. +* Input files are transcripts as txt/pdf files and (optionally) a single csv file with numeric data. The output txt file can be specified. All transcripts can be in a single file separated by a break tag as described below. * The coding dictionary, topics and topic assignments can be created from the entire corpus (all documents) using the respective command line options. @@ -140,33 +152,15 @@ index, obesity, bmi, exercise, income, bp, fbs, has_diabetes ## Author -* [Bell Eapen](https://nuchange.ca) (McMaster U) | [Contact](https://nuchange.ca/contact) | [![Twitter Follow](https://img.shields.io/twitter/follow/beapen?style=social)](https://twitter.com/beapen) +* [Bell Eapen](https://nuchange.ca) ([UIS](https://www.uis.edu/directory/bell-punneliparambil-eapen)) | [Contact](https://nuchange.ca/contact) | [![Twitter Follow](https://img.shields.io/twitter/follow/beapen?style=social)](https://twitter.com/beapen) -* This software is developed and tested using [Compute Canada](http://www.computecanada.ca) resources. -* See also: [:fire: The FHIRForm framework for managing healthcare eForms](https://github.com/E-Health/fhirform) -* See also: [:eyes: Drishti | An mHealth sense-plan-act framework!](https://github.com/E-Health/drishti) ## Citation -Please cite QRMine in your publications if it helped your research. Here -is an example BibTeX entry [(Read paper on arXiv)](https://arxiv.org/abs/2003.13519): - -``` - -@article{eapenbr2019qrmine, - title={QRMine: A python package for triangulation in Grounded Theory}, - author={Eapen, Bell Raj and Archer, Norm and Sartpi, Kamran}, - journal={arXiv preprint arXiv:2003.13519 }, - year={2020} -} - -``` - -QRMine is inspired by [this work](https://github.com/lknelson/computational-grounded-theory) and the associated [paper](https://journals.sagepub.com/doi/abs/10.1177/0049124117729703). +Please cite QRMine in your publications if it helped your research. +Citation information will be available soon. ## Give us a star โญ๏ธ If you find this project useful, give us a star. It helps others discover the project. 
-## Demo -[![QRMine](https://github.com/dermatologist/nlp-qrmine/blob/develop/notes/qrmine.gif)](https://github.com/dermatologist/nlp-qrmine/blob/develop/notes/qrmine.gif) diff --git a/dev-requirements.in b/dev-requirements.in deleted file mode 100644 index 2b56355..0000000 --- a/dev-requirements.in +++ /dev/null @@ -1,11 +0,0 @@ -# dev-requirements.in --c requirements.txt -pytest-cov -pytest -recommonmark -sphinx>=3.2.1 -setuptools -setuptools_scm -wheel>=0.37.0 # conflicts with dependency of tensorflow -tox -pip-tools \ No newline at end of file diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index f36f95c..0000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,146 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile dev-requirements.in -o dev-requirements.txt --universal -alabaster==1.0.0 - # via sphinx -babel==2.16.0 - # via sphinx -build==1.2.2.post1 - # via pip-tools -cachetools==5.5.0 - # via - # -c requirements.txt - # tox -certifi==2024.8.30 - # via - # -c requirements.txt - # requests -chardet==5.2.0 - # via tox -charset-normalizer==3.4.0 - # via - # -c requirements.txt - # requests -click==8.1.7 - # via - # -c requirements.txt - # pip-tools -colorama==0.4.6 - # via - # -c requirements.txt - # build - # click - # pytest - # sphinx - # tox -commonmark==0.9.1 - # via recommonmark -coverage==7.6.4 - # via pytest-cov -distlib==0.3.9 - # via virtualenv -docutils==0.21.2 - # via - # recommonmark - # sphinx -filelock==3.16.1 - # via - # tox - # virtualenv -idna==3.10 - # via - # -c requirements.txt - # requests -imagesize==1.4.1 - # via sphinx -iniconfig==2.0.0 - # via pytest -jinja2==3.1.4 - # via - # -c requirements.txt - # sphinx -markupsafe==3.0.2 - # via - # -c requirements.txt - # jinja2 -packaging==24.2 - # via - # -c requirements.txt - # build - # pyproject-api - # pytest - # setuptools-scm - # sphinx - # tox -pip==24.3.1 - # via pip-tools -pip-tools==7.4.1 - # via -r dev-requirements.in -platformdirs==4.3.6 - # via - # tox - # virtualenv -pluggy==1.5.0 - # via - # pytest - # tox -pygments==2.18.0 - # via - # -c requirements.txt - # sphinx -pyproject-api==1.8.0 - # via tox -pyproject-hooks==1.2.0 - # via - # build - # pip-tools -pytest==8.3.3 - # via - # -r dev-requirements.in - # pytest-cov -pytest-cov==6.0.0 - # via -r dev-requirements.in -recommonmark==0.7.1 - # via -r dev-requirements.in -requests==2.32.3 - # via - # -c requirements.txt - # sphinx -setuptools==75.3.0 - # via - # -c requirements.txt - # -r dev-requirements.in - # pip-tools - # setuptools-scm -setuptools-scm==8.1.0 - # via -r dev-requirements.in -snowballstemmer==2.2.0 - # via sphinx -sphinx==8.1.3 - # via - # -r dev-requirements.in - # recommonmark -sphinxcontrib-applehelp==2.0.0 - # via sphinx -sphinxcontrib-devhelp==2.0.0 - # via sphinx -sphinxcontrib-htmlhelp==2.1.0 - # via sphinx -sphinxcontrib-jsmath==1.0.1 - # via sphinx -sphinxcontrib-qthelp==2.0.0 - # via sphinx -sphinxcontrib-serializinghtml==2.0.0 - # via sphinx -tox==4.23.2 - # via -r dev-requirements.in -urllib3==2.2.3 - # via - # -c requirements.txt - # requests -virtualenv==20.27.1 - # via tox -wheel==0.45.0 - # via - # -c requirements.txt - # -r dev-requirements.in - # pip-tools diff --git a/notes/conda.md b/notes/conda.md new file mode 100644 index 0000000..79eb6c8 --- /dev/null +++ b/notes/conda.md @@ -0,0 +1,12 @@ +conda create --name qrmine python=3.11 +conda activate qrmine + +conda install conda-forge::uv +uv pip install ini2toml +ini2toml setup.cfg -o pyproject.toml 
+uv pip install -e . +python -m spacy download en_core_web_sm + + + +pip3 install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html \ No newline at end of file diff --git a/notes/new-process.md b/notes/new-process.md new file mode 100644 index 0000000..08b6584 --- /dev/null +++ b/notes/new-process.md @@ -0,0 +1,35 @@ +conda install conda-forge::uv +uv pip install ini2toml +ini2toml setup.cfg -o pyproject.toml + +delete setup.cpg +delete requirements.txt, dev-requirements.txt, dev-requirements.in +remove deps from tox.ini + +uv pip install -e . +see pr.yml for GitHub actions +see pyproject.toml for pytorch cpu install +uv pip install -e . + +uv sync --all-extras --dev +uv pip install pip +uv run python -m spacy download en_core_web_sm + +pyproject.toml +=============== +requires = ["setuptools>=61.2", "wheel", "pip"] +license = "GPL-3.0" #This should be a string +dev = [ + "setuptools", + "setuptools_scm", + "pytest", + "pytest-cov", + "tox", + "black", + "recommonmark", + "sphinx", + "wheel", + "twine", + "tox", +] + diff --git a/notes/pip-tools.md b/notes/pip-tools.md index da4baa4..c504a1e 100644 --- a/notes/pip-tools.md +++ b/notes/pip-tools.md @@ -21,4 +21,7 @@ OR * pip install uv * uv pip compile setup.cfg -o requirements.txt --universal -* uv pip compile dev-requirements.in -o dev-requirements.txt --universal \ No newline at end of file +* uv pip compile dev-requirements.in -o dev-requirements.txt --universal + +uv pip install ini2toml +ini2toml setup.cfg -o pyproject.toml \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 89a5bed..ac97fc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,192 @@ [build-system] -# AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD! -requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"] +requires = ["setuptools", "wheel", "pip"] build-backend = "setuptools.build_meta" -[tool.setuptools_scm] -# For smarter version schemes and other configuration options, -# check out https://github.com/pypa/setuptools_scm -version_scheme = "no-guess-dev" +[project] +name = "qrmine" +description = "Qualitative Research support tools in Python!" 
+authors = [{name = "beapen", email = "github@gulfdoctor.net"}] +license = "GPL-3.0" +# license_files = LICENSE.txt +# long_description = file: README.rst +# long_description_content_type = text/x-rst; charset=UTF-8 +classifiers = [ + "Intended Audience :: Science/Research", + "Development Status :: 4 - Beta", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Information Analysis", +] +requires-python = ">=3.11, <3.12" +dependencies = [ + 'importlib-metadata; python_version<"3.8"', + "pandas", + "mlxtend", + "matplotlib", + "click", + "vaderSentiment", + "spacy", + "textacy", + "pypdf", + "requests", + "gensim", + "seaborn", + "wordcloud", + "tabulate", +] +dynamic = ["version"] + +[project.readme] +file = "README.md" +content-type = "text/markdown" +# Add here related links, for example: + +[project.urls] +Homepage = "https://github.com/dermatologist/nlp-qrmine" +Documentation = "https://arxiv.org/abs/2003.13519" +# Source = https://github.com/pyscaffold/pyscaffold/ +# Changelog = https://pyscaffold.org/en/latest/changelog.html +# Tracker = https://github.com/pyscaffold/pyscaffold/issues +# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold +# Download = https://pypi.org/project/PyScaffold/#files +# Twitter = https://twitter.com/PyScaffold +# Change if running only on Windows, Mac or Linux (comma-separated) +# Add here all kinds of additional classifiers as defined under +# https://pypi.org/classifiers/ + +[project.optional-dependencies] +# Add here additional requirements for extra features, to install with: +# `pip install qrmine[PDF]` like: +# PDF = ReportLab; RXP +# Add here test requirements (semicolon/line-separated) +testing = [ + "setuptools", + "pytest", + "pytest-cov", +] + +dev = [ + "setuptools>=77.0.0", + "packaging>=24.2", + "setuptools_scm", + "pytest", + "pytest-cov", + "tox", + "black", + "recommonmark", + "sphinx", + "wheel", + "twine", + "build", +] + +ml = [ + "scikit-learn", + "imbalanced-learn", + "xgboost", + "torch==2.2.2", +] + +[project.entry-points] +# Add here console scripts like: +# console_scripts = +# script_name = qrmine.module:function +# For example: +# console_scripts = +# fibonacci = qrmine.skeleton:run +# And any other entry points, for example: +# pyscaffold.cli = +# awesome = pyscaffoldext.awesome.extension:AwesomeExtension + +[project.scripts] +qrmine = "qrmine.main:main_routine" + +[tool.setuptools] +zip-safe = false +include-package-data = true +package-dir = {"" = "src"} +# Require a min/specific Python version (comma-separated conditions) +# python_requires = >=3.8 +# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. +# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in +# new major versions. This works if the required packages follow Semantic Versioning. +# For more information, check out https://semver.org/. +platforms = ["any"] + + +[tool.setuptools.packages.find] +where = [ "src"] +exclude = [ "tests", "notes", "docs"] +namespaces = true + +[tool.pytest.ini_options] +# Specify command line options as you would do when invoking pytest directly. +# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml +# in order to write a coverage file that can be read by Jenkins. +# CAUTION: --cov flags may prohibit setting breakpoints while debugging. +# Comment those flags to avoid this pytest issue. 
+addopts = """ +--verbose""" +norecursedirs = [ + "dist", + "build", + ".tox", +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu" }, +] +torchvision = [ + { index = "pytorch-cpu" }, +] + +[tool.aliases] +release = "sdist bdist_wheel upload" + +[tool.distutils.bdist_wheel] +# Use this option if your package is pure-python +universal = 0 + +[tool.build_sphinx] +source_dir = "docs" +build_dir = "docs/_build" +testpaths = "tests" +# Use pytest markers to select/deselect specific tests +# markers = +# slow: mark tests as slow (deselect with '-m "not slow"') +# system: mark end-to-end system tests + +[tool.devpi.upload] +# Options for the devpi: PyPI server and packaging tool +# VCS export must be deactivated since we are using setuptools-scm +no_vcs = "1" +formats = "bdist_wheel" + +[tool.flake8] +# Some sane defaults for the code style checker flake8 +max_line_length = "88" +extend_ignore = "E203, W503" +# ^ Black-compatible +# E203 and W503 have edge cases handled by black +exclude = """ +.tox +build +dist +.eggs +docs/conf.py""" + +[tool.pyscaffold] +# PyScaffold's parameters when the project was created. +# This will be used when updating. Do not change! +version = "4.6" +package = "qrmine" +# This file is used to configure your project. +# Read more about the various options under: +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html +# https://setuptools.pypa.io/en/latest/references/keywords.html diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 260d413..0000000 --- a/requirements.txt +++ /dev/null @@ -1,314 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile setup.cfg -o requirements.txt --universal -absl-py==2.1.0 - # via - # tensorboard - # tensorflow -astunparse==1.6.3 - # via tensorflow -blis==0.7.11 - # via thinc -cachetools==5.5.0 - # via - # google-auth - # textacy -catalogue==2.0.10 - # via - # spacy - # srsly - # textacy - # thinc -certifi==2024.8.30 - # via requests -charset-normalizer==3.4.0 - # via requests -click==8.1.7 - # via - # qrmine (setup.cfg) - # typer -cloudpathlib==0.20.0 - # via weasel -colorama==0.4.6 ; sys_platform == 'win32' or platform_system == 'Windows' - # via - # click - # tqdm - # wasabi -confection==0.1.5 - # via - # thinc - # weasel -contourpy==1.3.0 - # via matplotlib -cycler==0.12.1 - # via matplotlib -cymem==2.0.8 - # via - # preshed - # spacy - # thinc -cytoolz==1.0.0 - # via textacy -flatbuffers==24.3.25 - # via tensorflow -floret==0.10.5 - # via textacy -fonttools==4.54.1 - # via matplotlib -gast==0.4.0 - # via tensorflow -google-auth==2.36.0 - # via - # google-auth-oauthlib - # tensorboard -google-auth-oauthlib==1.0.0 - # via tensorboard -google-pasta==0.2.0 - # via tensorflow -grpcio==1.67.1 - # via - # tensorboard - # tensorflow -h5py==3.12.1 - # via tensorflow -idna==3.10 - # via requests -imbalanced-learn==0.12.4 - # via qrmine (setup.cfg) -jellyfish==1.1.0 - # via textacy -jinja2==3.1.6 - # via spacy -joblib==1.4.2 - # via - # imbalanced-learn - # mlxtend - # scikit-learn - # textacy -keras==2.13.1 - # via tensorflow -kiwisolver==1.4.7 - # via matplotlib -langcodes==3.4.1 - # via spacy -language-data==1.2.0 - # via langcodes -libclang==18.1.1 - # via tensorflow -marisa-trie==1.2.1 - # via language-data -markdown==3.7 - # via tensorboard -markdown-it-py==3.0.0 - # via rich -markupsafe==3.0.2 - # via - # jinja2 - # werkzeug -matplotlib==3.9.2 
- # via - # qrmine (setup.cfg) - # mlxtend -mdurl==0.1.2 - # via markdown-it-py -mlxtend==0.23.2 - # via qrmine (setup.cfg) -murmurhash==1.0.10 - # via - # preshed - # spacy - # thinc -networkx==3.4.2 - # via textacy -numpy==1.24.3 - # via - # blis - # contourpy - # floret - # h5py - # imbalanced-learn - # matplotlib - # mlxtend - # pandas - # scikit-learn - # scipy - # spacy - # tensorboard - # tensorflow - # textacy - # thinc - # xgboost -nvidia-nccl-cu12==2.23.4 ; platform_machine != 'aarch64' and platform_system == 'Linux' - # via xgboost -oauthlib==3.2.2 - # via requests-oauthlib -opt-einsum==3.4.0 - # via tensorflow -packaging==24.2 - # via - # matplotlib - # spacy - # tensorflow - # thinc - # weasel -pandas==2.1.0 ; python_full_version >= '3.12' - # via - # qrmine (setup.cfg) - # mlxtend -pandas==2.2.3 ; python_full_version < '3.12' - # via - # qrmine (setup.cfg) - # mlxtend -pillow==11.0.0 - # via matplotlib -preshed==3.0.9 - # via - # spacy - # thinc -protobuf==4.25.5 - # via - # tensorboard - # tensorflow -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pydantic==1.10.19 - # via - # confection - # spacy - # thinc - # weasel -pygments==2.18.0 - # via rich -pyparsing==3.2.0 - # via matplotlib -pyphen==0.17.0 - # via textacy -python-dateutil==2.9.0.post0 - # via - # matplotlib - # pandas -pytz==2024.2 - # via pandas -requests==2.32.3 - # via - # requests-oauthlib - # spacy - # tensorboard - # textacy - # vadersentiment - # weasel -requests-oauthlib==2.0.0 - # via google-auth-oauthlib -rich==13.9.4 - # via typer -rsa==4.9 - # via google-auth -scikit-learn==1.5.2 - # via - # qrmine (setup.cfg) - # imbalanced-learn - # mlxtend - # textacy -scipy==1.14.1 - # via - # imbalanced-learn - # mlxtend - # scikit-learn - # textacy - # xgboost -setuptools==75.3.0 - # via - # marisa-trie - # spacy - # tensorboard - # tensorflow - # thinc -shellingham==1.5.4 - # via typer -six==1.16.0 - # via - # astunparse - # google-pasta - # python-dateutil - # tensorflow -smart-open==7.0.5 - # via weasel -spacy==3.7.5 - # via - # qrmine (setup.cfg) - # textacy -spacy-legacy==3.0.12 - # via spacy -spacy-loggers==1.0.5 - # via spacy -srsly==2.4.8 - # via - # confection - # spacy - # thinc - # weasel -tensorboard==2.13.0 - # via tensorflow -tensorboard-data-server==0.7.2 - # via tensorboard -tensorflow==2.13.1 - # via qrmine (setup.cfg) -tensorflow-estimator==2.13.0 - # via tensorflow -tensorflow-io-gcs-filesystem==0.31.0 - # via - # qrmine (setup.cfg) - # tensorflow -termcolor==2.5.0 - # via tensorflow -textacy==0.13.0 - # via qrmine (setup.cfg) -thinc==8.2.5 - # via spacy -threadpoolctl==3.5.0 - # via - # imbalanced-learn - # scikit-learn -toolz==1.0.0 - # via cytoolz -tqdm==4.67.0 - # via - # spacy - # textacy -typer==0.13.0 - # via - # spacy - # weasel -typing-extensions==4.5.0 - # via - # pydantic - # tensorflow - # typer -tzdata==2024.2 - # via pandas -urllib3==2.2.3 - # via requests -vadersentiment==3.3.2 - # via qrmine (setup.cfg) -wasabi==1.1.3 - # via - # spacy - # thinc - # weasel -weasel==0.4.1 - # via spacy -werkzeug==3.1.3 - # via tensorboard -wheel==0.45.0 - # via - # astunparse - # tensorboard -wrapt==1.16.0 - # via - # smart-open - # tensorflow -xgboost==2.1.2 - # via qrmine (setup.cfg) diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index e6953b9..0000000 --- a/setup.cfg +++ /dev/null @@ -1,152 +0,0 @@ -# This file is used to configure your project. 
-# Read more about the various options under: -# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html -# https://setuptools.pypa.io/en/latest/references/keywords.html - -[metadata] -name = qrmine -description = Qualitative Research support tools in Python! -author = beapen -author_email = github@gulfdoctor.net -license = GPL-3.0-only -# license_files = LICENSE.txt -# long_description = file: README.rst -# long_description_content_type = text/x-rst; charset=UTF-8 -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/dermatologist/nlp-qrmine -# Add here related links, for example: -project_urls = - Documentation = https://arxiv.org/abs/2003.13519 -# Source = https://github.com/pyscaffold/pyscaffold/ -# Changelog = https://pyscaffold.org/en/latest/changelog.html -# Tracker = https://github.com/pyscaffold/pyscaffold/issues -# Conda-Forge = https://anaconda.org/conda-forge/pyscaffold -# Download = https://pypi.org/project/PyScaffold/#files -# Twitter = https://twitter.com/PyScaffold - -# Change if running only on Windows, Mac or Linux (comma-separated) -platforms = any - -# Add here all kinds of additional classifiers as defined under -# https://pypi.org/classifiers/ -classifiers = - Intended Audience :: Science/Research - Development Status :: 4 - Beta - Operating System :: OS Independent - Programming Language :: Python :: 3.11 - Topic :: Scientific/Engineering :: Information Analysis - - -[options] -zip_safe = False -packages = find_namespace: -include_package_data = True -package_dir = - =src - -# Require a min/specific Python version (comma-separated conditions) -# python_requires = >=3.8 - -# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. -# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in -# new major versions. This works if the required packages follow Semantic Versioning. -# For more information, check out https://semver.org/. -install_requires = - importlib-metadata; python_version<"3.8" - pandas - matplotlib - click - scikit-learn - imbalanced-learn - vaderSentiment - xgboost - mlxtend - spacy - textacy - tensorflow<=2.13.1 - tensorflow-io-gcs-filesystem<=0.31.0 - -[options.packages.find] -where = src -exclude = - tests - -[options.extras_require] -# Add here additional requirements for extra features, to install with: -# `pip install qrmine[PDF]` like: -# PDF = ReportLab; RXP - -# Add here test requirements (semicolon/line-separated) -testing = - setuptools - pytest - pytest-cov - -[options.entry_points] -# Add here console scripts like: -# console_scripts = -# script_name = qrmine.module:function -# For example: -# console_scripts = -# fibonacci = qrmine.skeleton:run -# And any other entry points, for example: -# pyscaffold.cli = -# awesome = pyscaffoldext.awesome.extension:AwesomeExtension -console_scripts = - qrmine = qrmine.main:main_routine - -[tool:pytest] -# Specify command line options as you would do when invoking pytest directly. -# e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml -# in order to write a coverage file that can be read by Jenkins. -# CAUTION: --cov flags may prohibit setting breakpoints while debugging. -# Comment those flags to avoid this pytest issue. 
-addopts = - --verbose -norecursedirs = - dist - build - .tox - -[aliases] -release = sdist bdist_wheel upload - -[bdist_wheel] -# Use this option if your package is pure-python -universal = 1 - -[build_sphinx] -source_dir = docs -build_dir = docs/_build - -testpaths = tests -# Use pytest markers to select/deselect specific tests -# markers = -# slow: mark tests as slow (deselect with '-m "not slow"') -# system: mark end-to-end system tests - -[devpi:upload] -# Options for the devpi: PyPI server and packaging tool -# VCS export must be deactivated since we are using setuptools-scm -no_vcs = 1 -formats = bdist_wheel - -[flake8] -# Some sane defaults for the code style checker flake8 -max_line_length = 88 -extend_ignore = E203, W503 -# ^ Black-compatible -# E203 and W503 have edge cases handled by black -exclude = - .tox - build - dist - .eggs - docs/conf.py - -[pyscaffold] -# PyScaffold's parameters when the project was created. -# This will be used when updating. Do not change! -version = 4.6 -package = qrmine diff --git a/src/qrmine/__init__.py b/src/qrmine/__init__.py index 09a4e35..3549721 100644 --- a/src/qrmine/__init__.py +++ b/src/qrmine/__init__.py @@ -6,6 +6,8 @@ from .readfiles import ReadData from .sentiment import Sentiment from .mlqrmine import MLQRMine +from .cluster import ClusterDocs +from .visualize import QRVisualize if sys.version_info[:2] >= (3, 8): # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8` diff --git a/src/qrmine/cluster.py b/src/qrmine/cluster.py new file mode 100644 index 0000000..e67434c --- /dev/null +++ b/src/qrmine/cluster.py @@ -0,0 +1,273 @@ +""" +Copyright (C) 2025 Bell Eapen + +This file is part of qrmine. + +qrmine is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +qrmine is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with qrmine. If not, see . 
+""" + + +import pandas as pd +import numpy as np +from gensim import corpora +from gensim.models.ldamodel import LdaModel +from gensim.models import Word2Vec +from sklearn.manifold import TSNE +from sklearn.cluster import KMeans +from tabulate import tabulate + +from .content import Content + +class ClusterDocs: + + def __init__(self, content: Content, documents = [], titles=[]): + self._content = content + self._documents = documents + self._titles = titles + self._num_topics = 5 + self._passes = 15 + self._dictionary = None + self._corpus = None + self._lda_model = None + self._corpus = None + # Apply preprocessing to each document + self._processed_docs = [self.preprocess(doc) for doc in documents] + self.process() + + @property + def documents(self): + return self._documents + + @property + def titles(self): + return self._titles + + @property + def num_topics(self): + return self._num_topics + + @property + def passes(self): + return self._passes + + @property + def processed_docs(self): + return self._processed_docs + + @property + def lda_model(self): + return self._lda_model + + @property + def corpus(self): + return self._corpus + + @documents.setter + def documents(self, documents): + self._documents = documents + self._processed_docs = [self.preprocess(doc) for doc in documents] + self.process() + + @titles.setter + def titles(self, titles): + self._titles = titles + + @num_topics.setter + def num_topics(self, num_topics): + self._num_topics = num_topics + + @passes.setter + def passes(self, passes): + self._passes = passes + + # Preprocess the documents using spaCy + def preprocess(self, doc): + self._content.content = doc + return self._content.tokens + + def process(self): + # Create a dictionary representation of the documents + self._dictionary = corpora.Dictionary(self._processed_docs) + # Create a bag-of-words representation of the documents + self._corpus = [self._dictionary.doc2bow(doc) for doc in self._processed_docs] + # Build the LDA (Latent Dirichlet Allocation) model + + def build_lda_model(self): + if self._lda_model is None: + self._lda_model = LdaModel( + self._corpus, + num_topics=self._num_topics, + id2word=self._dictionary, + passes=self._passes, + ) + return self._lda_model.show_topics(formatted=False) + + def print_topics(self, num_words=5): + if self._lda_model is None: + self.build_lda_model() + # Print the topics and their corresponding words + # print(self._lda_model.print_topics(num_words=num_words)) + output = self._lda_model.print_topics(num_words=num_words) + """ Output is like: + [(0, '0.116*"category" + 0.093*"comparison" + 0.070*"incident" + 0.060*"theory" + 0.025*"Theory"'), (1, '0.040*"GT" + 0.026*"emerge" + 0.026*"pragmatic" + 0.026*"Barney" + 0.026*"contribution"'), (2, '0.084*"theory" + 0.044*"GT" + 0.044*"evaluation" + 0.024*"structure" + 0.024*"Glaser"'), (3, '0.040*"open" + 0.040*"QRMine" + 0.040*"coding" + 0.040*"category" + 0.027*"researcher"'), (4, '0.073*"coding" + 0.046*"structure" + 0.045*"GT" + 0.042*"Strauss" + 0.038*"Corbin"')] + format this into human readable format as below: + Topic 0: category(0.116), comparison(0.093), incident(0.070), theory(0.060), Theory(0.025) + """ + print("\nTopics: \n") + for topic in output: + topic_num = topic[0] + topic_words = topic[1] + words = [] + for word in topic_words.split("+"): + word = word.split("*") + words.append(f"{word[1].strip()}({word[0].strip()})") + print(f"Topic {topic_num}: {', '.join(words)}") + return output + + def print_clusters(self): + if self._lda_model is None: + 
self.build_lda_model() + # Perform semantic clustering + print("\n Main topic in doc: \n") + + for i, doc in enumerate( + self._processed_docs + ): # Changed from get_processed_docs() to _documents + bow = self._dictionary.doc2bow(doc) + print( + f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}" + ) + + def format_topics_sentences(self, visualize=False): + self.build_lda_model() + # Init output + sent_topics_df = pd.DataFrame() + + # Get main topic in each document + for i, row_list in enumerate(self._lda_model[self._corpus]): + row = row_list[0] if self._lda_model.per_word_topics else row_list + # print(row) + row = sorted(row, key=lambda x: (x[1]), reverse=True) + # Get the Dominant topic, Perc Contribution and Keywords for each document + for j, (topic_num, prop_topic) in enumerate(row): + if j == 0: # => dominant topic + wp = self._lda_model.show_topic(topic_num) + topic_keywords = ", ".join([word for word, prop in wp]) + new_row = pd.DataFrame( + [[self._titles[i], int(topic_num), round(prop_topic, 4), topic_keywords]], + columns=[ + "Title", + "Dominant_Topic", + "Perc_Contribution", + "Topic_Keywords", + ], + ) + sent_topics_df = pd.concat( + [sent_topics_df, new_row], ignore_index=True + ) + else: + break + sent_topics_df.columns = [ + "Title", + "Dominant_Topic", + "Perc_Contribution", + "Topic_Keywords", + ] + + # Add original text to the end of the output + if visualize: + contents = pd.Series(self._processed_docs) + sent_topics_df = pd.concat([sent_topics_df, contents], axis=1) + return sent_topics_df.reset_index(drop=False) + + # https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/ + def most_representative_docs(self): + sent_topics_df = self.format_topics_sentences() + sent_topics_sorteddf_mallet = pd.DataFrame() + sent_topics_outdf_grpd = sent_topics_df.groupby("Dominant_Topic") + + for i, grp in sent_topics_outdf_grpd: + sent_topics_sorteddf_mallet = pd.concat( + [ + sent_topics_sorteddf_mallet, + grp.sort_values(["Perc_Contribution"], ascending=False).head(1), + ], + axis=0, + ) + + return sent_topics_sorteddf_mallet + + def topics_per_document(self, start=0, end=1): + corpus_sel = self._corpus[start:end] + dominant_topics = [] + topic_percentages = [] + for i, corp in enumerate(corpus_sel): + topic_percs = self._lda_model[corp] + dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0] + dominant_topics.append((i, dominant_topic)) + topic_percentages.append(topic_percs) + return (dominant_topics, topic_percentages) + + # Get average embedding vector for each text + + def doc_vectorizer(self, doc, model): + doc_vector = [] + num_words = 0 + for word in doc: + try: + if num_words == 0: + doc_vector = model.wv[word] + else: + doc_vector = np.add(doc_vector, model.wv[word]) + num_words += 1 + except: + # pass if word is not found + pass + + return np.asarray(doc_vector) / num_words + + def vectorizer(self, docs, titles, num_clusters=4, visualize=False): + X = [] + T = [] + model = Word2Vec(docs, min_count=20, vector_size=50) + for index, doc in enumerate(docs): + X.append(self.doc_vectorizer(doc, model)) + T.append(titles[index]) + print('Averaged text w2v representstion:') + print(X[0]) + _X = np.array(X) + print(_X.shape) + tsne = TSNE(n_components=2, random_state=0) + tsne_model = tsne.fit_transform(_X) + # Obtain the prediction + kmeans = KMeans(n_clusters=num_clusters, random_state=0) + y_pred = kmeans.fit(tsne_model).predict(tsne_model) + data = pd.DataFrame( 
+ np.concatenate([tsne_model, y_pred[:, None]], axis=1), + columns=["x", "y", "colour"], + ) + # Add the titles to the DataFrame + data["title"] = T + if not visualize: + print( + tabulate( + data, + headers="keys", + tablefmt="psql", + showindex=False, + numalign="left", + stralign="left", + ) + ) + return data diff --git a/src/qrmine/content.py b/src/qrmine/content.py index 3344a80..c67285f 100644 --- a/src/qrmine/content.py +++ b/src/qrmine/content.py @@ -1,20 +1,20 @@ """ - Copyright (C) 2020 Bell Eapen +Copyright (C) 2020 Bell Eapen - This file is part of qrmine. +This file is part of qrmine. - qrmine is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. +qrmine is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. - qrmine is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. +qrmine is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with qrmine. If not, see . +You should have received a copy of the GNU General Public License +along with qrmine. If not, see . """ import operator @@ -23,10 +23,15 @@ import textacy + class Content(object): - def __init__(self, content): + def __init__(self, content="", title="", lang="en_core_web_sm", max_length=1100000): self._content = content - self._nlp = textacy.load_spacy_lang("en_core_web_sm") + # TODO, Title is not used + self._title = title + self._lang = lang + self._nlp = textacy.load_spacy_lang(lang) + self._nlp.max_length = max_length self._processed = self._nlp(self._content) self._lemma = {} self._pos = {} @@ -43,17 +48,21 @@ def __init__(self, content): def content(self): return self._content + @property + def title(self): + return self._title + @content.setter def content(self, content): self._content = content @property def lemma(self, token): - return self._lemma.get(token, '') + return self._lemma.get(token, "") @property def pos(self, token): - return self._pos.get(token, '') + return self._pos.get(token, "") @property def pos_(self, token): @@ -61,7 +70,7 @@ def pos_(self, token): @property def word(self, token): - return self._word.get(token, '') + return self._word.get(token, "") @property def sentiment(self, token): @@ -69,7 +78,7 @@ def sentiment(self, token): @property def tag(self, token): - return self._tag.get(token, '') + return self._tag.get(token, "") @property def dep(self, token): @@ -87,6 +96,33 @@ def idx(self, token): def doc(self): return self._processed + @property + def tokens(self): + return [ + token.lemma_ + for token in self._processed + if not token.is_stop and not token.is_punct and not token.is_space + ] + + @property + def lang(self): + return self._lang + + @content.setter + def content(self, content): + self._content = content + self._processed = self._nlp(self._content) + self._lemma = {} + self._pos = {} + self._pos_ = {} + self._word = {} 
+ self._sentiment = {} + self._tag = {} + self._dep = {} + self._prob = {} + self._idx = {} + self.process() + def process(self): for token in self._processed: if token.is_stop or token.is_digit or token.is_punct or token.is_space: @@ -114,14 +150,14 @@ def common_words(self, index=10): def common_nouns(self, index=10): _words = {} for key, value in self._word.items(): - if self._pos.get(key, None) == 'NOUN': + if self._pos.get(key, None) == "NOUN": _words[value] = _words.get(value, 0) + 1 return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index] def common_verbs(self, index=10): _words = {} for key, value in self._word.items(): - if self._pos.get(key, None) == 'VERB': + if self._pos.get(key, None) == "VERB": _words[value] = _words.get(value, 0) + 1 return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index] @@ -135,7 +171,9 @@ def sentences_with_common_nouns(self, index=10): for span in self._processed.sents: # go from the start to the end of each span, returning each token in the sentence # combine each token using join() - sent = ''.join(self._processed[i].string for i in range(span.start, span.end)).strip() + sent = "".join( + self._processed[i].string for i in range(span.start, span.end) + ).strip() for noun, freq in _nouns: if noun in sent: sents.append(sent) @@ -151,7 +189,7 @@ def spans_with_common_nouns(self, word): # go from the start to the end of each span, returning each token in the sentence # combine each token using join() for token in span: - if word in self._word.get(token, ' '): + if word in self._word.get(token, " "): spans.append(span) return spans @@ -160,11 +198,11 @@ def dimensions(self, word, index=3): _ad = {} for span in _spans: for token in span: - if self._pos.get(token, None) == 'ADJ': + if self._pos.get(token, None) == "ADJ": _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1 - if self._pos.get(token, None) == 'ADV': + if self._pos.get(token, None) == "ADV": _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1 - if self._pos.get(token, None) == 'VERB': + if self._pos.get(token, None) == "VERB": _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1 return sorted(_ad.items(), key=operator.itemgetter(1), reverse=True)[:index] @@ -173,7 +211,9 @@ def attributes(self, word, index=3): _ad = {} for span in _spans: for token in span: - if self._pos.get(token, None) == 'NOUN' and word not in self._word.get(token, ''): + if self._pos.get(token, None) == "NOUN" and word not in self._word.get( + token, "" + ): _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1 # if self._pos.get(token, None) == 'VERB': # _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1 diff --git a/src/qrmine/main.py b/src/qrmine/main.py index 374c496..c3c6ace 100644 --- a/src/qrmine/main.py +++ b/src/qrmine/main.py @@ -2,6 +2,7 @@ import click import textacy +from tabulate import tabulate from . import Content from . import Network @@ -9,70 +10,201 @@ from . import ReadData from . import Sentiment from . import MLQRMine +from . import ClusterDocs +from .visualize import QRVisualize +from .utils import QRUtils from . 
import __version__ +q = Qrmine() @click.command() -@click.option('--verbose', '-v', is_flag=True, help="Will print verbose messages.") -@click.option('--inp', '-i', multiple=True, - help='Input file in the text format with Topic') -@click.option('--out', '-o', multiple=False, default='', - help='Output file name') -@click.option('--csv', multiple=False, default='', - help='csv file name') -@click.option('--num', '-n', multiple=False, default=3, - help='N (clusters/epochs etc depending on context)') -@click.option('--rec', '-r', multiple=False, default=3, - help='Record (based on context)') -@click.option('--titles', '-t', multiple=True, - help='Document(s) or csv title(s) to analyze/compare') -@click.option('--filters', '-f', multiple=True, - help='Filters to apply') -@click.option('--codedict', is_flag=True, - help='Generate coding dictionary') -@click.option('--topics', is_flag=True, - help='Generate topic model') -@click.option('--assign', is_flag=True, - help='Assign documents to topics') -@click.option('--cat', is_flag=True, - help='List categories of entire corpus or individual docs') -@click.option('--summary', is_flag=True, - help='Generate summary for entire corpus or individual docs') -@click.option('--sentiment', is_flag=True, - help='Generate sentiment score for entire corpus or individual docs') -@click.option('--sentence', is_flag=True, default=False, - help='Generate sentence level scores when applicable') -@click.option('--nlp', is_flag=True, - help='Generate all NLP reports') -@click.option('--nnet', is_flag=True, - help='Display accuracy of a neural network model') -@click.option('--svm', is_flag=True, - help='Display confusion matrix from an svm classifier') -@click.option('--knn', is_flag=True, - help='Display nearest neighbours') -@click.option('--kmeans', is_flag=True, - help='Display KMeans clusters') -@click.option('--cart', is_flag=True, - help='Display Association Rules') -@click.option('--pca', is_flag=True, - help='Display PCA') -def cli(verbose, inp, out, csv, num, rec, titles, filters, codedict, topics, assign, cat, summary, sentiment, sentence, - nlp, nnet, - svm, - knn, kmeans, cart, pca): +@click.option("--verbose", "-v", is_flag=True, help="Will print verbose messages.") +@click.option( + "--covid", "-cf", default="", help="Download COVID narratives from the website" +) +@click.option( + "--inp", + "-i", + multiple=False, + help="Input file in the text format with Topic", +) +@click.option("--out", "-o", multiple=False, default="", help="Output file name") +@click.option("--csv", multiple=False, default="", help="csv file name") +@click.option( + "--num", + "-n", + multiple=False, + default=3, + help="N (clusters/epochs etc depending on context)", +) +@click.option( + "--rec", "-r", multiple=False, default=3, help="Record (based on context)" +) +@click.option( + "--titles", + "-t", + multiple=True, + help="Document(s) or csv title(s) to analyze/compare", +) +@click.option("--filters", "-f", multiple=True, help="Filters to apply") +@click.option("--codedict", is_flag=True, help="Generate coding dictionary") +@click.option("--topics", is_flag=True, help="Generate topic model") +@click.option("--assign", is_flag=True, help="Assign documents to topics") +@click.option( + "--cat", is_flag=True, help="List categories of entire corpus or individual docs" +) +@click.option( + "--summary", + is_flag=True, + help="Generate summary for entire corpus or individual docs", +) +@click.option( + "--sentiment", + is_flag=True, + help="Generate sentiment score for entire 
corpus or individual docs", +) +@click.option( + "--sentence", + is_flag=True, + default=False, + help="Generate sentence level scores when applicable", +) +@click.option("--nlp", is_flag=True, help="Generate all NLP reports") +@click.option("--nnet", is_flag=True, help="Display accuracy of a neural network model") +@click.option( + "--svm", is_flag=True, help="Display confusion matrix from an svm classifier" +) +@click.option("--knn", is_flag=True, help="Display nearest neighbours") +@click.option("--kmeans", is_flag=True, help="Display KMeans clusters") +@click.option("--cart", is_flag=True, help="Display Association Rules") +@click.option("--pca", is_flag=True, help="Display PCA") +@click.option("--visualize", '-v', is_flag=False, help="Visualize words, tpopics or wordcloud. ") +@click.option("--ignore", is_flag=False, help="Comma separated ignore words") +def cli( + verbose, + covid, + inp, + out, + csv, + num, + rec, + titles, + filters, + codedict, + topics, + assign, + cat, + summary, + sentiment, + sentence, + nlp, + nnet, + svm, + knn, + kmeans, + cart, + pca, + visualize, + ignore, +): + if covid: + qr_utils = QRUtils() + qr_utils.read_covid_narratives(covid) + click.echo("COVID narratives downloaded to " + covid) data = ReadData() if inp: - data.read_file(inp) + if ignore: + data.read_file(inp, ignore) + else: + data.read_file(inp) if len(filters) > 0: data = filter_data(inp, filters, sentence, num) if verbose: click.echo("We are in the verbose mode.") if out: - sys.stdout = open(out, 'w') + sys.stdout = open(out, "w") if inp and codedict: generate_dict(data, num) + content = Content(data.content) + cluster = ClusterDocs(content) + cluster.documents = data.documents + cluster.titles = data.titles if inp and topics: - generate_topics(data, assign, num) + # generate_topics(data, assign, num) + click.echo("---------------------------") + cluster.print_topics() + click.echo("---------------------------") + click.echo("Dominant topic and its percentage contribution in each document") + topics = cluster.format_topics_sentences() + click.echo( + tabulate( + topics, + headers="keys", + tablefmt="grid", + showindex="never", + numalign="left", + maxcolwidths=[10, 10, 10, 50], + ) + ) + click.echo("Most representative document for each topic") + most_representative_docs = cluster.most_representative_docs() + click.echo( + tabulate( + most_representative_docs, + headers="keys", + tablefmt="grid", + showindex="never", + numalign="left", + maxcolwidths=[10, 10, 10, 50], + ) + ) + if visualize: + _data = cluster.format_topics_sentences(visualize=True) + _topics = cluster.build_lda_model() + _processed_docs = cluster.processed_docs + _lda_model = cluster.lda_model + _corpus = cluster.corpus + match visualize: + case "wordcloud": + v = QRVisualize(data) + v.plot_wordcloud(topics=_topics, folder_path=out) + case "topics": + v = QRVisualize(_data) + v.plot_distribution_by_topic( + _data, folder_path=out + ) + case "words": + v = QRVisualize(_data) + v.plot_frequency_distribution_of_words(folder_path=out) + case "importance": + v = QRVisualize(_data) + v.plot_importance(topics=_topics, processed_docs=_processed_docs, folder_path=out) + case "sentence": + v = QRVisualize(_data) + v.sentence_chart( + _lda_model, _corpus, folder_path=out + ) + # case "cluster": + # v = QRVisualize(_data) + # if num: + # v.cluster_chart( + # _lda_model, _corpus, num, folder_path=out + # ) + # else: + # v.cluster_chart( + # _lda_model, _corpus, folder_path=out + # ) + case "cluster": + v = QRVisualize(_data) + for doc 
in data.documents: + print(doc+ "\n") + vectors = cluster.vectorizer(data.documents, data.titles, visualize=True) + v.cluster_chart( + vectors, folder_path=out + ) + + + # if inp and assign: # assign_topics(data) if inp and cat: @@ -81,7 +213,9 @@ def cli(verbose, inp, out, csv, num, rec, titles, filters, codedict, topics, ass generate_summary(data, titles) if inp and sentiment: get_sentiment(data, titles, sentence, verbose) - if inp and cart: #python qrminer.py --cart -i src/qrmine/resources/interview.txt -n 10 + if ( + inp and cart + ): # python qrminer.py --cart -i src/qrmine/resources/interview.txt -n 10 get_categories_association(data, num) if inp and nlp: main(inp) @@ -128,20 +262,20 @@ def filter_data(inp, search, sentence, num): filters = [] for s in search: - if s == 'pos': + if s == "pos": for title in data.titles: t = [title] - if get_sentiment(data, t, sentence, False) == 'pos': + if get_sentiment(data, t, sentence, False) == "pos": filters.append(title) - if s == 'neg': + if s == "neg": for title in data.titles: t = [title] - if get_sentiment(data, t, sentence, False) == 'neg': + if get_sentiment(data, t, sentence, False) == "neg": filters.append(title) - if s == 'neu': + if s == "neu": for title in data.titles: t = [title] - if get_sentiment(data, t, sentence, False) == 'neu': + if get_sentiment(data, t, sentence, False) == "neu": filters.append(title) # If search itself is a title if any(s in l for l in data.titles): @@ -173,13 +307,13 @@ def filter_data(inp, search, sentence, num): def generate_dict(data, num): if not num: num = 10 - q = Qrmine() + all_interviews = Content(data.content) q.print_dict(all_interviews, num) def generate_topics(data, assign, num): - q = Qrmine() + q.content = data q.process_content() q.print_topics() @@ -188,25 +322,26 @@ def generate_topics(data, assign, num): # def assign_topics(data): -# q = Qrmine() +# # q.content = data # q.process_content() # q.print_documents() + def get_categories_association(data, num): - q = Qrmine() + q.content = data click.echo(q.category_association(num)) click.echo("Frequent Itemsets") click.echo("---------------------------") + """ Function working at both levels """ def generate_categories(data, tags, num): - q = Qrmine() if len(tags) > 0: ct = 0 @@ -269,7 +404,9 @@ def get_sentiment(data, tags, sentence, verbose): if len(sentence) > 3: sent = s.sentiment_analyzer_scores(sentence.text) if verbose: - click.echo("{:-<40} {}\n".format(sent["sentence"], str(sent["score"]))) + click.echo( + "{:-<40} {}\n".format(sent["sentence"], str(sent["score"])) + ) click.echo(s.sentiment()) else: @@ -280,7 +417,7 @@ def get_sentiment(data, tags, sentence, verbose): return s.sentiment() else: all_interviews = Content(data.content) - doc = textacy.make_spacy_doc(all_interviews.doc) + doc = textacy.make_spacy_doc(all_interviews.doc, lang=all_interviews.lang) ## Sentiment s = Sentiment() @@ -289,7 +426,9 @@ def get_sentiment(data, tags, sentence, verbose): if len(sentence) > 3: sent = s.sentiment_analyzer_scores(sentence.text) if verbose: - click.echo("{:-<40} {}\n".format(sent["sentence"], str(sent["score"]))) + click.echo( + "{:-<40} {}\n".format(sent["sentence"], str(sent["score"])) + ) click.echo(s.sentiment()) else: @@ -309,7 +448,9 @@ def get_nnet(ml, n=3): ml.epochs = n ml.prepare_data(True) # Oversample ml.get_nnet_predictions() - click.echo("\n%s: %.2f%%" % (ml.model.metrics_names[1], ml.get_nnet_scores()[1] * 100)) + click.echo( + "\n%s: %.2f%%" % (ml.model.metrics_names[1], ml.get_nnet_scores()[1] * 100) + ) def 
get_svm(ml):
@@ -348,7 +489,6 @@ def main(input_file):
     data = ReadData()
     data.read_file(input_file)
-    q = Qrmine()
     all_interviews = Content(data.content)
     q.content = data
@@ -367,7 +507,11 @@ def main(input_file):
             x.append(sentence.text)
             sent = s.sentiment_analyzer_scores(sentence.text)
             click.echo("{:-<40} {}\n".format(sent["sentence"], str(sent["score"])))
-            click.echo("{:-<40} {}\n".format(sentence.text, str(s.similarity(sentence.text, "Dummy sentence"))))
+            click.echo(
+                "{:-<40} {}\n".format(
+                    sentence.text, str(s.similarity(sentence.text, "Dummy sentence"))
+                )
+            )
     ## Network
     n = Network()
@@ -389,5 +533,5 @@ def main_routine():
     cli()
 # run the main function
-if __name__ == '__main__':
+if __name__ == "__main__":
     main_routine()
diff --git a/src/qrmine/mlqrmine.py b/src/qrmine/mlqrmine.py
index 12b75a3..a9ef33d 100644
--- a/src/qrmine/mlqrmine.py
+++ b/src/qrmine/mlqrmine.py
@@ -1,21 +1,49 @@
 import numpy
-from imblearn.over_sampling import RandomOverSampler
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense
-from numpy import random, argsort, sqrt, array, ones
 from pandas import read_csv
 from sklearn.cluster import KMeans
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 from sklearn.neighbors import KDTree
 from random import randint
+import logging
+
+logger = logging.getLogger(__name__)
+ML_INSTALLED = False
+
+try:
+    from xgboost import XGBClassifier
+    from mlxtend.frequent_patterns import apriori
+    from mlxtend.frequent_patterns import association_rules
+
+    import torch.nn as nn
+    import torch.optim as optim
+    import torch
+    from torch.utils.data import DataLoader, TensorDataset
+    from imblearn.over_sampling import RandomOverSampler
+
+    ML_INSTALLED = True
+    class NeuralNet(nn.Module):
+        def __init__(self, input_dim):
+            super(NeuralNet, self).__init__()
+            self.fc1 = nn.Linear(input_dim, 12)
+            self.fc2 = nn.Linear(12, 8)
+            self.fc3 = nn.Linear(8, 1)
+            self.relu = nn.ReLU()
+            self.sigmoid = nn.Sigmoid()
+
+        def forward(self, x):
+            x = self.relu(self.fc1(x))
+            x = self.relu(self.fc2(x))
+            x = self.sigmoid(self.fc3(x))
+            return x
+except ImportError:
+    logger.info(
+        "ML dependencies are not installed. Please install them by `pip install qrmine[ml]` to use ML features."
+    )
+
-from xgboost import XGBClassifier
-from mlxtend.frequent_patterns import apriori
-from mlxtend.frequent_patterns import association_rules
 
 
 class MLQRMine(object):
@@ -24,17 +52,18 @@ def __init__(self):
         self._seed = randint(1, 9)
         self._csvfile = ""
         self._titles = None
+        self._model = None
         self._dataset = None
         self._X = None
         self._y = None
         self._X_original = None
         self._y_original = None
         self._dataset_original = None
-        self._model = Sequential()
         self._sc = StandardScaler()
         self._vnum = 0  # Number of variables
-        self._classifier = XGBClassifier()
-        self._epochs = 10
+        if ML_INSTALLED:
+            self._classifier = XGBClassifier()
+        self._epochs = 1
         self._samplesize = 0
         self._clusters = None
@@ -130,7 +159,11 @@ def read_xy(self):
     def oversample(self):
         self._X_original = self._X
         self._y_original = self._y
-        ros = RandomOverSampler(random_state=0)
+        if ML_INSTALLED:
+            ros = RandomOverSampler(random_state=0)
+        else:
+            logger.info("ML dependencies are not installed. 
Please install them by `pip install qrmine[ml]` to use ML features.")
+            raise ImportError("ML dependencies are not installed. Please install them by `pip install qrmine[ml]` to use ML features.")
         X, y = ros.fit_resample(self._X, self._y)
         self._X = X
         self._y = y
@@ -147,22 +180,57 @@ def prepare_data(self, oversample=False):
             self.oversample()
 
     def get_nnet_predictions(self):
-        self._model.add(Dense(12, input_dim=self._vnum, kernel_initializer='uniform', activation='relu'))
-        self._model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
-        self._model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
-        # Compile model
-        self._model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
-        # Fit the model
-        self._model.fit(self._X, self._y, epochs=self._epochs, batch_size=10, verbose=2)
-
-        # calculate predictions
-        predictions = self._model.predict(self._X_original)
-        # round predictions
-        rounded = [round(x[0]) for x in predictions]
+
+        self._model = NeuralNet(self._vnum)
+        criterion = nn.BCELoss()
+        optimizer = optim.Adam(self._model.parameters(), lr=0.001)
+
+        # Convert data to PyTorch tensors
+        X_tensor = torch.tensor(self._X, dtype=torch.float32)
+        y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+
+        # Create a dataset and data loader
+        dataset = TensorDataset(X_tensor, y_tensor)
+        dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+
+        # Train the model
+        for epoch in range(self._epochs):
+            for batch_X, batch_y in dataloader:
+                optimizer.zero_grad()
+                outputs = self._model(batch_X)
+                loss = criterion(outputs, batch_y)
+                loss.backward()
+                optimizer.step()
+
+        # Calculate predictions
+        with torch.no_grad():
+            predictions = self._model(torch.tensor(self._X_original, dtype=torch.float32))
+            rounded = [round(x.item()) for x in predictions]
+        # print("Predictions: ", rounded)
+        # Calculate accuracy
+        correct = sum([1 for i in range(len(rounded)) if rounded[i] == self._y_original[i]])
+        total = len(rounded)
+        accuracy = correct / total
+        print(f'Accuracy: {accuracy * 100:.2f}%')
         return rounded
 
     def get_nnet_scores(self):
-        return self._model.evaluate(self._X, self._y)
+        # evaluate the PyTorch model
+        self._model.eval()
+        X_tensor = torch.tensor(self._X, dtype=torch.float32)
+        y_tensor = torch.tensor(self._y, dtype=torch.float32).view(-1, 1)
+        dataset = TensorDataset(X_tensor, y_tensor)
+        dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+        correct = 0
+        total = 0
+        with torch.no_grad():
+            for batch_X, batch_y in dataloader:
+                outputs = self._model(batch_X)
+                predicted = (outputs > 0.5).float()
+                total += batch_y.size(0)
+                correct += (predicted == batch_y).sum().item()
+        accuracy = correct / total
+        print(f'Accuracy: {accuracy * 100:.2f}%')
 
     def svm_confusion_matrix(self):
         """Generate confusion matrix for SVM
@@ -211,7 +279,9 @@ def get_centroids(self, c=1):
             print("Mean")
             print(self._dataset.iloc[cluster_list, :].mean(axis=0))
-
     """
     TODO: This is not working yet. 
use the ColumnTransformer instead of categorical_features diff --git a/src/qrmine/nlp_qrmine.py b/src/qrmine/nlp_qrmine.py index 44209c9..0d9b019 100644 --- a/src/qrmine/nlp_qrmine.py +++ b/src/qrmine/nlp_qrmine.py @@ -85,7 +85,7 @@ def category_basket(self, num=10): item_basket = [] for index, title in enumerate(self._content.titles): # QRMines content should be set content = self._content.documents[index] - this_record = Content(content) + this_record = Content(content, title) doc = textacy.make_spacy_doc(this_record.doc, lang=self._en) item_basket.append(self.print_categories(doc, num)) return item_basket @@ -115,7 +115,6 @@ def category_association(self, num=10): # 1 0.833333 (theory) # 2 0.666667 (theory, GT) - def unique(self,list1): # insert the list to the set @@ -170,17 +169,21 @@ def print_documents(self, top_n=2): print(self._corpus.docs[doc_idx]._.meta["title"], ':', topics) print("---------------------------\n") - def print_dict(self, content, num=10): + def print_dict(self, content, num=10, top_n=5): output = [] print("\n---Coding Dictionary---") output.append(("CATEGORY", "PROPERTY", "DIMENSION")) words = content.common_verbs(num) + _words = [] + for word, f1 in words: + _words.append(word) for word, f1 in words: - for attribute, f2 in content.attributes(word, 3): - for dimension, f3 in content.dimensions(attribute, 3): - output.append((word, attribute, dimension)) - word = '...' - attribute = '...' + for attribute, f2 in content.attributes(word, top_n): + for dimension, f3 in content.dimensions(attribute, top_n): + if dimension not in _words: + output.append((word, attribute, dimension)) + word = '...' + attribute = '...' self.print_table(output) print("---------------------------\n") @@ -195,7 +198,7 @@ def process_content(self): metadata['title'] = 'Empty' # self._corpus.add_text(textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True), # metadata=metadata) - #doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True) + # doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True) # 2-Jan-2020 textacy new version, breaking change # replace numbers with NUM, remove punct and convert to lower case @@ -216,7 +219,7 @@ def filter_content(self, titles): # self._corpus.add_text( # textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True), # metadata=metadata) - #doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True) + # doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True) # doc_text = preprocessing.replace.replace_numbers(preprocessing.remove.remove_punctuation(document), 'NUM').lower() doc_text = preprocessing.replace.numbers(preprocessing.remove.punctuation(document)).lower() doc = textacy.make_spacy_doc((doc_text, metadata), lang=self._en) diff --git a/src/qrmine/readfiles.py b/src/qrmine/readfiles.py index a460795..187c201 100644 --- a/src/qrmine/readfiles.py +++ b/src/qrmine/readfiles.py @@ -1,4 +1,6 @@ import re +import requests +from pypdf import PdfReader class ReadData(object): @@ -37,48 +39,96 @@ def append(self, title, document): self._documents.append(document) self._content += document - def read_file(self, file_names): - if len(file_names) > 1: - for file_name in file_names: - with open(file_name, 'r') as f: - read_from_file = f.read() - self._content = re.sub('<[^<]+?>', '', read_from_file) - self._documents = re.split('.*?', read_from_file) - # Delete the last blank 
record - del self._documents[-1] - pattern = r"(.*?)" - _title = re.findall(pattern, read_from_file, flags=re.DOTALL)[0] - self._titles.append(_title) - f.close() - else: - file_name = file_names[0] - with open(file_name, 'r') as f: + def read_file(self, input, comma_separated_ignore_words=None): + # if input is a file name + if input.endswith(".txt"): + with open(input, "r") as f: read_from_file = f.read() - self._content = re.sub('<[^<]+?>', '', read_from_file) - self._documents = re.split('.*?', read_from_file) + # remove comma separated ignore words + if comma_separated_ignore_words: + for word in comma_separated_ignore_words.split(","): + read_from_file = re.sub( + r"\b" + word.strip() + r"\b", + "", + read_from_file, + flags=re.IGNORECASE, + ) + self._content = re.sub("<[^<]+?>", "", read_from_file) + self._documents = re.split(".*?", read_from_file) # Delete the last blank record del self._documents[-1] pattern = r"(.*?)" self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL) + # if input is a folder name + elif input.endswith("/"): + import os - """ - Combine duplicate topics using Dict - Currently supported only for single file. - """ + for file_name in os.listdir(input): + if file_name.endswith(".txt"): + with open(os.path.join(input, file_name), "r") as f: + read_from_file = f.read() + # remove comma separated ignore words + if comma_separated_ignore_words: + for word in comma_separated_ignore_words.split(","): + read_from_file = re.sub( + r"\b" + word.strip() + r"\b", + "", + read_from_file, + flags=re.IGNORECASE, + ) + self._content += read_from_file + self._documents.append(read_from_file) + self.titles.append(file_name) + if file_name.endswith(".pdf"): + with open(os.path.join(input, file_name), "rb") as f: + reader = PdfReader(f) + read_from_file = "" + for page in reader.pages: + read_from_file += page.extract_text() + # remove comma separated ignore words + if comma_separated_ignore_words: + for word in comma_separated_ignore_words.split(","): + read_from_file = re.sub( + r"\b" + word.strip() + r"\b", "", read_from_file, flags=re.IGNORECASE, + ) + self._content += read_from_file + self._documents.append(read_from_file) + self.titles.append(file_name) + # if input is a url + elif input.startswith("http://") or input.startswith("https://"): + response = requests.get(input) + if response.status_code == 200: + read_from_file = response.text + # remove comma separated ignore words + if comma_separated_ignore_words: + for word in comma_separated_ignore_words.split(","): + read_from_file = re.sub( + r"\b" + word.strip() + r"\b", + "", + read_from_file, + flags=re.IGNORECASE, + ) + self._content = read_from_file + self._documents.append(read_from_file) + self.titles.append(input) + else: + raise ValueError("Input must be a file name, folder name or url.") - doc_dict = {} - ct3 = 0 - for t in self._titles: - doc = doc_dict.get(t) - if doc: - doc_dict[t] = doc + self._documents[ct3] - else: - doc_dict[t] = self._documents[ct3] - ct3 += 1 - self._titles.clear() - self._documents.clear() - for t in doc_dict.keys(): - self._documents.append(doc_dict.get(t)) - self._titles.append(t) + """ + Combine duplicate topics using Dict + """ - f.close() + doc_dict = {} + ct3 = 0 + for t in self._titles: + doc = doc_dict.get(t) + if doc: + doc_dict[t] = doc + self._documents[ct3] + else: + doc_dict[t] = self._documents[ct3] + ct3 += 1 + self._titles.clear() + self._documents.clear() + for t in doc_dict.keys(): + self._documents.append(doc_dict.get(t)) + self._titles.append(t) diff 
--git a/src/qrmine/resources/df_dominant_topic.csv b/src/qrmine/resources/df_dominant_topic.csv new file mode 100644 index 0000000..115eb63 --- /dev/null +++ b/src/qrmine/resources/df_dominant_topic.csv @@ -0,0 +1,12 @@ +,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text +0,0,4,0.9903,"., GT, Strauss, ,, coding, +, ), Theory, seminal, (","['ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']" +1,1,1,0.7811,",, theory, ., GT, evaluation, structure, coding, +, ), (","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n\n', 'Strauss', 'Corbin', '(', '2', ')', 'recommend', 'strict', 'code', 'structure', 'elaborate', 'code', 'structure', 'datum', '.', 'seminal', 'article', 'Strauss', 'Corbin', 'describe', 'stage', 'coding', ':', 'open', 'coding', ',', 'axial', 'coding', ',', 'selective', 'coding', '.', 'classical', 'Grounded', 'Theory', 'offer', 'flexibility', 'Straussian', 'GT', 'easy', 'conduct', 'especially', 'new', 'researcher', '.', '\n']" +2,2,1,0.9783,",, theory, ., GT, evaluation, structure, coding, +, ), (","['\n', 'Glaser', 'Classical', 'GT', '(', '1', ')', 'provide', 'guideline', 'evaluation', 'GT', 'methodology', '.', 'evaluation', 'base', 'theory', 'fit', 'datum', ',', 'theory', 'understandable', 'non', '-', 'professional', ',', 'theory', 'generalizable', 'situation', ',', 'theory', 'offer', 'control', 'structure', 'process', '.', '\n']" +3,3,3,0.9952,"., ,, coding, category, open, QRMine, datum, researcher, code, GT","['\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 
'feature', 'QRMine', '.', '\n\n', 'open', 'coding', 'step', 'datum', 'break', 'analytically', ',', 'conceptually', 'similar', 'chunk', 'group', 'category', 'subcategorie', '.', 'difference', 'category', 'establish', ',', 'property', 'dimension', 'dissect', '.', 'code', 'GT', 'overwhelming', ',', 'scale', 'category', 'open', 'coding', 'difficult', '.', 'lead', 'generation', 'low', '-', 'level', 'theory', '.', 'natural', 'language', 'processing', ',', 'information', 'system', 'help', 'young', 'researcher', 'sense', 'datum', 'collect', 'stage', 'open', 'coding', '.', 'QRMine', 'software', 'suite', 'support', 'qualitative', 'researcher', 'NLP', '.', ' ', 'QRMine', 'opensource', 'available', '.', 'idea', ',', 'comment', 'pull', 'request', 'welcome', '.', 'jupyter', 'notebook', 'show', 'feature', 'QRMine', '.', '\n']" +4,4,4,0.9793,"., GT, Strauss, ,, coding, +, ), Theory, seminal, (","['\n', 'ground', 'theory', '(', 'GT', ')', 'emerge', 'research', 'methodology', 'medical', 'sociology', 'follow', 'seminal', 'work', 'Barney', 'Glaser', 'Anselm', 'Strauss', '.', ',', 'later', 'develop', 'different', 'view', 'original', 'contribution', 'supporter', 'lead', 'establishment', 'classical', 'Glaserian', 'GT', 'pragmatic', 'straussian', 'Grounded', 'Theory', '.', '\n']" +5,5,2,0.9712,"category, comparison, incident, ,, +, involve, refine, identify, emergence, constant","['\n', 'constant', 'comparison', 'central', 'classical', 'Grounded', 'Theory', ',', 'involve', 'incident', 'incident', 'comparison', 'identify', 'category', ',', 'incident', 'category', 'comparison', 'refine', 'category', 'category', 'category', 'comparison', 'emergence', 'theory', '.', '\n']" diff --git a/src/qrmine/utils.py b/src/qrmine/utils.py new file mode 100644 index 0000000..4d6776f --- /dev/null +++ b/src/qrmine/utils.py @@ -0,0 +1,40 @@ +import re +import requests +import os + + +class QRUtils(object): + def __init__(self): + pass + + @staticmethod + def read_covid_narratives(output_folder): + os.makedirs(output_folder, exist_ok=True) + for doc_count in range(1, 115): + url = f"https://covidstories.omeka.net/items/show/{doc_count}" + html = requests.get(url).text + # Extract ' + # find first match + match = re.search(pattern, html) + if match: + # Extract the URL + file_url = match.group(1) + # sanitize the URL + file_url = file_url.replace("&", "&") + print(f"Downloading file from {file_url}") + # Download the file + response = requests.get(file_url) + # Save the file to the output folder + with open( + os.path.join(output_folder, f"doc_{doc_count}.pdf"), "wb" + ) as f: + f.write(response.content) + else: + print(f"No match found for document {doc_count}") + + +if __name__ == "__main__": + # Example usage + qr_utils = QRUtils() + qr_utils.read_covid_narratives("/tmp/covid_narratives") diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py new file mode 100644 index 0000000..f4e93f2 --- /dev/null +++ b/src/qrmine/visualize.py @@ -0,0 +1,450 @@ +""" +Copyright (C) 2025 Bell Eapen + +This file is part of qrmine. + +qrmine is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +qrmine is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with qrmine. If not, see . +""" + +from collections import Counter + +import matplotlib.colors as mcolors +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from matplotlib.patches import Rectangle +from matplotlib.ticker import FuncFormatter +from sklearn.manifold import TSNE +from wordcloud import STOPWORDS, WordCloud + + +class QRVisualize: + def __init__(self, data: pd.DataFrame = None): + """ + Initialize the QRVisualize class with a DataFrame. + + Parameters: + data (pd.DataFrame): The DataFrame containing the data to visualize. + """ + self.data = data + + def plot_frequency_distribution_of_words(self, df=None, folder_path=None): + if df is None: + df = self.data + doc_lens = [len(d) for d in df.Text] + + # Plot + plt.figure(figsize=(16, 7), dpi=160) + plt.hist(doc_lens, bins=1000, color="navy") + plt.text(750, 100, "Mean : " + str(round(np.mean(doc_lens)))) + plt.text(750, 90, "Median : " + str(round(np.median(doc_lens)))) + plt.text(750, 80, "Stdev : " + str(round(np.std(doc_lens)))) + plt.text(750, 70, "1%ile : " + str(round(np.quantile(doc_lens, q=0.01)))) + plt.text(750, 60, "99%ile : " + str(round(np.quantile(doc_lens, q=0.99)))) + + plt.gca().set( + xlim=(0, 1000), ylabel="Number of Documents", xlabel="Document Word Count" + ) + plt.tick_params(size=16) + plt.xticks(np.linspace(0, 1000, 9)) + plt.title("Distribution of Document Word Counts", fontdict=dict(size=22)) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def plot_distribution_by_topic(self, df=None, folder_path=None): + if df is None: + df = self.data + # Plot + cols = [ + color for name, color in mcolors.TABLEAU_COLORS.items() + ] # more colors: 'mcolors.XKCD_COLORS' + + fig, axes = plt.subplots( + 2, 2, figsize=(16, 14), dpi=160, sharex=True, sharey=True + ) + + for i, ax in enumerate(axes.flatten()): + df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :] + doc_lens = [len(d) for d in df_dominant_topic_sub.Text] + ax.hist(doc_lens, bins=1000, color=cols[i]) + ax.tick_params(axis="y", labelcolor=cols[i], color=cols[i]) + sns.kdeplot( + doc_lens, color="black", fill=False, ax=ax.twinx(), warn_singular=False + ) + ax.set(xlim=(0, 1000), xlabel="Document Word Count") + ax.set_ylabel("Number of Documents", color=cols[i]) + ax.set_title("Topic: " + str(i), fontdict=dict(size=16, color=cols[i])) + + fig.tight_layout() + fig.subplots_adjust(top=0.90) + plt.xticks(np.linspace(0, 1000, 9)) + fig.suptitle( + "Distribution of Document Word Counts by Dominant Topic", fontsize=22 + ) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def plot_wordcloud(self, topics=None, folder_path=None): + cols = [ + color for name, color in mcolors.TABLEAU_COLORS.items() + ] # more colors: 'mcolors.XKCD_COLORS' + + cloud = WordCloud( + stopwords=STOPWORDS, + background_color="white", + width=250, + height=180, + max_words=5, + colormap="tab10", + color_func=lambda *args, **kwargs: cols[i], + prefer_horizontal=1.0, + ) + + fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True) + + for i, ax in enumerate(axes.flatten()): + fig.add_subplot(ax) + topic_words = dict(topics[i][1]) + cloud.generate_from_frequencies(topic_words, max_font_size=300) + plt.gca().imshow(cloud) + plt.gca().set_title("Topic " + str(i), fontdict=dict(size=16)) + plt.gca().axis("off") + + plt.subplots_adjust(wspace=0, hspace=0) + plt.axis("off") + 
plt.margins(x=0, y=0) + plt.tight_layout() + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def plot_importance(self, topics=None, processed_docs=None, folder_path=None): + data_flat = [w for w_list in processed_docs for w in w_list] + counter = Counter(data_flat) + + out = [] + for i, topic in topics: + for word, weight in topic: + out.append([word, i, weight, counter[word]]) + + df = pd.DataFrame(out, columns=["word", "topic_id", "importance", "word_count"]) + + # Plot Word Count and Weights of Topic Keywords + fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160) + cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] + for i, ax in enumerate(axes.flatten()): + ax.bar( + x="word", + height="word_count", + data=df.loc[df.topic_id == i, :], + color=cols[i], + width=0.5, + alpha=0.3, + label="Word Count", + ) + ax_twin = ax.twinx() + ax_twin.bar( + x="word", + height="importance", + data=df.loc[df.topic_id == i, :], + color=cols[i], + width=0.2, + label="Weights", + ) + ax.set_ylabel("Word Count", color=cols[i]) + ax_twin.set_ylim(0, 0.030) + ax.set_ylim(0, 3500) + ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16) + ax.tick_params(axis="y", left=False) + ax.set_xticklabels( + df.loc[df.topic_id == i, "word"], + rotation=30, + horizontalalignment="right", + ) + ax.legend(loc="upper left") + ax_twin.legend(loc="upper right") + + fig.tight_layout(w_pad=2) + fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05) + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13, folder_path=None): + if lda_model is None: + raise ValueError("LDA model is not provided.") + corp = corpus[start:end] + mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()] + + fig, axes = plt.subplots( + end - start, 1, figsize=(20, (end - start) * 0.95), dpi=160 + ) + axes[0].axis("off") + for i, ax in enumerate(axes): + try: + if i > 0: + corp_cur = corp[i - 1] + topic_percs, wordid_topics, _ = lda_model[corp_cur] + word_dominanttopic = [ + (lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics + ] + ax.text( + 0.01, + 0.5, + "Doc " + str(i - 1) + ": ", + verticalalignment="center", + fontsize=16, + color="black", + transform=ax.transAxes, + fontweight=700, + ) + + # Draw Rectange + topic_percs_sorted = sorted( + topic_percs, key=lambda x: (x[1]), reverse=True + ) + ax.add_patch( + Rectangle( + (0.0, 0.05), + 0.99, + 0.90, + fill=None, + alpha=1, + color=mycolors[topic_percs_sorted[0][0]], + linewidth=2, + ) + ) + + word_pos = 0.06 + for j, (word, topics) in enumerate(word_dominanttopic): + if j < 14: + ax.text( + word_pos, + 0.5, + word, + horizontalalignment="left", + verticalalignment="center", + fontsize=16, + color=mycolors[topics], + transform=ax.transAxes, + fontweight=700, + ) + word_pos += 0.009 * len( + word + ) # to move the word for the next iter + ax.axis("off") + ax.text( + word_pos, + 0.5, + ". . 
.", + horizontalalignment="left", + verticalalignment="center", + fontsize=16, + color="black", + transform=ax.transAxes, + ) + except: + continue + + plt.subplots_adjust(wspace=0, hspace=0) + plt.suptitle( + "Sentence Topic Coloring for Documents: " + + str(start) + + " to " + + str(end - 2), + fontsize=22, + y=0.95, + fontweight=700, + ) + plt.tight_layout() + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def _cluster_chart(self, lda_model=None, corpus=None, n_topics=3, folder_path=None): + # Get topic weights + topic_weights = [] + for i, row_list in enumerate(lda_model[corpus]): + topic_weights.append([w for i, w in row_list[0]]) + + # Array of topic weights + arr = pd.DataFrame(topic_weights).fillna(0).values + + # Keep the well separated points (optional) + arr = arr[np.amax(arr, axis=1) > 0.35] + + # Dominant topic number in each doc + topic_num = np.argmax(arr, axis=1) + + # tSNE Dimension Reduction + tsne_model = TSNE( + n_components=2, verbose=1, random_state=0, angle=0.99, init="pca" + ) + tsne_lda = tsne_model.fit_transform(arr) + + # Plot + plt.figure(figsize=(16, 10), dpi=160) + for i in range(n_topics): + plt.scatter( + tsne_lda[topic_num == i, 0], + tsne_lda[topic_num == i, 1], + label=str(i), + alpha=0.5, + ) + plt.title("t-SNE Clustering of Topics", fontsize=22) + plt.xlabel("t-SNE Dimension 1", fontsize=16) + plt.ylabel("t-SNE Dimension 2", fontsize=16) + plt.legend(title="Topic Number", loc="upper right") + plt.show() + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def most_discussed_topics( + self, lda_model, dominant_topics, topic_percentages, folder_path=None + ): + + # Distribution of Dominant Topics in Each Document + df = pd.DataFrame(dominant_topics, columns=["Document_Id", "Dominant_Topic"]) + dominant_topic_in_each_doc = df.groupby("Dominant_Topic").size() + df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame( + name="count" + ).reset_index() + + # Total Topic Distribution by actual weight + topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages]) + df_topic_weightage_by_doc = ( + topic_weightage_by_doc.sum().to_frame(name="count").reset_index() + ) + + # Top 3 Keywords for each Topic + topic_top3words = [ + (i, topic) + for i, topics in lda_model.show_topics(formatted=False) + for j, (topic, wt) in enumerate(topics) + if j < 3 + ] + + df_top3words_stacked = pd.DataFrame( + topic_top3words, columns=["topic_id", "words"] + ) + df_top3words = df_top3words_stacked.groupby("topic_id").agg(", \n".join) + df_top3words.reset_index(level=0, inplace=True) + + # Plot + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True) + + # Topic Distribution by Dominant Topics + ax1.bar( + x="Dominant_Topic", + height="count", + data=df_dominant_topic_in_each_doc, + width=0.5, + color="firebrick", + ) + ax1.set_xticks( + range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__()) + ) + tick_formatter = FuncFormatter( + lambda x, pos: "Topic " + + str(x) + + "\n" + + df_top3words.loc[df_top3words.topic_id == x, "words"].values[0] + ) + ax1.xaxis.set_major_formatter(tick_formatter) + ax1.set_title("Number of Documents by Dominant Topic", fontdict=dict(size=10)) + ax1.set_ylabel("Number of Documents") + ax1.set_ylim(0, 1000) + + # Topic Distribution by Topic Weights + ax2.bar( + x="index", + height="count", + data=df_topic_weightage_by_doc, + width=0.5, + color="steelblue", + ) + ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__())) + 
ax2.xaxis.set_major_formatter(tick_formatter) + ax2.set_title("Number of Documents by Topic Weightage", fontdict=dict(size=10)) + + plt.show() + + # save + if folder_path: + plt.savefig(folder_path) + plt.close() + + def update_annot(self, ind): + norm = plt.Normalize(1,4) + cmap = plt.cm.RdYlGn + pos = self.sc.get_offsets()[ind["ind"][0]] + self.annot.xy = pos + text = "{}, {}".format( + " ".join(list(map(str, ind["ind"]))), " ".join([self.names[n] for n in ind["ind"]]) + ) + self.annot.set_text(text) + self.annot.get_bbox_patch().set_facecolor(cmap(norm(c[ind["ind"][0]]))) + self.annot.get_bbox_patch().set_alpha(0.4) + + def hover(self, event): + vis = self.annot.get_visible() + if event.inaxes == self.ax: + cont, ind = self.sc.contains(event) + if cont: + self.update_annot(ind) + self.annot.set_visible(True) + self.fig.canvas.draw_idle() + else: + if vis: + self.annot.set_visible(False) + self.fig.canvas.draw_idle() + + # https://stackoverflow.com/questions/7908636/how-to-add-hovering-annotations-to-a-plot + def cluster_chart (self, data, folder_path=None): + # Scatter plot for Text Cluster Prediction + plt.figure(figsize=(6, 6)) + self.fig, self.ax = plt.subplots() + self.names = data['title'] + self.sc = plt.scatter(data['x'], data['y'], c=data['colour'], s=36, edgecolors='black', linewidths=0.75) + self.annot = self.ax.annotate("", xy=(0,0), xytext=(20,20),textcoords="offset points", + bbox=dict(boxstyle="round", fc="w"), + arrowprops=dict(arrowstyle="->")) + self.annot.set_visible(False) + plt.title('Text Cluster Prediction') + plt.axis('off') # Optional: Remove axes for a cleaner look + plt.colorbar(self.sc, label='Colour') # Add colorbar if needed + self.fig.canvas.mpl_connect("motion_notify_event", self.hover) + plt.show() + # save + if folder_path: + # annotate with data['title'] + for i, txt in enumerate(data['title']): + plt.annotate(txt, (data['x'][i], data['y'][i]), fontsize=8, ha='right', va='bottom') + plt.savefig(folder_path) + plt.close() diff --git a/test.py b/test.py new file mode 100644 index 0000000..a5c4b31 --- /dev/null +++ b/test.py @@ -0,0 +1,33 @@ +import spacy + +# Load spaCy model +nlp = spacy.load("en_core_web_sm") + +# Sample documents +documents = [ + "Natural language processing is a field of AI.", + "Topic modeling helps in uncovering the main themes in a collection of documents.", + "Semantic clustering groups similar documents together based on meaning.", + "SpaCy is a popular NLP library.", + "Gensim is commonly used for topic modeling.", +] + + +# Preprocess the documents using spaCy +def preprocess(doc): + # Tokenize and preprocess each document + doc = nlp(doc) + print(f"Original Document: {doc}") + # Lemmatize and remove stop words + tokens = [token.lemma_ for token in doc if not token.is_stop] + print(f"Processed Tokens: {tokens}") + return tokens + + +# Apply preprocessing to each document +processed_docs = [preprocess(doc) for doc in documents] + + +# Print the processed documents +for i, doc in enumerate(processed_docs): + print(f"Document {i + 1}: {doc}") \ No newline at end of file diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 4ad331d..2c78676 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -1,32 +1,52 @@ import pytest - @pytest.fixture def corpus_fixture(): from pkg_resources import resource_filename from src.qrmine import ReadData + corpus = ReadData() - file_path = resource_filename('src.qrmine.resources', 'interview.txt') - corpus.read_file([file_path]) + file_path = resource_filename("src.qrmine.resources", 
"interview.txt") + corpus.read_file(file_path) return corpus +@pytest.fixture +def content(): + from src.qrmine import Content + + _content = Content() + return _content + + # instannce of Qrmine as fixture @pytest.fixture def q(): from src.qrmine import Qrmine + _q = Qrmine() return _q + +@pytest.fixture +def cluster(content): + from src.qrmine import ClusterDocs + + _cluster = ClusterDocs(content) + return _cluster + + # Ref: https://docs.pytest.org/en/latest/capture.html def test_generate_dict(corpus_fixture, capsys, q): from src.qrmine import Content + num = 10 - all_interviews = Content(corpus_fixture.content) + all_interviews = Content(corpus_fixture.content, corpus_fixture.titles) q.print_dict(all_interviews, num) captured = capsys.readouterr() print(captured.out) - assert 'code' in captured.out + assert "code" in captured.out + def test_generate_topics(corpus_fixture, capsys, q): q.content = corpus_fixture @@ -34,22 +54,57 @@ def test_generate_topics(corpus_fixture, capsys, q): q.print_topics() captured = capsys.readouterr() print(captured.out) - assert 'TOPIC' in captured.out + assert "TOPIC" in captured.out + def test_category_basket(corpus_fixture, capsys, q): q.content = corpus_fixture print(q.category_basket()) captured = capsys.readouterr() print(captured.out) - assert 'theory' in captured.out + assert "theory" in captured.out + def test_category_association(corpus_fixture, capsys, q): q.content = corpus_fixture print(q.category_association()) captured = capsys.readouterr() print(captured.out) - assert 'theory' in captured.out + assert "theory" in captured.out + + +def test_cluster_topics(corpus_fixture, capsys, cluster): + cluster.documents = corpus_fixture.documents + cluster.titles = corpus_fixture.titles + + cluster.print_topics() + captured = capsys.readouterr() + print(captured.out) + assert "Topic" in captured.out + cluster.print_clusters() + captured = capsys.readouterr() + print(captured.out) + assert "Document" in captured.out + print("LDA Model") + print(cluster.build_lda_model()) + print("LDA Model Topics") + print(cluster.topics_per_document()) + # Format + df_dominant_topic = cluster.format_topics_sentences() + # Format the output + df_dominant_topic.columns = [ + "Document_No", + "Dominant_Topic", + "Topic_Perc_Contrib", + "Keywords", + "Text", + ] + print(df_dominant_topic.head(10)) + assert "Document_No" in df_dominant_topic.columns + df_sorted = cluster.most_representative_docs() + print(df_sorted.head(10)) + assert "Dominant_Topic" in df_sorted.columns diff --git a/tests/test_num.py b/tests/test_num.py index f0c53cd..ac7a139 100644 --- a/tests/test_num.py +++ b/tests/test_num.py @@ -9,7 +9,7 @@ def ml_fixture(): ml = MLQRMine() file_path = resource_filename('src.qrmine.resources', 'numeric.csv') ml.csvfile = file_path - return ml + return ml @@ -19,7 +19,7 @@ def test_nn(ml_fixture, capsys): ml_fixture.prepare_data(True) ml_fixture.get_nnet_predictions() captured = capsys.readouterr() - assert 'accuracy' in captured.out + assert 'Accuracy' in captured.out def test_svm(ml_fixture, capsys): ml_fixture.prepare_data(True) diff --git a/tests/test_readfiles.py b/tests/test_readfiles.py index aff3a5d..963ed90 100644 --- a/tests/test_readfiles.py +++ b/tests/test_readfiles.py @@ -8,8 +8,8 @@ def corpus_fixture(): from src.qrmine import ReadData corpus = ReadData() file_path = resource_filename('src.qrmine.resources', 'interview.txt') - corpus.read_file([file_path]) - return corpus + corpus.read_file(file_path) + return corpus def test_content(corpus_fixture): diff 
--git a/tests/test_visualize.py b/tests/test_visualize.py new file mode 100644 index 0000000..41f7145 --- /dev/null +++ b/tests/test_visualize.py @@ -0,0 +1,114 @@ +import pytest +import pandas as pd +from src.qrmine.visualize import QRVisualize + + +@pytest.fixture +def v(): + from pkg_resources import resource_filename + + file_path = resource_filename("src.qrmine.resources", "df_dominant_topic.csv") + data = pd.read_csv(file_path) + _v = QRVisualize(data) + return _v + + +@pytest.fixture +def topics(): + return [ + ( + 0, + [ + (".", 0.095292516), + (",", 0.053392828), + ("category", 0.032462463), + ("coding", 0.032456465), + ("open", 0.032437164), + ("QRMine", 0.03243305), + ("datum", 0.021980358), + ("researcher", 0.021978099), + ("theory", 0.011536299), + ("GT", 0.011533132), + ], + ), + ( + 1, + [ + (".", 0.007783216), + (",", 0.007773952), + ("open", 0.007728422), + ("researcher", 0.0077227736), + ("coding", 0.007722049), + ("category", 0.007721938), + ("datum", 0.007717547), + ("QRMine", 0.007716193), + ("dissect", 0.0077070068), + ("support", 0.0077060354), + ], + ), + ( + 2, + [ + (",", 0.05126711), + (".", 0.05125151), + ("theory", 0.038604487), + ("category", 0.03227912), + ("GT", 0.032278605), + ("\n", 0.029119665), + ("comparison", 0.025947908), + ("coding", 0.025941858), + ("incident", 0.019622542), + (")", 0.019619444), + ], + ), + ( + 3, + [ + (".", 0.007849805), + (",", 0.007837688), + ("theory", 0.00781459), + ("coding", 0.0078089647), + ("category", 0.0077514737), + ("GT", 0.0077493717), + ("datum", 0.007742789), + ("open", 0.0077355755), + ("\n", 0.0077245855), + ("researcher", 0.0077191954), + ], + ), + ( + 4, + [ + (",", 0.007834569), + (".", 0.007812336), + ("coding", 0.0077863215), + ("category", 0.007759207), + ("theory", 0.0077459146), + ("GT", 0.0077370973), + ("code", 0.0077265715), + ("datum", 0.007720947), + ("open", 0.007720898), + ("comparison", 0.007720567), + ], + ), + ] + + +def test_frequency_distribution_of_words(v, capsys): + v.plot_frequency_distribution_of_words( + v.data + ) + captured = capsys.readouterr() + print(captured.out) + + +def test_distribution_by_topic(v, capsys): + v.plot_distribution_by_topic(v.data) + captured = capsys.readouterr() + print(captured.out) + + +def test_plot_wordcloud(v, topics, capsys): + v.plot_wordcloud(topics) + captured = capsys.readouterr() + print(captured.out) diff --git a/tox.ini b/tox.ini index 3eb707d..e13bd76 100644 --- a/tox.ini +++ b/tox.ini @@ -3,26 +3,71 @@ # THIS SCRIPT IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS! 
[tox] -minversion = 2.4 -envlist = py311, integration +minversion = 3.15 +envlist = default, clean, build, docs, doctests + [testenv] -setenv = TOXINIDIR = {toxinidir} -deps = - -rrequirements.txt - -rdev-requirements.txt +description = invoke pytest to run automated tests +isolated_build = True +setenv = + TOXINIDIR = {toxinidir} +passenv = + HOME +extras = + testing,ml commands = python -m spacy download en_core_web_sm - py.test {posargs} -extras = - all - testing + pytest {posargs} + + +[testenv:{clean,build}] +description = + Build (or clean) the package in isolation according to instructions in: + https://setuptools.readthedocs.io/en/latest/build_meta.html#how-to-use-it + https://github.com/pypa/pep517/issues/91 + https://github.com/pypa/build +# NOTE: build is still experimental, please refer to the links for updates/issues +skip_install = True +changedir = {toxinidir} +commands = + clean: python -c 'from shutil import rmtree; rmtree("build", True); rmtree("dist", True)' + build: python setup.py sdist +# By default `build` produces wheels, you can also explicitly use the flags `--sdist` and `--wheel` + + +[testenv:{docs,doctests}] +description = invoke sphinx-build to build the docs/run doctests +setenv = + DOCSDIR = {toxinidir}/docs + BUILDDIR = {toxinidir}/docs/_build + docs: BUILD = html + doctests: BUILD = doctest +deps = + -r {toxinidir}/docs/requirements.txt + # ^ requirements.txt shared with Read The Docs +commands = + sphinx-build -b {env:BUILD} -d "{env:BUILDDIR}/doctrees" "{env:DOCSDIR}" "{env:BUILDDIR}/{env:BUILD}" {posargs} + + +[testenv:publish] +description = + Publish the package you have been developing to a package index server. + By default, it uses testpypi. If you really want to publish your package + to be publicly accessible in PyPI, use the `-- --repository pypi` option. 
+skip_install = True +changedir = {toxinidir} +passenv = + TWINE_USERNAME + TWINE_PASSWORD + TWINE_REPOSITORY +deps = twine +commands = + python -m twine check dist/* + python -m twine upload {posargs:--repository testpypi} dist/* [testenv:integration] setenv = TOXINIDIR = {toxinidir} -deps = - -rrequirements.txt - -rdev-requirements.txt commands = python -m spacy download en_core_web_sm python qrminer.py \ No newline at end of file From fb3c8da86e8c577f5016f660d4aab92e18b953c0 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Sun, 4 May 2025 20:12:46 -0500 Subject: [PATCH 2/5] Update src/qrmine/mlqrmine.py return accuracy Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/qrmine/mlqrmine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/qrmine/mlqrmine.py b/src/qrmine/mlqrmine.py index a9ef33d..6bcef33 100644 --- a/src/qrmine/mlqrmine.py +++ b/src/qrmine/mlqrmine.py @@ -231,6 +231,7 @@ def get_nnet_scores(self): correct += (predicted == batch_y).sum().item() accuracy = correct / total print(f'Accuracy: {accuracy * 100:.2f}%') + return accuracy def svm_confusion_matrix(self): """Generate confusion matrix for SVM From a97c56e68aa774aba12b313f59b0b43cc6e8a5a5 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Sun, 4 May 2025 20:23:25 -0500 Subject: [PATCH 3/5] fix: update plot display to non-blocking for smoother visualization --- src/qrmine/visualize.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/qrmine/visualize.py b/src/qrmine/visualize.py index f4e93f2..729d30b 100644 --- a/src/qrmine/visualize.py +++ b/src/qrmine/visualize.py @@ -60,7 +60,7 @@ def plot_frequency_distribution_of_words(self, df=None, folder_path=None): plt.tick_params(size=16) plt.xticks(np.linspace(0, 1000, 9)) plt.title("Distribution of Document Word Counts", fontdict=dict(size=22)) - plt.show() + plt.show(block=False) # save if folder_path: plt.savefig(folder_path) @@ -96,7 +96,7 @@ def plot_distribution_by_topic(self, df=None, folder_path=None): fig.suptitle( "Distribution of Document Word Counts by Dominant Topic", fontsize=22 ) - plt.show() + plt.show(block=False) # save if folder_path: plt.savefig(folder_path) @@ -132,7 +132,7 @@ def plot_wordcloud(self, topics=None, folder_path=None): plt.axis("off") plt.margins(x=0, y=0) plt.tight_layout() - plt.show() + plt.show(block=False) # save if folder_path: plt.savefig(folder_path) @@ -186,7 +186,7 @@ def plot_importance(self, topics=None, processed_docs=None, folder_path=None): fig.tight_layout(w_pad=2) fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05) - plt.show() + plt.show(block=False) # save if folder_path: plt.savefig(folder_path) @@ -279,7 +279,7 @@ def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13, folder_pa fontweight=700, ) plt.tight_layout() - plt.show() + plt.show(block=False) # save if folder_path: plt.savefig(folder_path) @@ -319,7 +319,7 @@ def _cluster_chart(self, lda_model=None, corpus=None, n_topics=3, folder_path=No plt.xlabel("t-SNE Dimension 1", fontsize=16) plt.ylabel("t-SNE Dimension 2", fontsize=16) plt.legend(title="Topic Number", loc="upper right") - plt.show() + plt.show(block=False) # save if folder_path: plt.savefig(folder_path) @@ -393,7 +393,7 @@ def most_discussed_topics( ax2.xaxis.set_major_formatter(tick_formatter) ax2.set_title("Number of Documents by Topic Weightage", fontdict=dict(size=10)) - plt.show() + plt.show(block=False) # save if folder_path: @@ -440,7 +440,7 @@ def cluster_chart (self, data, 
folder_path=None): plt.axis('off') # Optional: Remove axes for a cleaner look plt.colorbar(self.sc, label='Colour') # Add colorbar if needed self.fig.canvas.mpl_connect("motion_notify_event", self.hover) - plt.show() + plt.show(block=False) # save if folder_path: # annotate with data['title'] From 340b1e49f18584f99d06a6249c6e359a6e9b8839 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Sun, 4 May 2025 20:36:29 -0500 Subject: [PATCH 4/5] fix: add missing spacy model download command in docs test environment --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index e13bd76..14055f6 100644 --- a/tox.ini +++ b/tox.ini @@ -47,6 +47,7 @@ deps = -r {toxinidir}/docs/requirements.txt # ^ requirements.txt shared with Read The Docs commands = + python -m spacy download en_core_web_sm sphinx-build -b {env:BUILD} -d "{env:BUILDDIR}/doctrees" "{env:DOCSDIR}" "{env:BUILDDIR}/{env:BUILD}" {posargs} From f5728889ca6522df2712aa68ae1023731588a607 Mon Sep 17 00:00:00 2001 From: Bell Eapen Date: Sun, 4 May 2025 20:40:01 -0500 Subject: [PATCH 5/5] fix: exclude notebooks from package find configuration --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ac97fc2..397a880 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,7 +116,7 @@ platforms = ["any"] [tool.setuptools.packages.find] where = [ "src"] -exclude = [ "tests", "notes", "docs"] +exclude = [ "tests", "notes", "docs", "notebooks"] namespaces = true [tool.pytest.ini_options]
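
For reference, the clustering and visualization workflow introduced in this patch series can also be driven directly from Python. The snippet below is a minimal, illustrative sketch that mirrors the calls made in src/qrmine/main.py and tests/test_nlp.py; the input file name and output image path are placeholders, and it assumes the qrmine package and the en_core_web_sm spaCy model are installed.

# Minimal sketch of the ClusterDocs / QRVisualize flow (illustrative only).
# "interview.txt" and "topics.png" are placeholder paths; ReadData.read_file()
# also accepts a folder path ending in "/" or a URL, per the new readfiles.py.
from qrmine import ClusterDocs, Content, ReadData
from qrmine.visualize import QRVisualize

data = ReadData()
data.read_file("interview.txt")

content = Content(data.content)
cluster = ClusterDocs(content)
cluster.documents = data.documents
cluster.titles = data.titles

cluster.print_topics()  # LDA topics, as printed by the --topics CLI flag
df = cluster.format_topics_sentences(visualize=True)  # dominant topic per document

v = QRVisualize(df)
v.plot_distribution_by_topic(df, folder_path="topics.png")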