Skip to content
This repository was archived by the owner on Oct 13, 2025. It is now read-only.

Commit ddc6b96

Browse files
committed
feat: add copyright notice and improve code formatting in cluster and visualize modules; enhance test readability in test files
1 parent fc49b94 commit ddc6b96

File tree

4 files changed

+309
-91
lines changed

4 files changed

+309
-91
lines changed

src/qrmine/cluster.py

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,30 @@
1+
"""
2+
Copyright (C) 2025 Bell Eapen
3+
4+
This file is part of qrmine.
5+
6+
qrmine is free software: you can redistribute it and/or modify
7+
it under the terms of the GNU General Public License as published by
8+
the Free Software Foundation, either version 3 of the License, or
9+
(at your option) any later version.
10+
11+
qrmine is distributed in the hope that it will be useful,
12+
but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
GNU General Public License for more details.
15+
16+
You should have received a copy of the GNU General Public License
17+
along with qrmine. If not, see <https://www.gnu.org/licenses/>.
18+
"""
19+
20+
from pprint import pprint
21+
22+
import pandas as pd
123
import spacy
224
from gensim import corpora
325
from gensim.models.ldamodel import LdaModel
4-
import pandas as pd
5-
from pprint import pprint
26+
27+
628
class ClusterDocs:
729

830
def __init__(self, documents=[], titles=[]):
@@ -74,7 +96,10 @@ def process(self):
7496
def build_lda_model(self):
7597
if self._lda_model is None:
7698
self._lda_model = LdaModel(
77-
self._corpus, num_topics=self._num_topics, id2word=self._dictionary, passes=self._passes
99+
self._corpus,
100+
num_topics=self._num_topics,
101+
id2word=self._dictionary,
102+
passes=self._passes,
78103
)
79104
return self._lda_model.show_topics(formatted=False)
80105

@@ -88,9 +113,13 @@ def print_clusters(self):
88113
if self._lda_model is None:
89114
self.build_lda_model()
90115
# Perform semantic clustering
91-
for i, doc in enumerate(self._processed_docs): # Changed from get_processed_docs() to _documents
116+
for i, doc in enumerate(
117+
self._processed_docs
118+
): # Changed from get_processed_docs() to _documents
92119
bow = self._dictionary.doc2bow(doc)
93-
print(f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}")
120+
print(
121+
f"Document {self._titles[i]} belongs to topic: {self._lda_model.get_document_topics(bow)}"
122+
)
94123

95124
def format_topics_sentences(self):
96125
self.build_lda_model()
@@ -107,12 +136,24 @@ def format_topics_sentences(self):
107136
if j == 0: # => dominant topic
108137
wp = self._lda_model.show_topic(topic_num)
109138
topic_keywords = ", ".join([word for word, prop in wp])
110-
new_row = pd.DataFrame([[int(topic_num), round(prop_topic, 4), topic_keywords]],
111-
columns=["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"])
112-
sent_topics_df = pd.concat([sent_topics_df, new_row], ignore_index=True)
139+
new_row = pd.DataFrame(
140+
[[int(topic_num), round(prop_topic, 4), topic_keywords]],
141+
columns=[
142+
"Dominant_Topic",
143+
"Perc_Contribution",
144+
"Topic_Keywords",
145+
],
146+
)
147+
sent_topics_df = pd.concat(
148+
[sent_topics_df, new_row], ignore_index=True
149+
)
113150
else:
114151
break
115-
sent_topics_df.columns = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
152+
sent_topics_df.columns = [
153+
"Dominant_Topic",
154+
"Perc_Contribution",
155+
"Topic_Keywords",
156+
]
116157

117158
# Add original text to the end of the output
118159
contents = pd.Series(self._processed_docs)
@@ -136,7 +177,6 @@ def most_representative_docs(self):
136177

137178
return sent_topics_sorteddf_mallet
138179

139-
140180
def topics_per_document(self, start=0, end=1):
141181
corpus_sel = self._corpus[start:end]
142182
dominant_topics = []

0 commit comments

Comments
 (0)