1+ """
2+ Copyright (C) 2025 Bell Eapen
3+
4+ This file is part of qrmine.
5+
6+ qrmine is free software: you can redistribute it and/or modify
7+ it under the terms of the GNU General Public License as published by
8+ the Free Software Foundation, either version 3 of the License, or
9+ (at your option) any later version.
10+
11+ qrmine is distributed in the hope that it will be useful,
12+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+ GNU General Public License for more details.
15+
16+ You should have received a copy of the GNU General Public License
17+ along with qrmine. If not, see <https://www.gnu.org/licenses/>.
18+ """
19+
20+ from pprint import pprint
21+
22+ import pandas as pd
123import spacy
224from gensim import corpora
325from gensim .models .ldamodel import LdaModel
4- import pandas as pd
5- from pprint import pprint
26+
27+
628class ClusterDocs :
729
830 def __init__ (self , documents = [], titles = []):
@@ -74,7 +96,10 @@ def process(self):
def build_lda_model(self):
    """Return the topics of the LDA model, training the model on first use.

    If ``self._lda_model`` is still ``None``, an ``LdaModel`` is fitted on
    ``self._corpus`` using ``self._num_topics``, ``self._dictionary`` (as
    ``id2word``) and ``self._passes``, and the result is stored back on
    ``self._lda_model`` so later calls reuse it instead of retraining.

    Returns:
        Whatever ``show_topics(formatted=False)`` yields for the cached
        model (presumably a list of ``(topic_id, [(word, weight), ...])``
        pairs -- confirm against the gensim documentation).
    """
    model = self._lda_model
    if model is None:
        # Read the attributes in the same order the constructor call uses.
        corpus = self._corpus
        training_options = {
            "num_topics": self._num_topics,
            "id2word": self._dictionary,
            "passes": self._passes,
        }
        model = LdaModel(corpus, **training_options)
        # Cache the fitted model so the training cost is paid only once.
        self._lda_model = model
    return model.show_topics(formatted=False)
80105
@@ -88,9 +113,13 @@ def print_clusters(self):
88113 if self ._lda_model is None :
89114 self .build_lda_model ()
90115 # Perform semantic clustering
91- for i , doc in enumerate (self ._processed_docs ): # Changed from get_processed_docs() to _documents
116+ for i , doc in enumerate (
117+ self ._processed_docs
118+ ): # Changed from get_processed_docs() to _processed_docs
92119 bow = self ._dictionary .doc2bow (doc )
93- print (f"Document { self ._titles [i ]} belongs to topic: { self ._lda_model .get_document_topics (bow )} " )
120+ print (
121+ f"Document { self ._titles [i ]} belongs to topic: { self ._lda_model .get_document_topics (bow )} "
122+ )
94123
95124 def format_topics_sentences (self ):
96125 self .build_lda_model ()
@@ -107,12 +136,24 @@ def format_topics_sentences(self):
107136 if j == 0 : # => dominant topic
108137 wp = self ._lda_model .show_topic (topic_num )
109138 topic_keywords = ", " .join ([word for word , prop in wp ])
110- new_row = pd .DataFrame ([[int (topic_num ), round (prop_topic , 4 ), topic_keywords ]],
111- columns = ["Dominant_Topic" , "Perc_Contribution" , "Topic_Keywords" ])
112- sent_topics_df = pd .concat ([sent_topics_df , new_row ], ignore_index = True )
139+ new_row = pd .DataFrame (
140+ [[int (topic_num ), round (prop_topic , 4 ), topic_keywords ]],
141+ columns = [
142+ "Dominant_Topic" ,
143+ "Perc_Contribution" ,
144+ "Topic_Keywords" ,
145+ ],
146+ )
147+ sent_topics_df = pd .concat (
148+ [sent_topics_df , new_row ], ignore_index = True
149+ )
113150 else :
114151 break
115- sent_topics_df .columns = ["Dominant_Topic" , "Perc_Contribution" , "Topic_Keywords" ]
152+ sent_topics_df .columns = [
153+ "Dominant_Topic" ,
154+ "Perc_Contribution" ,
155+ "Topic_Keywords" ,
156+ ]
116157
117158 # Add original text to the end of the output
118159 contents = pd .Series (self ._processed_docs )
@@ -136,7 +177,6 @@ def most_representative_docs(self):
136177
137178 return sent_topics_sorteddf_mallet
138179
139-
140180 def topics_per_document (self , start = 0 , end = 1 ):
141181 corpus_sel = self ._corpus [start :end ]
142182 dominant_topics = []
0 commit comments