
Commit bf969ca

Main Code

1 parent d934213 commit bf969ca

File tree

1 file changed
+250 -0 lines changed

sum.py

Lines changed: 250 additions & 0 deletions
@@ -0,0 +1,250 @@
import re

import numpy as np
import networkx as nx
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

# Load the English stop words, downloading them first if they are missing
try:
    STOP_WORDS = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    STOP_WORDS = stopwords.words('english')

class SummariserCosine:
    # Generate clean sentences
    def read_text(self, text):
        """
        Read the passed-in text (as a string) and split it into
        sentences on each period followed by a space (". ").
        Replace anything that isn't a to z or A to Z with a space.

        Parameters
        ==========
        text:
            text string to split into individual sentences

        Return
        ======
        A list of sentences, each one a list of its words
        """
        split_text = text.split(". ")

        sentences = []

        for sentence in split_text:
            # str.replace() would treat "[^a-zA-Z]" literally;
            # re.sub() applies it as the intended pattern
            sentences.append(re.sub("[^a-zA-Z]", " ", sentence).split(" "))

        # drop the empty trailing entry left behind when the text ends with ". "
        if sentences and sentences[-1] == [""]:
            sentences.pop()

        return sentences
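
    # For example (illustrative only, not part of the original commit):
    #   read_text("The cat sat. The dog barked. ")
    #   returns [['The', 'cat', 'sat'], ['The', 'dog', 'barked']]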

    def extract_vector(self, sentence, all_words, stop_words):
        """
        Extract a vector from the sentence, skipping any
        stop words found in the stock of stop words loaded
        from the nltk library. Vector here means the
        frequency count of the words by index.

        Parameters
        ==========
        sentence:
            a sentence represented as a list of words
        all_words:
            a list of all words, including the words in the passed-in sentence
        stop_words:
            list of stop words to ignore or not take into consideration

        Return
        ======
        A list of per-word frequencies of occurrence in
        the sentence (also called a vector)
        """
        extracted_vector = [0] * len(all_words)

        # build the vector for the sentence
        for word in sentence:
            if word in stop_words:
                continue
            extracted_vector[all_words.index(word)] += 1

        return extracted_vector
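
    # For example (illustrative only): with all_words = ['the', 'cat', 'sat']
    # and stop_words = ['the'], the sentence ['the', 'cat'] yields [0, 1, 0]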

    # Check the similarity of two (adjacent) sentences
    def sentence_similarity(self, first_sentence, second_sentence, stop_words=None):
        """
        Check if two sentences are similar based
        on their vector similarity
        (similar frequencies of shared words between them)

        Parameters
        ==========
        first_sentence:
            a sentence represented as a list of words
        second_sentence:
            another sentence represented as a list of words
        stop_words:
            list of stop words to ignore or not take into consideration

        Return
        ======
        A decimal representation of the similarity between the
        two sentences (cosine similarity of their word-frequency vectors)
        """
        if stop_words is None:
            stop_words = []

        first_sentence = [word.lower() for word in first_sentence]
        second_sentence = [word.lower() for word in second_sentence]

        all_words = list(set(first_sentence + second_sentence))

        first_vector = self.extract_vector(first_sentence, all_words, stop_words)
        second_vector = self.extract_vector(second_sentence, all_words, stop_words)

        # cosine_distance returns 1 - cosine similarity, so invert it back
        return 1 - cosine_distance(first_vector, second_vector)
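
    # For example (illustrative only): identical sentences score 1.0, and
    # sentences sharing no words score 0.0:
    #   sentence_similarity(['the', 'cat'], ['the', 'cat'])  -> 1.0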

    # Similarity matrix
    def build_similarity_matrix(self, sentences, stop_words):
        """
        Create a similarity matrix from the sentences,
        comparing every sentence with every other one (round-robin).

        So we know which sentences are similar
        to others.

        Parameters
        ==========
        sentences:
            a list of sentences, each one a list of words
        stop_words:
            list of stop words to ignore or not take into consideration

        Return
        ======
        A square matrix of pairwise similarity scores
        between the sentences
        """
        # Create an empty similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for this_sentence_index, this_sentence in enumerate(sentences):
            for another_sentence_index, another_sentence in enumerate(sentences):
                if this_sentence == another_sentence:
                    # ignore if both are the same sentence
                    continue
                similarity_matrix[this_sentence_index][another_sentence_index] = \
                    self.sentence_similarity(this_sentence, another_sentence, stop_words)

        return similarity_matrix
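
    # For example (illustrative only): three sentences produce a 3x3 matrix
    # with zeros on the diagonal and similarity scores everywhere else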

    # Construct the summarised text from the ranked sentences
    def summarise_text(self, ranked_sentences, top_n_sentences):
        """
        Take the sentences already ranked by their
        similarity scores and construct the
        summarised text from the top n of them.

        Parameters
        ==========
        ranked_sentences:
            a list of (score, sentence) pairs sorted by score (descending order)
        top_n_sentences:
            number of sentences to consider from the top of the list

        Return
        ======
        The summarised text: the top n ranked sentences joined into a single string
        """
        summarised_text = []

        # never ask for more sentences than we actually have
        if top_n_sentences > len(ranked_sentences):
            top_n_sentences = len(ranked_sentences)

        for index in range(top_n_sentences):
            summarised_text.append(" ".join(ranked_sentences[index][1]))

        summarised_text = ". ".join(summarised_text)

        return summarised_text
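
    # For example (illustrative only): with ranked_sentences =
    # [(0.9, ['Dogs', 'bark']), (0.4, ['Cats', 'purr'])] and top_n_sentences=1,
    # this returns "Dogs bark"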

    # Sort sentences to surface top ranked ones from the similarity matrix
    def sort_sentences_to_surface_top_ranked_sentences(self, scores, sentences):
        """
        Sort the sentences to bring the
        top ranked sentences to the surface

        Parameters
        ==========
        scores:
            scores of each of the sentences, keyed by sentence index
        sentences:
            a list of sentences

        Return
        ======
        a list of (score, sentence) pairs sorted by score (highest to lowest)
        """
        return sorted(((scores[index], sentence)
                       for index, sentence in enumerate(sentences)), reverse=True)
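
    # For example (illustrative only): scores {0: 0.2, 1: 0.8} and
    # sentences [['a'], ['b']] sort to [(0.8, ['b']), (0.2, ['a'])]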

    # Rank the sentences using networkx's pagerank() function
    def rank_sentences(self, sentence_similarity_matrix):
        """
        Using networkx's pagerank, rank the sentences,
        generating a graph and a score for each sentence

        Parameters
        ==========
        sentence_similarity_matrix:
            a matrix of pairwise sentence similarities

        Return
        ======
        a sentence similarity graph and a dict of scores keyed by sentence index
        """
        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
        scores = nx.pagerank(sentence_similarity_graph)
        return sentence_similarity_graph, scores
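
    # pagerank() treats the similarity scores as edge weights, so sentences
    # similar to many other sentences accumulate higher scores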

    # Generating Summary Method
    def generate_summary(self, text, top_n_sentences):
        """
        Generate a summary by processing a text
        through various steps, returning the summarised text
        and a list of ranked sentences from which the
        summary was prepared

        Parameters
        ==========
        text:
            raw text to summarise, usually a long string made up of multiple sentences
        top_n_sentences:
            number of sentences to pick from the list of sentences to form the summary

        Return
        ======
        The summarised text (top n sentences) and the full list of ranked sentences
        """

        # Step 1 - Read text and tokenize
        sentences = self.read_text(text)

        # Step 2 - Generate similarity matrix across sentences
        sentence_similarity_matrix = self.build_similarity_matrix(sentences, STOP_WORDS)

        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph, scores = self.rank_sentences(sentence_similarity_matrix)

        # Step 4 - Sort the ranks and pick the top sentences
        ranked_sentences = self.sort_sentences_to_surface_top_ranked_sentences(scores, sentences)

        # Step 5 - Construct the summarised text
        summarised_text = self.summarise_text(ranked_sentences, top_n_sentences)

        return summarised_text, ranked_sentences
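

# Usage sketch (illustrative only, not part of the original commit;
# SAMPLE is a made-up input):
if __name__ == "__main__":
    SAMPLE = (
        "The quick brown fox jumps over the lazy dog. "
        "The lazy dog sleeps in the sun. "
        "A fox is a small wild animal. "
        "Dogs and foxes rarely get along. "
    )
    summariser = SummariserCosine()
    summary, ranked = summariser.generate_summary(SAMPLE, 2)
    print("Summary:", summary)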
