import re

import numpy as np
import networkx as nx
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

# Load the English stop word list, downloading the corpus first if it is missing
try:
    STOP_WORDS = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    STOP_WORDS = stopwords.words('english')

class SummariserCosine:
    # Generate clean sentences
    def read_text(self, text):
        """
        Read the passed-in text (as a string) and split it into
        sentences at each period followed by a space (". ").
        Replace anything that isn't between
        a to z or A to Z with a space.

        Parameters
        ==========
        text:
            text string to split into individual sentences

        Return
        ======
        Returns a list of sentences, each one a list of its words
        """
        split_text = text.split(". ")

        sentences = []

        for sentence in split_text:
            # str.replace() would treat "[^a-zA-Z]" as a literal string,
            # so use re.sub() to strip the non-alphabetic characters
            words = re.sub("[^a-zA-Z]", " ", sentence).split()
            if words:
                sentences.append(words)

        return sentences
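
    # Illustrative example (hypothetical input): read_text("The cat sat. The dog ran.")
    # returns [["The", "cat", "sat"], ["The", "dog", "ran"]]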

    def extract_vector(self, sentence, all_words, stop_words):
        """
        Extract a vector from a sentence,
        skipping any stop words found in the
        stock of stop words loaded from the nltk library.
        Vectors here mean frequency counts of
        the words by index.

        Parameters
        ==========
        sentence:
            a sentence as a list of words
        all_words:
            a list of words including the words in the passed-in sentence
        stop_words:
            list of stop words to ignore or not take into consideration

        Return
        ======
        A list of word frequencies, indexed by position in
        all_words (also called a vector)
        """
        extracted_vector = [0] * len(all_words)

        # build the vector for the sentence
        for word in sentence:
            if word in stop_words:
                continue
            extracted_vector[all_words.index(word)] += 1

        return extracted_vector
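
    # Illustrative example (hypothetical values): with sentence = ["the", "cat", "sat"],
    # all_words = ["the", "cat", "sat", "mat"] and stop_words = ["the"],
    # extract_vector returns [0, 1, 1, 0] -- "the" is skipped as a stop word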

    # Check the similarity of two (adjacent) sentences
    def sentence_similarity(self, first_sentence, second_sentence, stop_words=None):
        """
        Check if two sentences are similar based
        on their vector similarity
        (similar number of frequenting words between them)

        Parameters
        ==========
        first_sentence:
            a sentence as a list of words
        second_sentence:
            another sentence as a list of words
        stop_words:
            list of stop words to ignore or not take into consideration

        Return
        ======
        A decimal representation of the similarity between the
        two sentences (cosine similarity of their vectors)
        """
        if stop_words is None:
            stop_words = []

        first_sentence = [word.lower() for word in first_sentence]
        second_sentence = [word.lower() for word in second_sentence]

        all_words = list(set(first_sentence + second_sentence))

        first_vector = self.extract_vector(first_sentence, all_words, stop_words)
        second_vector = self.extract_vector(second_sentence, all_words, stop_words)

        # cosine_distance is undefined for an all-zero vector, which happens
        # when a sentence contains only stop words, so treat that as no similarity
        if not any(first_vector) or not any(second_vector):
            return 0.0

        return 1 - cosine_distance(first_vector, second_vector)
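
    # Illustrative example (hypothetical values): for the vectors [1, 1, 0] and
    # [1, 0, 1], the dot product is 1 and each magnitude is sqrt(2), so the
    # similarity is 1 - cosine_distance = 1 / (sqrt(2) * sqrt(2)) = 0.5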

    # Similarity matrix
    def build_similarity_matrix(self, sentences, stop_words):
        """
        Create a similarity matrix from the sentences,
        comparing every pair of sentences.

        So we know which sentences are similar
        to others.

        Parameters
        ==========
        sentences:
            a list of sentences, each one a list of words
        stop_words:
            list of stop words to ignore or not take into consideration

        Return
        ======
        A square matrix of the pairwise similarities
        between the sentences
        """
        # Create an empty similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for this_sentence_index, this_sentence in enumerate(sentences):
            for another_sentence_index, another_sentence in enumerate(sentences):
                if this_sentence_index == another_sentence_index:
                    # skip comparing a sentence with itself
                    continue
                similarity_matrix[this_sentence_index][another_sentence_index] = \
                    self.sentence_similarity(this_sentence, another_sentence, stop_words)

        return similarity_matrix
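
    # Illustrative example (hypothetical values): for three sentences the result
    # is a 3x3 matrix with zeros on the diagonal, e.g.
    #     [[0.0, 0.5, 0.2],
    #      [0.5, 0.0, 0.4],
    #      [0.2, 0.4, 0.0]]
    # it is symmetric because cosine similarity is order-independent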

    # Construct the summarised text from the ranked sentences
    def summarise_text(self, ranked_sentences, top_n_sentences):
        """
        Take the sentences already ranked by their
        similarity scores and create the
        summarised text from the top ranked ones.

        Parameters
        ==========
        ranked_sentences:
            a list of (score, sentence) pairs sorted by score (descending order)
        top_n_sentences:
            number of sentences to consider from the top of the list

        Return
        ======
        The summarised text, built by joining the top n
        ranked sentences with ". "
        """
        summarised_text = []

        if top_n_sentences > len(ranked_sentences):
            top_n_sentences = len(ranked_sentences)

        for index in range(top_n_sentences):
            summarised_text.append(" ".join(ranked_sentences[index][1]))

        summarised_text = ". ".join(summarised_text)

        return summarised_text
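
    # Illustrative example (hypothetical values): with ranked_sentences =
    # [(0.6, ["dogs", "bark"]), (0.4, ["cats", "meow"])] and top_n_sentences = 1,
    # summarise_text returns "dogs bark"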

    # Sort sentences to surface the top ranked ones from the similarity matrix
    def sort_sentences_to_surface_top_ranked_sentences(self, scores, sentences):
        """
        Sort the sentences to bring the
        top ranked sentences to the surface

        Parameters
        ==========
        scores:
            scores of each of the sentences in the list of sentences
        sentences:
            a list of sentences

        Return
        ======
        a sorted list of (score, sentence) pairs based on
        their scores (highest to lowest)
        """
        return sorted(((scores[index], sentence)
                       for index, sentence in enumerate(sentences)), reverse=True)
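
    # Illustrative example (hypothetical values): with scores = {0: 0.2, 1: 0.5}
    # and sentences = [["a"], ["b"]], the result is [(0.5, ["b"]), (0.2, ["a"])]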

    # Rank the sentences using networkx's pagerank() function
    def rank_sentences(self, sentence_similarity_matrix):
        """
        Using networkx's pagerank, rank the sentences,
        generating a graph and a score for each sentence

        Parameters
        ==========
        sentence_similarity_matrix:
            a matrix of pairwise sentence similarities

        Return
        ======
        a sentence similarity graph and a dict of scores
        keyed by sentence index
        """
        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
        scores = nx.pagerank(sentence_similarity_graph)
        return sentence_similarity_graph, scores
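
    # Illustrative note: the similarity matrix is read as a weighted undirected
    # graph, so e.g. a symmetric 2x2 matrix [[0.0, 0.5], [0.5, 0.0]] yields
    # equal pagerank scores {0: 0.5, 1: 0.5}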

    # Generate the summary
    def generate_summary(self, text, top_n_sentences):
        """
        Generate a summary by processing a text
        through various steps, returning the summarised text
        and a list of ranked sentences from which the
        summary was prepared

        Parameters
        ==========
        text:
            raw text to summarise, usually a long string of text made up of multiple sentences
        top_n_sentences:
            number of sentences to pick from the list of sentences to form the summary

        Return
        ======
        The summarised text (top n sentences) and the full
        list of ranked sentences
        """

        # Step 1 - Read text and tokenize
        sentences = self.read_text(text)

        # Step 2 - Generate similarity matrix across sentences
        sentence_similarity_matrix = self.build_similarity_matrix(sentences, STOP_WORDS)

        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph, scores = self.rank_sentences(sentence_similarity_matrix)

        # Step 4 - Sort the ranks and pick the top sentences
        ranked_sentences = self.sort_sentences_to_surface_top_ranked_sentences(scores, sentences)

        # Step 5 - Construct the summarised text
        summarised_text = self.summarise_text(ranked_sentences, top_n_sentences)

        return summarised_text, ranked_sentences
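

# A minimal usage sketch (not part of the class above); the sample text and
# the choice of two summary sentences are illustrative assumptions
if __name__ == "__main__":
    summariser = SummariserCosine()
    sample_text = (
        "Text summarisation condenses a document into its key sentences. "
        "Extractive methods score each sentence and keep the highest ranked ones. "
        "This class scores sentences by cosine similarity and ranks them with pagerank. "
        "The top ranked sentences then form the final summary."
    )
    summary, ranked_sentences = summariser.generate_summary(sample_text, 2)
    print(summary)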