-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path9. Bag of Words.py
56 lines (45 loc) · 1.64 KB
/
9. Bag of Words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
'''
Bag-of-Words -
We need a way to represent text data for machine learning algorithms, and
the bag-of-words model lets us do that.
The bag-of-words model is simple to understand and implement.
It is a way of extracting features from the text for use in machine learning algorithms.
'''
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Configure pandas display limits: console width and maximum columns shown.
pd.set_option(
    "display.width", 400,
    "display.max_columns", 20,
)

# Sample corpus: four short messages fed to the vectorizers below.
documents_txt = [
    "Hello, how are you!",
    "Win money, win from home.",
    "Call me now.",
    "Hello, Call hello you tomorrow?",
]
# cleaning data
def cleaning(raw_review):
    """Normalize one raw text document into a string of meaningful words.

    The text is parsed with BeautifulSoup to drop any HTML markup, every
    character that is not an ASCII letter is replaced by a space, the result
    is lower-cased and split on whitespace, and words found in NLTK's English
    stop-word list are discarded.

    Args:
        raw_review: Raw document text, possibly containing HTML markup.

    Returns:
        The remaining words joined by single spaces (an empty string when
        nothing is left).
    """
    plain_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters_only = re.sub(r"[^a-zA-Z]", " ", plain_text)
    words = letters_only.lower().split()
    # A set gives O(1) membership checks; the corpus list would otherwise be
    # scanned from the start for every single word.
    stop_words = set(stopwords.words("english"))
    return " ".join(word for word in words if word not in stop_words)
# Run the manual cleaning routine over every raw document.
documents = [cleaning(raw_text) for raw_text in documents_txt]

# Comment out the line below to keep the cleaned documents instead of the
# raw ones.
documents = documents_txt
# Vectorizers to compare on the same documents: raw term counts and TF-IDF.
classifiers = [
    CountVectorizer(),
    TfidfVectorizer(),
]

for clf in classifiers:
    clf.fit(documents)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement and fall back only on releases that predate it.
    if hasattr(clf, "get_feature_names_out"):
        # Convert the returned array to plain strings so it prints as a list.
        feature_names = [str(name) for name in clf.get_feature_names_out()]
    else:
        feature_names = clf.get_feature_names()
    print(feature_names)
    doc_array = clf.transform(documents).toarray()
    # One row per document, one column per vocabulary term.
    frequency_matrix = pd.DataFrame(doc_array, index=documents, columns=feature_names)
    print(frequency_matrix)