# https://github.com/KunalArora/SpamDetection-NLP-UCIData/blob/master/SpamDetection-NLP.ipynb
# import modules
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
# Pandas display options: show up to 1000 rows, 50 columns, and full column text
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', None)
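# The NLTK stopword list used below is not bundled with a default NLTK install;
# if it is missing locally (an assumption about the environment), download it once:
# import nltk
# nltk.download('stopwords')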
# load data and view data
sms_dataset = pd.read_csv('./dataset/spam.csv', sep=',', encoding='Latin-1')
print(sms_dataset.shape)
#print(sms_dataset.head())
# Remove empty columns and give existing columns meaningful names.
# drop meaningless columns
sms_dataset = sms_dataset.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
# Give meaningful names to the existing columns
sms_dataset.columns = ['label', 'texts']
print(sms_dataset.describe())
print(sms_dataset.shape)
#print(sms_dataset.head(5))
# Group by label and use describe(); this way we can begin to think about the features that separate ham and spam.
# print(sms_dataset.groupby('label').describe())
# Text Pre-processing
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Removes all punctuation
    2. Removes all stopwords
    3. Returns a list of the cleaned words
    """
    # Check characters to see if they are in punctuation
    no_punctuation = [char for char in mess if char not in string.punctuation]
    # Join the characters again to form the string
    no_punctuation = ''.join(no_punctuation)
    # Now just remove any stopwords
    return [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]
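# Quick sanity check of text_process on a made-up example message (not from the
# dataset); punctuation and stopwords such as "a" and "now" should be stripped out:
print(text_process('Free entry!! Win a prize now'))  # expected: ['Free', 'entry', 'Win', 'prize']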
# Create a character-count feature called "char_count"
sms_dataset['char_count'] = sms_dataset.texts.apply(len)
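# A quick look at how message length differs by class; in this dataset spam
# messages tend to be noticeably longer on average than ham:
print(sms_dataset.groupby('label')['char_count'].mean())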
#sms_dataset['texts'] = sms_dataset['texts'].apply(text_process)
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
# Might take a while...
bow_transformer = CountVectorizer(analyzer=text_process).fit(sms_dataset['texts'])
# Print total number of vocab words
print(len(bow_transformer.vocabulary_))
# Let's take one text message and get its bag-of-words counts as a vector, putting to use our new bow_transformer:
text_4 = sms_dataset['texts'][3]
print('Message number 4 (index 3): {}'.format(text_4))
# Now let's see its vector representation:
bow_4 = bow_transformer.transform([text_4])
print(bow_4)
print(bow_4.shape)
'''
The output above shows that there are seven unique words in message number 4
(after removing common stop words). Two of them appear twice, the rest only once.
'''
# Let's go ahead and check and confirm which ones appear twice:
print(bow_transformer.get_feature_names_out()[3996])
print(bow_transformer.get_feature_names_out()[9445])
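# Rather than hard-coding vocabulary indices, the non-zero entries of the sparse
# vector can be mapped back to their terms directly (a small sketch; it should
# reproduce the two repeated words printed above):
feature_names = bow_transformer.get_feature_names_out()
for idx, count in zip(bow_4.indices, bow_4.data):
    print(feature_names[idx], count)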
'''
Now we can use .transform on our bag-of-words (bow) transformer to transform the
entire DataFrame of messages. Let's check that the bag-of-words counts for the
entire SMS corpus form a large, sparse matrix:
'''
texts_bow = bow_transformer.transform(sms_dataset['texts'])
print('Shape of Sparse Matrix: ', texts_bow.shape)
print('Amount of Non-Zero occurrences: ', texts_bow.nnz)
sparsity = (100.0 * texts_bow.nnz / (texts_bow.shape[0] * texts_bow.shape[1]))
print('sparsity: {:.2f}%'.format(sparsity))
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(texts_bow)
tfidf4 = tfidf_transformer.transform(bow_4)
print('tfidf4 is',tfidf4)
# What is the IDF (inverse document frequency) of the word "u" and of the word "university"?
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])
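# For reference, TfidfTransformer's default (smooth_idf=True) computes the IDF of a
# term as log((1 + n_docs) / (1 + document_frequency)) + 1; a quick manual check of
# that formula for the word "u" (a sketch, it should match the value printed above):
n_docs = texts_bow.shape[0]
df_u = (texts_bow[:, bow_transformer.vocabulary_['u']] > 0).sum()
print(np.log((1 + n_docs) / (1 + df_u)) + 1)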
# To transform the entire bag-of-words corpus into TF-IDF corpus at once:
texts_tfidf = tfidf_transformer.transform(texts_bow)
print(texts_tfidf.shape)
# Training a model
# With messages represented as vectors, we can finally train our spam/ham classifier. We can now use almost any classification algorithm.
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(texts_tfidf, sms_dataset['label'])
print('predicted:', spam_detect_model.predict(tfidf4)[0])
print('expected:', sms_dataset.label[3])
# Model Evaluation
# Now we want to determine how well the model does on the entire dataset. Let's begin by getting all the predictions.
# Note that these are in-sample predictions (the model is evaluated on the data it was trained on),
# so the scores below are optimistic; a proper train/test split follows.
all_predictions = spam_detect_model.predict(texts_tfidf)
print(all_predictions)
from sklearn.metrics import classification_report
print (classification_report(sms_dataset['label'], all_predictions))
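# A confusion matrix gives a complementary view of the same in-sample predictions,
# showing exactly how many ham/spam messages are misclassified (a small sketch):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(sms_dataset['label'], all_predictions))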
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(sms_dataset['texts'], sms_dataset['label'], test_size=0.2)
print(len(X_train), len(X_test), len(Y_train) + len(Y_test))  # train size, test size, total
# Creating a Data Pipeline
'''
Let's run our model again and then predict off the test set.
We will use scikit-learn's Pipeline to store the whole workflow.
This lets us set up all the transformations we apply to the data once and reuse them later.
'''
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),                     # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),                   # train on TF-IDF vectors w/ Naive Bayes classifier
])
'''
Now we can directly pass message text data and the pipeline will do our pre-processing for us!
'''
pipeline.fit(X_train, Y_train)
predictions = pipeline.predict(X_test)
print(classification_report(Y_test, predictions))
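# With the fitted pipeline, new raw messages can be classified directly; the two
# example messages below are made up for illustration:
new_messages = [
    'Congratulations! You have won a free ticket, call now to claim your prize',
    'Are we still meeting for lunch tomorrow?',
]
print(pipeline.predict(new_messages))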