 import streamlit as st
 import pickle
 import string
-from nltk.corpus import stopwords
 import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
 from nltk.stem.porter import PorterStemmer

-ps = PorterStemmer()
+# Ensure NLTK resources are downloaded
+nltk.download('punkt')
+nltk.download('stopwords')

+ps = PorterStemmer()

 def transform_text(text):
     text = text.lower()
-    text = nltk.word_tokenize(text)
+    text = word_tokenize(text)  # Tokenize text

-    y = []
-    for i in text:
-        if i.isalnum():
-            y.append(i)
+    # Remove non-alphanumeric tokens and punctuation, then remove stopwords
+    text = [i for i in text if i.isalnum()]
+    text = [i for i in text if i not in stopwords.words('english')]
+    text = [i for i in text if i not in string.punctuation]

-    text = y[:]
-    y.clear()
+    # Apply stemming
+    text = [ps.stem(i) for i in text]

-    for i in text:
-        if i not in stopwords.words('english') and i not in string.punctuation:
-            y.append(i)
+    return " ".join(text)

-    text = y[:]
-    y.clear()
-
-    for i in text:
-        y.append(ps.stem(i))
-
-    return " ".join(y)
-
-tfidf = pickle.load(open('vectorizer.pkl','rb'))
-model = pickle.load(open('model.pkl','rb'))
+# Load pre-trained models
+tfidf = pickle.load(open('vectorizer.pkl', 'rb'))
+model = pickle.load(open('model.pkl', 'rb'))

+# Streamlit app
 st.title("Email/SMS Spam Classifier")

 input_sms = st.text_area("Enter the message")

 if st.button('Predict'):
-
-    # 1. preprocess
+    # 1. Preprocess the input
     transformed_sms = transform_text(input_sms)
-    # 2. vectorize
+    # 2. Vectorize the input
     vector_input = tfidf.transform([transformed_sms])
-    # 3. predict
+    # 3. Predict
     result = model.predict(vector_input)[0]
-    # 4. Display
+    # 4. Display the result
     if result == 1:
         st.header("Spam")
     else:
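For context, a minimal usage sketch of the refactored `transform_text` follows. The sample message and the indicated output are illustrative assumptions, not part of this commit; the snippet presumes the function is defined as in the new version of the file and that the NLTK `punkt` and `stopwords` data have been downloaded.

```python
# Illustrative sketch only (not from the repo): exercises the new
# transform_text on a made-up message.
sample = "Congratulations! You have won a free ticket, call now!!!"
print(transform_text(sample))
# Roughly: "congratul won free ticket call"
# (lowercased, punctuation and stopwords dropped, tokens Porter-stemmed)
```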