-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Open
Description
library(tm)
library(ggplot2)
#tm is the text mining package of R
#ggplot is for visualization
#there are 2 sets of files for each type of mail and one will be used for training while other will be for testing
spam.path<-"data/spam/"
spam2.path<-"data/spam_2/"
easyham.path<-"data/easy_ham/"
easyham2.path<-"data/easy_ham_2/"
hardham.path<-"data/hard_ham//"
hardham2.path<-"data/hard_ham_2/"
get.msg<-function(path){
print(path)
connection<-file(path,open="rt", encoding="Latin1")
text<-readLines(connection)
#the message begins after a full line break
t<-which(text=="")[1]+1
print(length(text))
print(t)
msg<-text[seq(t, length(text))]
#print(msg)
close(connection)
return (paste(msg, collapse="\n"))
}
#tdm=term document matrix
get.tdm<-function(doc.vec){
doc.corpus<-Corpus(VectorSource(doc.vec))
control<-list(stopwords=TRUE, removePunctuation=TRUE, removeNumbers=TRUE, minDocFreq=2)
doc.dtm<-TermDocumentMatrix(doc.corpus, control)
return (doc.dtm)
}
# create a vector of emails
#use apply function
spam.docs<-dir(spam.path)
#this returns a list of file names in the directory
spam.docs<-spam.docs[seq(1,length(spam.docs)-1)]
#spam.docs<-spam.docs[which(spam.docs!="")]
#cmds file is a UNIX file which we dont need
#spam.docs<-spam.docs[!startsWith(spam.docs, "cmds")]
all.spam<-sapply(spam.docs, function(p) get.msg(paste(spam.path,p, sep="")))
spam.tdm<-get.tdm(all.spam)
#use the command below for inspection
#head(all.spam)
#z<-TermDocumentMatrix(Corpus(VectorSource(all.spam)), list(stopwords=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, minDocFreq=2))
spam.matrix<- as.matrix(spam.tdm)
spam.counts<-rowSums(spam.matrix)
spam.df<-data.frame(cbind(names(spam.counts), as.numeric(spam.counts)), stringAsFactors=FALSE)
names(spam.df)<-c("term", "frequency")
spam.df$frequency<-as.numeric(spam.df$frequency)
spam.occurence<-sapply(1:nrow(spam.matrix)
, function(i){
length(which(spam.matrix[i,]>0))/ncol(spam.matrix)
})
spam.density<-spam.df$frequency/sum(spam.df$frequency)
spam.df<-transform(spam.df, density=spam.density, occurence=spam.occurence)
head(spam.df[with(spam.df,order(-occurence)), ])
#constructuon of Ham dataset
easy_ham.docs<-dir(easyham.path)
#this returns a list of file names in the directory
easy_ham.docs<-easy_ham.docs[seq(1,500)]
#spam.docs<-spam.docs[which(spam.docs!="")]
#cmds file is a UNIX file which we dont need
#spam.docs<-spam.docs[!startsWith(spam.docs, "cmds")]
all.easy_ham<-sapply(easy_ham.docs, function(p) get.msg(paste(easyham.path,p, sep="")))
easy_ham.tdm<-get.tdm(all.easy_ham)
#use the command below for inspection
#head(all.spam)
#z<-TermDocumentMatrix(Corpus(VectorSource(all.spam)), list(stopwords=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, minDocFreq=2))
easy_ham.matrix<- as.matrix(easy_ham.tdm)
easy_ham.counts<-rowSums(easy_ham.matrix)
easy_ham.df<-data.frame(cbind(names(easy_ham.counts), as.numeric(easy_ham.counts)), stringAsFactors=FALSE)
names(easy_ham.df)<-c("term", "frequency")
easy_ham.df$frequency<-as.numeric(easy_ham.df$frequency)
easy_ham.occurence<-sapply(1:nrow(easy_ham.matrix)
, function(i){
length(which(easy_ham.matrix[i,]>0))/ncol(easy_ham.matrix)
})
easy_ham.density<-easy_ham.df$frequency/sum(easy_ham.df$frequency)
easy_ham.df<-transform(easy_ham.df, density=easy_ham.density, occurence=easy_ham.occurence)
easy_ham.df$NA.<-NULL
head(easy_ham.df[with(easy_ham.df,order(-occurence)), ])
#Classification function
classify.email<-function(path, training.df, prior=0.5, c=1e-6){
msg<-get.msg(path)
msg.tdm<-get.tdm(msg)
msg.freq<-rowSums(as.matrix(msg.tdm))
#Find intersection of words
msg.match<-intersect(names(msg.freq), training.df$term)
if(length(msg.match)<1){
return (prior*c^(length(msg.freq)))
}
else{
match.probs<-training.df$occurence[match(msg.match, training.df$term)]
return (prior*prod(match.probs) * c^(length(msg.freq)-length(msg.match)))
}
}
hardham.docs<-dir(hardham.path)
hardham.docs<-hardham.docs[seq(1:length(hardham.docs))]
hardham.spamtest<-sapply(hardham.docs, function(p) classify.email(paste(hardham.path,p, sep=""),
training.df = easy_ham.df))
hardham.hamtest<-sapply(hardham.docs, function(p) classify.email(paste(hardham.path, p, sep=""), training.df = easy_ham.df))
hardham.res<-ifelse(hardham.spamtest>hardham.hamtest, TRUE, FALSE)
summary(hardham.res)
This code only returns false for all values
Metadata
Metadata
Assignees
Labels
No labels