Skip to content

Chapter 3 Only classifying data as ham and not spam #45

@mbk0073

Description

@mbk0073

library(tm)
library(ggplot2)

#tm is the text mining package of R
#ggplot is for visualization
#there are 2 sets of files for each type of mail and one will be used for training while other will be for testing

spam.path<-"data/spam/"
spam2.path<-"data/spam_2/"
easyham.path<-"data/easy_ham/"
easyham2.path<-"data/easy_ham_2/"
hardham.path<-"data/hard_ham//"
hardham2.path<-"data/hard_ham_2/"

get.msg<-function(path){
  print(path)
  connection<-file(path,open="rt", encoding="Latin1")
  
  text<-readLines(connection)
  #the message begins after a full line break
   
  t<-which(text=="")[1]+1
  print(length(text))
  print(t)
  msg<-text[seq(t, length(text))]
 #print(msg) 
 
  close(connection)
  return (paste(msg, collapse="\n"))
  
}

#tdm=term document matrix

get.tdm<-function(doc.vec){
  doc.corpus<-Corpus(VectorSource(doc.vec))
  control<-list(stopwords=TRUE, removePunctuation=TRUE, removeNumbers=TRUE, minDocFreq=2)
  doc.dtm<-TermDocumentMatrix(doc.corpus, control)
  return (doc.dtm)
  
}



# create a vector of emails
#use apply function

spam.docs<-dir(spam.path)
#this returns a list of file names in the directory
spam.docs<-spam.docs[seq(1,length(spam.docs)-1)]
#spam.docs<-spam.docs[which(spam.docs!="")]
#cmds file is a UNIX file which we dont need
#spam.docs<-spam.docs[!startsWith(spam.docs, "cmds")]

all.spam<-sapply(spam.docs, function(p) get.msg(paste(spam.path,p, sep="")))

spam.tdm<-get.tdm(all.spam)

#use the command below for inspection
#head(all.spam)
#z<-TermDocumentMatrix(Corpus(VectorSource(all.spam)), list(stopwords=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, minDocFreq=2))

spam.matrix<- as.matrix(spam.tdm)
spam.counts<-rowSums(spam.matrix)
spam.df<-data.frame(cbind(names(spam.counts), as.numeric(spam.counts)), stringAsFactors=FALSE)
names(spam.df)<-c("term", "frequency")
spam.df$frequency<-as.numeric(spam.df$frequency)
spam.occurence<-sapply(1:nrow(spam.matrix)
                       , function(i){
                          length(which(spam.matrix[i,]>0))/ncol(spam.matrix)
                       })
spam.density<-spam.df$frequency/sum(spam.df$frequency)
spam.df<-transform(spam.df, density=spam.density, occurence=spam.occurence)

head(spam.df[with(spam.df,order(-occurence)), ])
#constructuon of Ham dataset













easy_ham.docs<-dir(easyham.path)
#this returns a list of file names in the directory
easy_ham.docs<-easy_ham.docs[seq(1,500)]
#spam.docs<-spam.docs[which(spam.docs!="")]
#cmds file is a UNIX file which we dont need
#spam.docs<-spam.docs[!startsWith(spam.docs, "cmds")]

all.easy_ham<-sapply(easy_ham.docs, function(p) get.msg(paste(easyham.path,p, sep="")))

easy_ham.tdm<-get.tdm(all.easy_ham)


#use the command below for inspection
#head(all.spam)
#z<-TermDocumentMatrix(Corpus(VectorSource(all.spam)), list(stopwords=TRUE, removeNumbers=TRUE, removePunctuation=TRUE, minDocFreq=2))

easy_ham.matrix<- as.matrix(easy_ham.tdm)
easy_ham.counts<-rowSums(easy_ham.matrix)
easy_ham.df<-data.frame(cbind(names(easy_ham.counts), as.numeric(easy_ham.counts)), stringAsFactors=FALSE)
names(easy_ham.df)<-c("term", "frequency")
easy_ham.df$frequency<-as.numeric(easy_ham.df$frequency)
easy_ham.occurence<-sapply(1:nrow(easy_ham.matrix)
                       , function(i){
                         length(which(easy_ham.matrix[i,]>0))/ncol(easy_ham.matrix)
                       })
easy_ham.density<-easy_ham.df$frequency/sum(easy_ham.df$frequency)
easy_ham.df<-transform(easy_ham.df, density=easy_ham.density, occurence=easy_ham.occurence)
easy_ham.df$NA.<-NULL
head(easy_ham.df[with(easy_ham.df,order(-occurence)), ])


#Classification function

classify.email<-function(path, training.df, prior=0.5, c=1e-6){
  msg<-get.msg(path)
  msg.tdm<-get.tdm(msg)
  msg.freq<-rowSums(as.matrix(msg.tdm))
  #Find intersection of words
  msg.match<-intersect(names(msg.freq), training.df$term)
  if(length(msg.match)<1){
    return (prior*c^(length(msg.freq)))
    
  }
  else{
    match.probs<-training.df$occurence[match(msg.match, training.df$term)]
    return (prior*prod(match.probs) * c^(length(msg.freq)-length(msg.match)))
  }
}





hardham.docs<-dir(hardham.path)
hardham.docs<-hardham.docs[seq(1:length(hardham.docs))]

hardham.spamtest<-sapply(hardham.docs, function(p) classify.email(paste(hardham.path,p, sep=""), 
                                                                  training.df = easy_ham.df))

hardham.hamtest<-sapply(hardham.docs, function(p) classify.email(paste(hardham.path, p, sep=""), training.df = easy_ham.df))

hardham.res<-ifelse(hardham.spamtest>hardham.hamtest, TRUE, FALSE)
summary(hardham.res)

This code only returns false for all values

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions