library(data.table)
library(lattice)
library(caret)
library(nnet)
# Read the two daily capture files and stack them row-wise into `data`.
capture_files <- c("2020.10.02.csv", "2020.10.03.csv")
data <- fread(capture_files[1])
data1 <- fread(capture_files[2])
data <- rbind(data, data1)
#selecting few rows


library(fastDummies)
library(ggplot2)
library(plotly)
library(GGally)
# Missing-value audit: number of NA entries in each column of `data`.
vapply(data, function(column) sum(is.na(column)), numeric(1))
# (placeholder: per-column missing-value box plots go here)
# Keep only the rows that have no missing value in any column.
data_na_removed <- na.omit(data)
# Re-run the audit on the cleaned table; every count should now be zero.
vapply(data_na_removed, function(column) sum(is.na(column)), numeric(1))
# (placeholder: box plot of the label column goes here)
# List the distinct values present in the label column.
unique(data_na_removed[["label"]])

# Recode the text labels as a factor with levels "1", "2", "3":
#   benign -> 1, malicious -> 2, outlier -> 3
label_names <- c("benign", "malicious", "outlier")
data_na_removed$label <- factor(
  data_na_removed$label,
  levels = label_names,
  labels = seq_along(label_names)
)
# Number of rows per label code, followed by a printout of the table.
summary(data_na_removed$label)
data_na_removed
#summary(data_na_removed)

#ggpairs(data_na_removed)

#options(scipen = 999)

#ggplot(na.omit(data), aes(x=label, colour = label, fill = label), stat = "count") + geom_bar() +
#  ggtitle("Distribution of Labels in Dataset") + 
#  labs(y = "Number of Cases", x = "Type of Label")


#cor.test(data_na_removed$entropy, as.numeric(data_na_removed$label))

#data_na_removed$label = as.numeric(data_na_removed$label)

#data_na_removed = data_na_removed[, -c(12, 13)]


#ggplot(data_na_removed, aes(x = entropy)) + geom_bar() + 
#  facet_wrap(~label)


#hist(data_na_removed$entropy, bins = 10)
# ---- Neural network (caret + nnet) ----
# Fits a single-hidden-layer network on a 60/40 stratified split of
# `data_na_removed` and prints confusion matrices for both partitions.

# No-op when `label` is already a factor; kept so this section can run alone.
data_na_removed$label <- as.factor(data_na_removed$label)

# Seed the RNG so the partition and the network's random starting weights are
# reproducible. (trainControl's `seeds` argument expects NA or a list of
# per-resample seeds, so a bare 123 there is not the right tool.)
set.seed(123)
training <- createDataPartition(data_na_removed$label, p = 0.6, list = FALSE)
train_set <- data_na_removed[training, ]
test_set <- data_na_removed[-training, ]
head(train_set)

# method = "none": fit once with the single parameter combination in tuneGrid
# (5 hidden units, weight decay 0.1) instead of resampling.
model <- train(
  label ~ .,
  data = train_set,
  method = "nnet",
  tuneGrid = expand.grid(.size = c(5), .decay = 0.1),
  trControl = trainControl(method = "none"),
  MaxNWts = 100,
  maxit = 100
)

# confusionMatrix() takes predictions as `data` and the truth as `reference`;
# naming both keeps sensitivity/specificity from being computed the wrong way
# round.
confusionMatrix(
  data = predict(model, newdata = train_set),
  reference = train_set$label
)

# The formula interface only reads the predictor columns, so the label column
# does not have to be stripped from `newdata`.
test_set$test_pred <- predict(model, newdata = test_set)
confusionMatrix(data = test_set$test_pred, reference = test_set$label)


#### kmeans
# Install the clustering packages only when they are missing, so re-running
# the script does not reinstall them (or need network access) every time.
for (pkg in c("ClusterR", "cluster")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(ClusterR)
library(cluster)
data_na_removed

# Feature table: the cleaned data without column 15 (the label).
data1 <- data_na_removed[, -15]
data1

# Complete-linkage hierarchical clustering on the cleaned features rather than
# on the raw `data`, which still holds NA rows and the text label column.
# NOTE: dist() needs memory quadratic in the number of rows.
dendogram <- hclust(dist(data1, method = "euclidean"), method = "complete")

# k-means starts from random centres; seed it so the clustering is repeatable.
set.seed(123)
kmeans1 <- kmeans(data1, centers = 3)

# Cross-tabulate true labels (rows) against cluster ids (columns).
# Cluster numbering is arbitrary, so cluster k need not correspond to label k;
# read the statistics below with that in mind.
cm <- table(data_na_removed$label, kmeans1$cluster)
cm
confusionMatrix(cm)

### cart
# Fits an rpart decision tree on a fresh 60/40 stratified split and evaluates
# it on the held-out rows.
data_na_removed

# Subset of columns 4, 5, 10, 11 and 15, printed for inspection.
# NOTE(review): `data1` is not used by the model below, which trains on every
# column of `data_na_removed` - confirm which of the two was intended.
data1 <- data_na_removed[, c(4, 5, 10, 11, 15)]
data1

training <- createDataPartition(data_na_removed$label, p = 0.6, list = FALSE)
train_set <- data_na_removed[training, ]
test_set <- data_na_removed[-training, ]
model <- train(label ~ ., data = train_set, method = "rpart")

# The argument is `newdata`; a misspelt name is silently ignored and predict()
# then returns training-set predictions. Score the held-out rows against
# their own labels (predictions as `data`, truth as `reference`).
confusionMatrix(
  data = predict(model, newdata = test_set),
  reference = test_set$label
)

## doing data cleaning

# Replace the two timestamp columns by their difference (time_end -
# time_start). Plain column assignment is used so this step does not depend on
# a pipe / mutate() being available from some other attached package (the
# script never loads dplyr or magrittr itself).
data_na_removed$timediff <- data_na_removed$time_end - data_na_removed$time_start
data_na_removed$time_end <- NULL
data_na_removed$time_start <- NULL

# Keep only the rows whose timediff is at least 1.
data_na_removed <- data_na_removed[data_na_removed$timediff >= 1]
data_na_removed

# Restrict the table to whitelisted values of the four endpoint columns, then
# turn each of those columns into a factor and print its level counts.
# Values are the same as before; entries that were listed twice appear once,
# which makes no difference to %in%.

# --- dest_ip ---
allowed_dest_ip <- c(
  786, 15169, 202425, 61337, 49453, 45899, 7713, 16276, 49505, 57172, 43350
)
data_na_removed <- data_na_removed[data_na_removed$dest_ip %in% allowed_dest_ip]
data_na_removed$dest_ip <- as.factor(data_na_removed$dest_ip)
summary(data_na_removed$dest_ip)

# --- dest_port ---
allowed_dest_port <- c(
  445, 9200, 22, 5900, 5060, 53, 23, 123, 33522, 33524, 33518, 33504, 33520,
  33526, 3389, 33514, 33512, 60490, 60506, 60512, 60510
)
data_na_removed <- data_na_removed[data_na_removed$dest_port %in% allowed_dest_port]
data_na_removed$dest_port <- as.factor(data_na_removed$dest_port)
summary(data_na_removed$dest_port)

# --- src_ip ---
allowed_src_ip <- c(
  786, 45899, 202425, 7552, 7713, 49453, 8048, 18403, 16276, 43350, 213371,
  4134, 34665, 12389, 200019, 57172, 9299, 12876, 8452, 3462,
  25019, 24961, 55836, 45820, 8151, 45090, 45595, 9498, 45903, 47331, 4812,
  9121, 6503, 9484, 4837, 8376, 15895, 9009, 6057
)
data_na_removed <- data_na_removed[data_na_removed$src_ip %in% allowed_src_ip]
data_na_removed$src_ip <- as.factor(data_na_removed$src_ip)
summary(data_na_removed$src_ip)

# --- src_port ---
allowed_src_port <- c(
  9200, 33504, 33524, 33518, 33514, 33522, 60510,
  33512, 60516, 33526, 60490, 33520, 60512, 60506, 60514, 60518, 60508,
  55336, 55330, 55332, 55334, 123, 53278, 53020, 32651, 26042
)
data_na_removed <- data_na_removed[data_na_removed$src_port %in% allowed_src_port]
data_na_removed$src_port <- as.factor(data_na_removed$src_port)
summary(data_na_removed$src_port)
data_na_removed

# One-hot encode the four factor columns: dummyVars() builds the encoder and
# predict() produces one indicator column per factor level.
dmy <- dummyVars(" ~dest_ip+dest_port+src_ip+src_port", data = data_na_removed)
trsf <- data.frame(predict(dmy, newdata = data_na_removed))
data_na_removed <- cbind(data_na_removed, trsf)

# Drop the original factor columns now that their indicator columns exist.
# They are removed by name rather than by position so the step stays correct
# if the column layout changes.
# NOTE(review): this assumes the former positional drop (columns 4, 5, 10, 11)
# targeted exactly these four columns - confirm against the CSV header.
data_na_removed$dest_ip <- NULL
data_na_removed$dest_port <- NULL
data_na_removed$src_ip <- NULL
data_na_removed$src_port <- NULL

# timediff was only needed for the row filter above.
data_na_removed$timediff <- NULL
data_na_removed
# Standardise a numeric vector to mean 0 and standard deviation 1.
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.numeric() strips both so a plain numeric vector is stored in
# the table instead of a matrix column.
standardize_column <- function(x) {
  as.numeric(scale(x))
}

# Standardise the continuous features in place.
# NOTE(review): num_pkts_out is scaled but num_pkts_in is not - confirm
# whether that omission is intentional.
data_na_removed$avg_ipt <- standardize_column(data_na_removed$avg_ipt)
data_na_removed$bytes_in <- standardize_column(data_na_removed$bytes_in)
data_na_removed$bytes_out <- standardize_column(data_na_removed$bytes_out)
data_na_removed$entropy <- standardize_column(data_na_removed$entropy)
data_na_removed$num_pkts_out <- standardize_column(data_na_removed$num_pkts_out)
data_na_removed$proto <- standardize_column(data_na_removed$proto)
data_na_removed$total_entropy <- standardize_column(data_na_removed$total_entropy)
data_na_removed$duration <- standardize_column(data_na_removed$duration)
summary(data_na_removed)