diff options
Diffstat (limited to 'project.R')
| -rw-r--r-- | project.R | 142 |
1 files changed, 142 insertions, 0 deletions
# project.R -- reconstructed from the added lines of the diff (new file);
# diff markers removed and statements put back on their own lines.

# Packages ----
library(data.table)
library(lattice)
library(caret)
library(nnet)
library(fastDummies)
library(ggplot2)
library(plotly)
library(GGally)

# Load data ----
# Two daily CSV captures are read and stacked into a single table.
data <- fread("2020.10.02.csv")
data1 <- fread("2020.10.03.csv")
data <- rbind(data, data1)
# selecting few rows

# Missing values ----
# finding missing values in each column
colSums(is.na(data))
# each column missing values box plot comes here
# taking non missing rows alone
data_na_removed <- na.omit(data)
# checking if missing values are gone
colSums(is.na(data_na_removed))
# box plot of label column comes here
# checking unique values
unique(data_na_removed$label)

# Label encoding ----
# 1 - Benign 2 - Malicious 3 - Outlier
# Any label value other than these three strings becomes NA here.
data_na_removed$label <- factor(
  data_na_removed$label,
  levels = c("benign", "malicious", "outlier"),
  labels = c(1, 2, 3)
)
summary(data_na_removed$label)
data_na_removed

# Exploratory code kept disabled, as in the original ----
#summary(data_na_removed)

#ggpairs(data_na_removed)

#options(scipen = 999)

#ggplot(na.omit(data), aes(x=label, colour = label, fill = label), stat = "count") + geom_bar() +
#  ggtitle("Distibution of Labels in Dataset") +
#  labs(y = "Number of Cases", x = "Type of Label")

#cor.test(data_na_removed$entropy, as.numeric(data_na_removed$label))

#data_na_removed$label = as.numeric(data_na_removed$label)

#data_na_removed = data_na_removed[, -c(12, 13)]

#ggplot(data_na_removed, aes(x = entropy)) + geom_bar() +
#  facet_wrap(~label)

#hist(data_na_removed$entropy, bins = 10)

# Train/test split and neural network ----
# `label` is already a factor at this point, so no second as.factor() call
# is needed.
# Seed the RNG so that the 60/40 split and the network's random initial
# weights are the same on every run; `trainControl(seeds = )` does not do this.
set.seed(123)
training <- createDataPartition(data_na_removed$label, p = 0.6, list = FALSE)
train_set <- data_na_removed[training, ]
test_set <- data_na_removed[-training, ]
head(train_set)

# Single fixed configuration (5 hidden units, decay 0.1); method = "none"
# fits the model once without resampling.
model <- train(
  label ~ .,
  data = train_set,
  method = "nnet",
  tuneGrid = expand.grid(.size = c(5), .decay = 0.1),
  trControl = trainControl(method = "none", seeds = 123),
  MaxNWts = 100,
  maxit = 100
)
# Neural-network evaluation ----
# Expects `model`, `train_set`, `test_set` and `data_na_removed` from the
# preceding part of the script.
# confusionMatrix() takes the predictions first (`data`) and the true classes
# second (`reference`); predict() needs `newdata`, not `data`.
train_pred <- predict(model, newdata = train_set)
confusionMatrix(data = train_pred, reference = train_set$label)

# The model was fitted through the formula interface, so predict() selects
# the predictor columns by name; the label column does not have to be dropped
# by position first.
test_set$test_pred <- predict(model, newdata = test_set)
confusionMatrix(data = test_set$test_pred, reference = test_set$label)

# The original called aggregate() here with no arguments, which is an error;
# the call has been removed.

# K-means ----
# Install the clustering packages only when they are missing instead of
# reinstalling them on every run.
for (pkg in c("ClusterR", "cluster")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(ClusterR)
library(cluster)
data_na_removed

# All columns except the label, selected by name rather than by position.
feature_cols <- setdiff(names(data_na_removed), "label")
data1 <- data_na_removed[, feature_cols, with = FALSE]
data1

# Hierarchical clustering on the NA-free feature table (the original passed
# the raw `data`, which still contains the NA rows and the label column).
# NOTE(review): dist() needs memory quadratic in the number of rows and
# `dendogram` is not used again in this script -- confirm it is still wanted.
dendogram <- hclust(dist(data1, method = "euclidean"), method = "complete")

# kmeans() picks random starting centres; seed it for repeatable clusters.
set.seed(123)
kmeans1 <- kmeans(data1, centers = 3)
cm <- table(data_na_removed$label, kmeans1$cluster)
cm
# NOTE(review): cluster numbers are arbitrary, so cluster k need not
# correspond to label k; read the accuracy figures below with that in mind.
confusionMatrix(cm)

# CART ----
data_na_removed
# NOTE(review): this column subset is printed but not used by the rpart fit
# below, which trains on every column -- confirm which was intended.
data1 <- data_na_removed[, c(4, 5, 10, 11, 15)]
data1

set.seed(123)
training <- createDataPartition(data_na_removed$label, p = 0.6, list = FALSE)
train_set <- data_na_removed[training, ]
test_set <- data_na_removed[-training, ]
model <- train(label ~ ., data = train_set, method = "rpart")

# Training-set performance (what the original call actually computed, because
# its misspelled `new_data` argument left `newdata` unset).
confusionMatrix(
  data = predict(model, newdata = train_set),
  reference = train_set$label
)
# Held-out performance.
confusionMatrix(
  data = predict(model, newdata = test_set),
  reference = test_set$label
)

# Data cleaning ----
# Replace the two timestamps by their difference. Plain column assignment is
# used so this step does not depend on a pipe/mutate re-exported by another
# package.
data_na_removed$timediff <-
  data_na_removed$time_end - data_na_removed$time_start
data_na_removed$time_end <- NULL
data_na_removed$time_start <- NULL

# Keep only rows whose time difference is at least 1.
data_na_removed <- data_na_removed[data_na_removed$timediff >= 1]
data_na_removed

# Restrict each address/port column to a fixed set of values, then turn it
# into a factor so it can be dummy-encoded.
keep_dest_ip <- c(
  786, 15169, 202425, 61337, 49453, 45899, 7713, 16276, 49505, 57172, 43350
)
data_na_removed <- data_na_removed[data_na_removed$dest_ip %in% keep_dest_ip]
data_na_removed$dest_ip <- as.factor(data_na_removed$dest_ip)
summary(data_na_removed$dest_ip)

# Duplicate entries of the original list removed; %in% gives the same result.
keep_dest_port <- c(
  445, 9200, 22, 5900, 5060, 53, 23, 123, 33522, 33524, 33518, 33504, 33520,
  33526, 3389, 33514, 33512, 60490, 60506, 60512, 60510
)
data_na_removed <- data_na_removed[
  data_na_removed$dest_port %in% keep_dest_port
]
data_na_removed$dest_port <- as.factor(data_na_removed$dest_port)
summary(data_na_removed$dest_port)

keep_src_ip <- c(
  786, 45899, 202425, 7552, 7713, 49453, 8048, 18403, 16276, 43350, 213371,
  4134, 34665, 12389, 200019, 57172, 9299, 12876, 8452, 3462,
  25019, 24961, 55836, 45820, 8151, 45090, 45595,
  9498, 45903, 47331, 4812, 9121,
  6503, 9484, 4837, 8376, 15895, 9009, 6057
)
data_na_removed <- data_na_removed[data_na_removed$src_ip %in% keep_src_ip]
data_na_removed$src_ip <- as.factor(data_na_removed$src_ip)
summary(data_na_removed$src_ip)

keep_src_port <- c(
  9200, 33504, 33524, 33518, 33514, 33522, 60510,
  33512, 60516, 33526, 60490, 33520, 60512, 60506, 60514, 60518, 60508,
  55336, 55330, 55332, 55334, 123, 53278, 53020, 32651, 26042
)
data_na_removed <- data_na_removed[
  data_na_removed$src_port %in% keep_src_port
]
data_na_removed$src_port <- as.factor(data_na_removed$src_port)
summary(data_na_removed$src_port)
data_na_removed

# Dummy encoding ----
# One indicator column per factor level, appended to the table; the four
# source columns are then dropped by name (the original dropped positions
# 4, 5, 10 and 11).
encoded_cols <- c("dest_ip", "dest_port", "src_ip", "src_port")
dmy <- dummyVars(" ~dest_ip+dest_port+src_ip+src_port", data = data_na_removed)
trsf <- data.frame(predict(dmy, newdata = data_na_removed))
data_na_removed <- cbind(data_na_removed, trsf)
data_na_removed <- data_na_removed[
  , setdiff(names(data_na_removed), encoded_cols), with = FALSE
]
data_na_removed$timediff <- NULL
data_na_removed

# Standardise numeric features ----
# scale() returns a one-column matrix; as.numeric() stores a plain numeric
# vector in the table instead.
scaled_cols <- c(
  "avg_ipt", "bytes_in", "bytes_out", "entropy", "num_pkts_out", "proto",
  "total_entropy", "duration"
)
data_na_removed[
  , (scaled_cols) := lapply(.SD, function(x) as.numeric(scale(x))),
  .SDcols = scaled_cols
]
summary(data_na_removed)
