aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- README.md 36
-rw-r--r-- project.R 142
2 files changed, 36 insertions, 142 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..cf56ae5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,36 @@
+# Network Intrusion Detection
+A Comprehensive Approach To Analysis and Detection of Emerging Threats due to Network Intrusion
+
+## Required Tools
+
+Some tools are required to run the project.
+
+- [RStudio](https://www.rstudio.com/)
+- [WGet](https://www.gnu.org/software/wget/)
+
+## Downloading the Dataset
+
+To download the dataset, use the [`dataset_downloader.sh`](dataset_downloader.sh) script on UNIX, Linux, or macOS.
+
+```bash
+$ chmod +x dataset_downloader.sh
+$ ./dataset_downloader.sh
+```
+
+To download the dataset, use the [`dataset_downloader.bat`](dataset_downloader.bat) script on Windows.
+
+## Starting the Project
+
+To start the project, you need to build the models in RStudio. Run the [models.R](models.R) script in RStudio.
+
+There are 4 models to build:
+- Deep Learning Model
+- Distributed Random Forest Model
+- Gradient Boosting Machine Model
+- Naive Bayes Model
+
+You can add more models to the project by adding them to the [models.R](models.R) script and importing them in the [app.R](app.R) script.
+
+In order to run the [R Shiny App](https://shiny.rstudio.com/), you need to build the project in RStudio. Run the [app.R](app.R) script in RStudio.
+
+
diff --git a/project.R b/project.R
deleted file mode 100644
index 2320cb3..0000000
--- a/project.R
+++ /dev/null
@@ -1,142 +0,0 @@
-library(data.table)
-library(lattice)
-library(caret)
-library(nnet)
-data <- fread("2020.10.02.csv")
-data1<-fread("2020.10.03.csv")
-data=rbind(data,data1)
-#selecting few rows
-
-
-library(fastDummies)
-library(ggplot2)
-library(plotly)
-library(GGally)
-#finding missing values in each column
-colSums(is.na(data))
-#each column missing values box plot comes here
-#taking non missing rows alone
-data_na_removed = na.omit(data)
-#checking if missing values are gone
-colSums(is.na(data_na_removed))
-#box plot of label column comes here
-#checking unique values
-unique(data_na_removed$label)
-
-# 1 - Benign 2 - Malicious 3 - Outlier
-
-data_na_removed$label = factor(data_na_removed$label,
- levels = c("benign", "malicious", "outlier"),
- labels = c(1, 2, 3))
-summary(data_na_removed$label)
-data_na_removed
-#summary(data_na_removed)
-
-#ggpairs(data_na_removed)
-
-#options(scipen = 999)
-
-#ggplot(na.omit(data), aes(x=label, colour = label, fill = label), stat = "count") + geom_bar() +
-# ggtitle("Distibution of Labels in Dataset") +
-# labs(y = "Number of Cases", x = "Type of Label")
-
-
-#cor.test(data_na_removed$entropy, as.numeric(data_na_removed$label))
-
-#data_na_removed$label = as.numeric(data_na_removed$label)
-
-#data_na_removed = data_na_removed[, -c(12, 13)]
-
-
-#ggplot(data_na_removed, aes(x = entropy)) + geom_bar() +
-# facet_wrap(~label)
-
-
-#hist(data_na_removed$entropy, bins = 10)
-data_na_removed$label=as.factor(data_na_removed$label)
-training=createDataPartition(data_na_removed$label,p=0.6,list=FALSE)
-train_set=data_na_removed[training,]
-test_set=data_na_removed[-training,]
-head(train_set)
-model=train(data=train_set,label~.,method="nnet",tuneGrid=expand.grid(.size=c(5), .decay=0.1),trControl=trainControl(method="none",seeds = 123),MaxNWts=100,maxit=100)
-confusionMatrix(train_set$label,predict(model,data=train_set))
-test_set$test_pred=predict(model,newdata = test_set[,-15])
-confusionMatrix(test_set$label, test_set$test_pred)
-aggregate()
-
-
-####kmeans
-install.packages("ClusterR")
-install.packages("cluster")
-library(ClusterR)
-library(cluster)
-data_na_removed
-dendogram=hclust(dist(data,method="euclidean"),method="complete")
-data1=data_na_removed[,-15]
-data1
-kmeans1<- kmeans(data1, centers = 3)
-cm=table(data_na_removed$label, kmeans1$cluster)
-cm
-confusionMatrix(cm)
-
-###cart
-data_na_removed
-data1=data_na_removed[,c(4,5,10,11,15)]
-data1
-training=createDataPartition(data_na_removed$label,p=0.6,list=FALSE)
-train_set=data_na_removed[training,]
-test_set=data_na_removed[-training,]
-model=train(data=train_set,label~.,method="rpart")
-confusionMatrix(predict(model,new_data=test_set),train_set$label)
-
-##doing data cleaning
-
-data_na_removed=data_na_removed%>%mutate(timediff=time_end-time_start)
-data_na_removed$time_end<-NULL
-data_na_removed$time_start<-NULL
-#SELECTING ONLY POSITIVES
-data_na_removed=data_na_removed[data_na_removed$timediff>=1]
-data_na_removed
-
-data_na_removed=data_na_removed[data_na_removed$dest_ip%in% c(786 , 15169 ,202425 , 61337 , 49453 ,45899 , 7713 , 16276 , 49505,
- 57172 , 43350)]
-data_na_removed$dest_ip=as.factor(data_na_removed$dest_ip)
-summary(data_na_removed$dest_ip)
-data_na_removed=data_na_removed[data_na_removed$dest_port %in% c(445,9200,22,5900,5060,53 , 5060 , 23 , 123 , 33522, 33524 ,
- 33518, 33504 , 33520,33524 , 33518, 33504, 33520,
- 33526 , 3389 , 33514, 33512 , 60490 , 60506, 60512 , 60510)]
-data_na_removed$dest_port=as.factor(data_na_removed$dest_port )
-summary(data_na_removed$dest_port)
-data_na_removed=data_na_removed[data_na_removed$src_ip %in% c(786 , 45899 ,202425 , 7552 , 7713 , 49453 , 8048 , 18403 , 16276 , 43350 ,213371 ,
- 4134 , 34665,12389 , 200019 , 57172, 9299 , 12876, 8452 , 3462,
- 25019 , 24961 , 55836 , 45820 , 8151 , 45090,45595 , 9498 , 45903, 47331 , 4812 , 9121 ,
- 6503 , 9484 , 4837 , 8376 , 15895, 9009 , 6057 )]
-
-data_na_removed$src_ip=as.factor(data_na_removed$src_ip)
-summary(data_na_removed$src_ip)
-data_na_removed=data_na_removed[data_na_removed$src_port %in% c(9200 , 33504 , 33524 , 33518 , 33514 , 33522 , 60510,
- 33512, 60516 , 33526 , 60490 , 33520 , 60512 ,60506 , 60514 , 60518 , 60508, 55336 , 55330, 55332 ,
- 55334 , 123, 53278 , 53020 , 32651 , 26042)]
-
-
-data_na_removed$src_port=as.factor(data_na_removed$src_port)
-summary(data_na_removed$src_port)
-data_na_removed
-dmy <- dummyVars(" ~dest_ip+dest_port+src_ip+src_port", data = data_na_removed)
-trsf <- data.frame(predict(dmy, newdata = data_na_removed))
-data_na_removed=cbind(data_na_removed,trsf)
-data_na_removed=data_na_removed[,c(-4,-5,-10,-11)]
-data_na_removed$timediff<-NULL
-data_na_removed
-data_na_removed$avg_ipt=scale(data_na_removed$avg_ipt)
-data_na_removed$bytes_in=scale(data_na_removed$bytes_in)
-data_na_removed$bytes_out=scale(data_na_removed$bytes_out)
-data_na_removed$entropy=scale(data_na_removed$entropy)
-data_na_removed$num_pkts_out=scale(data_na_removed$num_pkts_out)
-data_na_removed$proto=scale(data_na_removed$proto)
-data_na_removed$total_entropy=scale(data_na_removed$total_entropy)
-data_na_removed$duration=scale(data_na_removed$duration)
-summary(data_na_removed)
-
-
-