2 files changed, 36 insertions, 142 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..cf56ae5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,36 @@
+# Network Intrusion Detection
+A Comprehensive Approach To Analysis and Detection of Emerging Threats due to Network Intrusion
+
+## Required Tools
+
+Some tools are required to run the project.
+
+- [RStudio](https://www.rstudio.com/)
+- [WGet](https://www.gnu.org/software/wget/)
+
+## Downloading the Dataset
+
+To download the dataset, use the [`dataset_downloader.sh`](dataset_downloader.sh) script on UNIX, Linux, or MacOS.
+
+```bash
+$ chmod +x dataset_downloader.sh
+$ ./dataset_downloader.sh
+```
+
+To download the dataset, use the [`dataset_downloader.bat`](dataset_downloader.bat) script on Windows.
+
+## Starting the Project
+
+To start the project, you need to build the models in RStudio. Run the [models.R](models.R) script in RStudio.
+
+There are 4 models to build:
+- Deep Learning Model
+- Distributed Random Forest Model
+- Gradient Boosting Machine Model
+- Naiive Bayes Model
+
+You can add more models to the project by adding them to the [models.R](models.R) script and importing them in the [app.R](app.R) script.
+
+In order to run the [R Shiny App](https://shiny.rstudio.com/), you need to build the project in RStudio. Run the [app.R](app.R) script in RStudio.
+
+
diff --git a/project.R b/project.R
deleted file mode 100644
index 2320cb3..0000000
--- a/project.R
+++ /dev/null
@@ -1,142 +0,0 @@
-library(data.table)
-library(lattice)
-library(caret)
-library(nnet)
-data <- fread("2020.10.02.csv")
-data1<-fread("2020.10.03.csv")
-data=rbind(data,data1)
-#selecting few rows
-
-
-library(fastDummies)
-library(ggplot2)
-library(plotly)
-library(GGally)
-#finding missing values in each column
-colSums(is.na(data))
-#each column missing values box plot comes here
-#taking non missing rows alone
-data_na_removed = na.omit(data)
-#checking if missing values are gone
-colSums(is.na(data_na_removed))
-#box plot of label column comes here
-#checking unique values
-unique(data_na_removed$label)
-
-# 1 - Benign 2 - Malicious 3 - Outlier
-
-data_na_removed$label = factor(data_na_removed$label,
-                               levels = c("benign", "malicious", "outlier"),
-                               labels = c(1, 2, 3))
-summary(data_na_removed$label)
-data_na_removed
-#summary(data_na_removed)
-
-#ggpairs(data_na_removed)
-
-#options(scipen = 999)
-
-#ggplot(na.omit(data), aes(x=label, colour = label, fill = label), stat = "count") + geom_bar() +
-#  ggtitle("Distibution of Labels in Dataset") + 
-#  labs(y = "Number of Cases", x = "Type of Label")
-
-
-#cor.test(data_na_removed$entropy, as.numeric(data_na_removed$label))
-
-#data_na_removed$label = as.numeric(data_na_removed$label)
-
-#data_na_removed = data_na_removed[, -c(12, 13)]
-
-
-#ggplot(data_na_removed, aes(x = entropy)) + geom_bar() + 
-#  facet_wrap(~label)
-
-
-#hist(data_na_removed$entropy, bins = 10)
-data_na_removed$label=as.factor(data_na_removed$label)
-training=createDataPartition(data_na_removed$label,p=0.6,list=FALSE)
-train_set=data_na_removed[training,]
-test_set=data_na_removed[-training,]
-head(train_set)
-model=train(data=train_set,label~.,method="nnet",tuneGrid=expand.grid(.size=c(5), .decay=0.1),trControl=trainControl(method="none",seeds = 123),MaxNWts=100,maxit=100)
-confusionMatrix(train_set$label,predict(model,data=train_set))
-test_set$test_pred=predict(model,newdata = test_set[,-15])
-confusionMatrix(test_set$label, test_set$test_pred)
-aggregate()
-
-
-####kmeans
-install.packages("ClusterR")
-install.packages("cluster")
-library(ClusterR)
-library(cluster)
-data_na_removed
-dendogram=hclust(dist(data,method="euclidean"),method="complete")
-data1=data_na_removed[,-15]
-data1
-kmeans1<- kmeans(data1, centers = 3)
-cm=table(data_na_removed$label, kmeans1$cluster)
-cm
-confusionMatrix(cm)
-
-###cart
-data_na_removed
-data1=data_na_removed[,c(4,5,10,11,15)]
-data1
-training=createDataPartition(data_na_removed$label,p=0.6,list=FALSE)
-train_set=data_na_removed[training,]
-test_set=data_na_removed[-training,]
-model=train(data=train_set,label~.,method="rpart")
-confusionMatrix(predict(model,new_data=test_set),train_set$label)
-
-##doing data cleaning
-
-data_na_removed=data_na_removed%>%mutate(timediff=time_end-time_start)
-data_na_removed$time_end<-NULL
-data_na_removed$time_start<-NULL
-#SELECTING ONLY POSITIVES
-data_na_removed=data_na_removed[data_na_removed$timediff>=1]
-data_na_removed
-
-data_na_removed=data_na_removed[data_na_removed$dest_ip%in% c(786  , 15169  ,202425  , 61337  , 49453   ,45899  ,  7713  , 16276 ,  49505,
-                                                              57172  , 43350)]
-data_na_removed$dest_ip=as.factor(data_na_removed$dest_ip)
-summary(data_na_removed$dest_ip)
-data_na_removed=data_na_removed[data_na_removed$dest_port %in% c(445,9200,22,5900,5060,53  ,  5060 ,     23 ,    123 ,  33522,   33524 ,
-                                                                 33518,   33504 ,  33520,33524 ,  33518,   33504,   33520,
-                                                                 33526  ,  3389 ,  33514,   33512 ,  60490 ,  60506,   60512 ,  60510)]
-data_na_removed$dest_port=as.factor(data_na_removed$dest_port )
-summary(data_na_removed$dest_port)
-data_na_removed=data_na_removed[data_na_removed$src_ip %in% c(786  , 45899  ,202425  ,  7552 ,   7713 ,  49453   , 8048  , 18403 ,  16276 ,  43350  ,213371 ,
-                                                              4134 ,  34665,12389 , 200019 ,  57172,    9299 ,  12876,    8452  ,  3462,
-                                                              25019 ,  24961 ,  55836 ,  45820 ,   8151  , 45090,45595 ,   9498  , 45903,   47331  ,  4812 ,   9121 ,
-                                                              6503 ,   9484  ,  4837 ,   8376  , 15895,    9009  ,  6057 )]
-
-data_na_removed$src_ip=as.factor(data_na_removed$src_ip)
-summary(data_na_removed$src_ip)
-data_na_removed=data_na_removed[data_na_removed$src_port %in% c(9200 ,  33504  , 33524 ,  33518 ,  33514 ,  33522 ,  60510,
-                                                                33512,   60516 ,  33526 ,  60490  , 33520  , 60512 ,60506 ,  60514 ,  60518 ,  60508,   55336  , 55330,   55332 ,
-                                                                55334 ,    123,   53278  , 53020 ,  32651 ,  26042)]
-
-
-data_na_removed$src_port=as.factor(data_na_removed$src_port)
-summary(data_na_removed$src_port)
-data_na_removed
-dmy <- dummyVars(" ~dest_ip+dest_port+src_ip+src_port", data = data_na_removed)
-trsf <- data.frame(predict(dmy, newdata = data_na_removed))
-data_na_removed=cbind(data_na_removed,trsf)
-data_na_removed=data_na_removed[,c(-4,-5,-10,-11)]
-data_na_removed$timediff<-NULL
-data_na_removed
-data_na_removed$avg_ipt=scale(data_na_removed$avg_ipt)
-data_na_removed$bytes_in=scale(data_na_removed$bytes_in)
-data_na_removed$bytes_out=scale(data_na_removed$bytes_out)
-data_na_removed$entropy=scale(data_na_removed$entropy)
-data_na_removed$num_pkts_out=scale(data_na_removed$num_pkts_out)
-data_na_removed$proto=scale(data_na_removed$proto)
-data_na_removed$total_entropy=scale(data_na_removed$total_entropy)
-data_na_removed$duration=scale(data_na_removed$duration)
-summary(data_na_removed)
-
-
-