diff options
| author | Bobby <[email protected]> | 2022-04-29 18:25:57 -0400 |
|---|---|---|
| committer | Bobby <[email protected]> | 2022-04-29 18:25:57 -0400 |
| commit | f041a7988bf2216e485e3f349fdd0182d96a410f (patch) | |
| tree | c13d9a0214900ca98add9b3864c252e4f761df31 | |
| parent | 5dbd34fbc10b190d6c3934db21a2918757c2a132 (diff) | |
| download | Network-Intrusion-Detection-f041a7988bf2216e485e3f349fdd0182d96a410f.tar.xz Network-Intrusion-Detection-f041a7988bf2216e485e3f349fdd0182d96a410f.zip | |
add readme
| -rw-r--r-- | README.md | 36 | ||||
| -rw-r--r-- | project.R | 142 |
2 files changed, 36 insertions, 142 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..cf56ae5 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# Network Intrusion Detection +A Comprehensive Approach To Analysis and Detection of Emerging Threats due to Network Intrusion + +## Required Tools + +Some tools are required to run the project. + +- [RStudio](https://www.rstudio.com/) +- [WGet](https://www.gnu.org/software/wget/) + +## Downloading the Dataset + +To download the dataset, use the [`dataset_downloader.sh`](dataset_downloader.sh) script on UNIX, Linux, or macOS. + +```bash +$ chmod +x dataset_downloader.sh +$ ./dataset_downloader.sh +``` + +To download the dataset, use the [`dataset_downloader.bat`](dataset_downloader.bat) script on Windows. + +## Starting the Project + +To start the project, you need to build the models in RStudio. Run the [models.R](models.R) script in RStudio. + +There are 4 models to build: +- Deep Learning Model +- Distributed Random Forest Model +- Gradient Boosting Machine Model +- Naive Bayes Model + +You can add more models to the project by adding them to the [models.R](models.R) script and importing them in the [app.R](app.R) script. + +In order to run the [R Shiny App](https://shiny.rstudio.com/), you need to build the project in RStudio. Run the [app.R](app.R) script in RStudio. 
+ + diff --git a/project.R b/project.R deleted file mode 100644 index 2320cb3..0000000 --- a/project.R +++ /dev/null @@ -1,142 +0,0 @@ -library(data.table) -library(lattice) -library(caret) -library(nnet) -data <- fread("2020.10.02.csv") -data1<-fread("2020.10.03.csv") -data=rbind(data,data1) -#selecting few rows - - -library(fastDummies) -library(ggplot2) -library(plotly) -library(GGally) -#finding missing values in each column -colSums(is.na(data)) -#each column missing values box plot comes here -#taking non missing rows alone -data_na_removed = na.omit(data) -#checking if missing values are gone -colSums(is.na(data_na_removed)) -#box plot of label column comes here -#checking unique values -unique(data_na_removed$label) - -# 1 - Benign 2 - Malicious 3 - Outlier - -data_na_removed$label = factor(data_na_removed$label, - levels = c("benign", "malicious", "outlier"), - labels = c(1, 2, 3)) -summary(data_na_removed$label) -data_na_removed -#summary(data_na_removed) - -#ggpairs(data_na_removed) - -#options(scipen = 999) - -#ggplot(na.omit(data), aes(x=label, colour = label, fill = label), stat = "count") + geom_bar() + -# ggtitle("Distibution of Labels in Dataset") + -# labs(y = "Number of Cases", x = "Type of Label") - - -#cor.test(data_na_removed$entropy, as.numeric(data_na_removed$label)) - -#data_na_removed$label = as.numeric(data_na_removed$label) - -#data_na_removed = data_na_removed[, -c(12, 13)] - - -#ggplot(data_na_removed, aes(x = entropy)) + geom_bar() + -# facet_wrap(~label) - - -#hist(data_na_removed$entropy, bins = 10) -data_na_removed$label=as.factor(data_na_removed$label) -training=createDataPartition(data_na_removed$label,p=0.6,list=FALSE) -train_set=data_na_removed[training,] -test_set=data_na_removed[-training,] -head(train_set) -model=train(data=train_set,label~.,method="nnet",tuneGrid=expand.grid(.size=c(5), .decay=0.1),trControl=trainControl(method="none",seeds = 123),MaxNWts=100,maxit=100) 
-confusionMatrix(train_set$label,predict(model,data=train_set)) -test_set$test_pred=predict(model,newdata = test_set[,-15]) -confusionMatrix(test_set$label, test_set$test_pred) -aggregate() - - -####kmeans -install.packages("ClusterR") -install.packages("cluster") -library(ClusterR) -library(cluster) -data_na_removed -dendogram=hclust(dist(data,method="euclidean"),method="complete") -data1=data_na_removed[,-15] -data1 -kmeans1<- kmeans(data1, centers = 3) -cm=table(data_na_removed$label, kmeans1$cluster) -cm -confusionMatrix(cm) - -###cart -data_na_removed -data1=data_na_removed[,c(4,5,10,11,15)] -data1 -training=createDataPartition(data_na_removed$label,p=0.6,list=FALSE) -train_set=data_na_removed[training,] -test_set=data_na_removed[-training,] -model=train(data=train_set,label~.,method="rpart") -confusionMatrix(predict(model,new_data=test_set),train_set$label) - -##doing data cleaning - -data_na_removed=data_na_removed%>%mutate(timediff=time_end-time_start) -data_na_removed$time_end<-NULL -data_na_removed$time_start<-NULL -#SELECTING ONLY POSITIVES -data_na_removed=data_na_removed[data_na_removed$timediff>=1] -data_na_removed - -data_na_removed=data_na_removed[data_na_removed$dest_ip%in% c(786 , 15169 ,202425 , 61337 , 49453 ,45899 , 7713 , 16276 , 49505, - 57172 , 43350)] -data_na_removed$dest_ip=as.factor(data_na_removed$dest_ip) -summary(data_na_removed$dest_ip) -data_na_removed=data_na_removed[data_na_removed$dest_port %in% c(445,9200,22,5900,5060,53 , 5060 , 23 , 123 , 33522, 33524 , - 33518, 33504 , 33520,33524 , 33518, 33504, 33520, - 33526 , 3389 , 33514, 33512 , 60490 , 60506, 60512 , 60510)] -data_na_removed$dest_port=as.factor(data_na_removed$dest_port ) -summary(data_na_removed$dest_port) -data_na_removed=data_na_removed[data_na_removed$src_ip %in% c(786 , 45899 ,202425 , 7552 , 7713 , 49453 , 8048 , 18403 , 16276 , 43350 ,213371 , - 4134 , 34665,12389 , 200019 , 57172, 9299 , 12876, 8452 , 3462, - 25019 , 24961 , 55836 , 45820 , 8151 , 45090,45595 , 
9498 , 45903, 47331 , 4812 , 9121 , - 6503 , 9484 , 4837 , 8376 , 15895, 9009 , 6057 )] - -data_na_removed$src_ip=as.factor(data_na_removed$src_ip) -summary(data_na_removed$src_ip) -data_na_removed=data_na_removed[data_na_removed$src_port %in% c(9200 , 33504 , 33524 , 33518 , 33514 , 33522 , 60510, - 33512, 60516 , 33526 , 60490 , 33520 , 60512 ,60506 , 60514 , 60518 , 60508, 55336 , 55330, 55332 , - 55334 , 123, 53278 , 53020 , 32651 , 26042)] - - -data_na_removed$src_port=as.factor(data_na_removed$src_port) -summary(data_na_removed$src_port) -data_na_removed -dmy <- dummyVars(" ~dest_ip+dest_port+src_ip+src_port", data = data_na_removed) -trsf <- data.frame(predict(dmy, newdata = data_na_removed)) -data_na_removed=cbind(data_na_removed,trsf) -data_na_removed=data_na_removed[,c(-4,-5,-10,-11)] -data_na_removed$timediff<-NULL -data_na_removed -data_na_removed$avg_ipt=scale(data_na_removed$avg_ipt) -data_na_removed$bytes_in=scale(data_na_removed$bytes_in) -data_na_removed$bytes_out=scale(data_na_removed$bytes_out) -data_na_removed$entropy=scale(data_na_removed$entropy) -data_na_removed$num_pkts_out=scale(data_na_removed$num_pkts_out) -data_na_removed$proto=scale(data_na_removed$proto) -data_na_removed$total_entropy=scale(data_na_removed$total_entropy) -data_na_removed$duration=scale(data_na_removed$duration) -summary(data_na_removed) - - - |
