aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBobby <[email protected]>2022-04-27 20:57:47 -0400
committerBobby <[email protected]>2022-04-27 20:57:47 -0400
commit4820317683754e8e1faf9dbf456c20b7f27f637a (patch)
tree3a93ba15e0b2562338829f39ff4c3481ad88d44d
downloadNetwork-Intrusion-Detection-dataset.tar.xz
Network-Intrusion-Detection-dataset.zip
basic R shiny app with data visualizationdataset
-rw-r--r--.gitignore41
-rw-r--r--app.R233
-rw-r--r--project.R142
3 files changed, 416 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9fb7372
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,41 @@
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+.RData*
+
+# User-specific files
+.Ruserdata
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# R Environment Variables
+.Renviron
+*.csv
diff --git a/app.R b/app.R
new file mode 100644
index 0000000..ceff262
--- /dev/null
+++ b/app.R
@@ -0,0 +1,233 @@
+#
+# This is a Shiny web application. You can run the application by clicking
+# the 'Run App' button above.
+#
+# Find out more about building applications with Shiny here:
+#
+# http://shiny.rstudio.com/
+#
+library(shinythemes)
+library(shiny)
+library(DT)
+library(data.table)
+library(ggplot2)
+library(shinycssloaders)
+
+
+# Data loaded once when the app starts; rows with missing values are dropped
+data <- fread("2020.10.01.csv")
+data_na_removed <- na.omit(data)
+
+# Numeric copy of the label column for plotting:
+# 1 - benign, 2 - malicious, 3 - outlier (any other label becomes NA)
+data_encoded <- data_na_removed
+data_encoded$label <- as.numeric(factor(data_na_removed$label,
+  levels = c("benign", "malicious", "outlier")))
+
+# Default plotting symbol
+pch <- 16
+# Display names: features[i] labels feature_variables[i] (assumed CSV order)
+features <- c("Average Input", "Incoming Bytes", "Outgoing Bytes",
+  "Destination IP", "Destination Port", "Entropy",
+  "Outbound Packets", "Inbound Packets", "Protocol",
+  "Source IP", "Source Port", "End Time (s)",
+  "Start Time (s)", "Total Entropy", "Type", "Duration")
+feature_variables <- c("avg_ipt", "bytes_in", "bytes_out", "dest_ip",
+  "dest_port", "entropy", "num_pkts_out", "num_pkts_in",
+  "proto", "src_ip", "src_port", "time_end", "time_start",
+  "total_entropy", "label", "duration")
+
+# Accent colour "#e95420" with alpha value `a` (default 1)
+get_color <- function(a = 1) {
+  alpha(colour = "#e95420", alpha = a)
+}
+
+# Define UI: navbar with an overview tab, a plotting tab and a models stub
+ui <- fluidPage(
+  theme = shinytheme("united"),
+  # Application title
+  titlePanel("A Comprehensive Approach To Analysis and Detection of Emerging
+ Threats due to Network Intrusion"),
+
+  navbarPage(
+    "Network Intrusion Detection Demo",
+    tabPanel(  # overview tab, titled with the home icon
+      icon("home"),
+      p("Through this application, it is intended to develop a demo of a",
+        strong("Network Intrusion Detection System"),
+        "using different Machine Learning Techniques using the
+ LUFlow Network Intrusion Detection Data Set. This page is intended
+ to display the information about the dataset.",
+        style = "text-align:justify;color:black;
+ background-color:lavender;padding:15px;border-radius:10px"),
+      br(),
+      p("The data used in this application are publicly available on the",
+        em("LUFlow Network Intrusion Detection Data Set"), "Kaggle page.
+ The Data Set contains telemetry captured using Cisco’s Joy tool.
+ This tool records multiple measurements associated with flows.
+ Features are engineered from these measurements, which are also
+ outlined below.", style = "text-align:justify;color:black;
+ background-color:papayawhip;padding:15px;border-radius:10px"),
+      hr(),
+      tags$style(".fa-database {color:#e95420}"),
+      h3(p(icon("database", lib = "font-awesome"),
+           em("Dataset Exploration "),
+           style = "color:black;text-align:center")),
+      fluidRow(column(DT::dataTableOutput("renderData"),
+                      width = 12)),
+      hr(),
+      p(em("Developed by"), br("Kumar Priyansh, Ritu Dimri,
+ Sandeep Perumalla, Hemanth Katikala"),
+        style = "text-align:center; font-family: times")
+    ),
+    tabPanel(
+      "Data Visualization",
+      p("This part allows you to visualize features via different types of
+ plots. You can select whatever features you want to plot and hit
+ the \"Plot Graph\" button. Please keep in mind that all plots",
+        strong("might not be useful"),
+        "and you need to select which plots you want to visualize. If you
+ want to save an image of the currently visualized plot, please
+ right click on the plot and click on the relevant",
+        strong("save image"),
+        "option.",
+        style = "text-align:justify;color:black;
+ background-color:lavender;padding:15px;border-radius:10px"),
+      sidebarLayout(
+        sidebarPanel(
+          selectInput(
+            "plotType",
+            p("Type of Plot:"),
+            choices = c(Histogram = "hist",
+                        "Scatter Plot" = "scatter",
+                        "Mosaic Plot" = "mosaic")
+          ),
+          # Only show this panel if the plot type is a histogram
+          conditionalPanel(
+            condition = "input.plotType == 'hist'",
+            selectInput(
+              "plotVariable",
+              p("Feature to Visualize:"),
+              choices = features
+            ),
+            selectInput(
+              "plotVariant",
+              p("Plot Variant:"),
+              choices = c("Normal", "Log 10 Scale")
+            )
+          ),
+
+          # Only show this panel if the plot type is a scatter plot
+          conditionalPanel(
+            condition = "input.plotType == 'scatter'",
+            selectInput(
+              "plotVariable1",
+              p("First Feature to Visualize:"),
+              choices = features
+            ),
+            uiOutput("secondSelection")  # second selector, rendered by the server
+          ),
+
+          # Single mosaic plot for now
+          conditionalPanel(
+            condition = "input.plotType == 'mosaic'",
+            selectInput(
+              "mosaicVariable",
+              p("Select Features to Visualize:"),
+              choices = c("Labels vs Protocols" = "labproto")
+            )
+          ),
+          actionButton("plot", "Plot Graph",
+                       width = "100%", icon = icon("chart-line"),
+                       style = "color: #fff; background-color: #e95420;
+ outline: none")
+        ),
+        mainPanel(
+          withSpinner(
+            plotOutput("selectedFeatureVariableForVisualization"),
+            type = 6, color = "#e95420"
+          )
+        )
+      )
+    ),
+    tabPanel(  # placeholder tab: no content yet
+      "Compare Models"
+    )
+  )
+)
+
+# Server logic: fills the dataset table, the dependent second-feature
+# selector and the plot requested on the Data Visualization tab
+server <- function(input, output) {
+  # Table of the NA-free data, shown with the display names as headers
+  output$renderData <- DT::renderDataTable(
+    DT::datatable(
+      data_na_removed,
+      options = list(
+        initComplete = JS(
+          "function(settings, json) {",
+          "$(this.api().table().header()).css({'background-color':
+ 'moccasin', 'color': '#1c1b1b'});",
+          "}"),
+        columnDefs = list(list(className = "dt-center", targets = "_all"))),
+      style = "bootstrap",
+      class = "cell-border stripe",
+      rownames = FALSE,
+      colnames = features)
+  )
+
+  # Offer every feature except the one already chosen as the first axis
+  output$secondSelection <- renderUI({
+    selectInput(
+      "plotVariable2",
+      p("Second Feature to Visualize:"),
+      choices = setdiff(features, input$plotVariable1)
+    )
+  })
+
+  # Redraw only when "Plot Graph" is pressed: input$plot is the sole reactive
+  # dependency, every other input is read inside isolate()
+  output$selectedFeatureVariableForVisualization <- renderPlot({
+    input$plot
+    isolate({
+      plotType <- input$plotType
+      if (plotType == "hist") {
+        selectedFeature <- input$plotVariable
+        column <- feature_variables[match(selectedFeature, features)]
+        values <- data_encoded[[column]]
+        if (input$plotVariant == "Normal") {
+          hist(values,
+               main = paste("Histogram Plot of", selectedFeature),
+               ylab = "Frequency", xlab = selectedFeature,
+               col = get_color(), pch = pch)
+        } else {
+          # Base-10 logarithm, as the title and the menu entry state; values
+          # that are not strictly positive are dropped before taking the log
+          hist(log10(values[values > 0]),
+               main = paste("Log 10 Base Histogram Plot of", selectedFeature),
+               ylab = "Frequency", xlab = selectedFeature,
+               col = get_color(), pch = pch)
+        }
+      } else if (plotType == "scatter") {
+        # The second selector is rendered by the server and may not exist yet;
+        # req() stops quietly in that case instead of try() hiding every error
+        req(input$plotVariable1, input$plotVariable2)
+        firstFeature <- feature_variables[match(input$plotVariable1, features)]
+        secondFeature <- feature_variables[match(input$plotVariable2, features)]
+        plot(data_encoded[[firstFeature]], data_encoded[[secondFeature]],
+             main = paste("Scatter Plot of", input$plotVariable1,
+                          "vs", input$plotVariable2),
+             ylab = input$plotVariable2, xlab = input$plotVariable1,
+             col = get_color(0.02), pch = 16)
+      } else if (input$mosaicVariable == "labproto") {
+        mosaicplot(~ factor(proto) +
+                     factor(label, labels = c("benign", "malicious", "outlier")),
+                   data = data_encoded, xlab = "Protocol", ylab = "Category",
+                   main = "Mosaic plot of Protocol vs Category", shade = TRUE)
+      }
+    })
+  })
+}
+
+# Build the Shiny app object from the UI definition and the server function
+shinyApp(ui, server)
diff --git a/project.R b/project.R
new file mode 100644
index 0000000..2320cb3
--- /dev/null
+++ b/project.R
@@ -0,0 +1,142 @@
+library(data.table)
+library(lattice)
+library(caret)
+library(nnet)
+# Read the two daily capture files and stack them row-wise into `data`
+data <- fread("2020.10.02.csv")
+data1 <- fread("2020.10.03.csv")
+data <- rbind(data, data1)
+
+
+library(fastDummies)
+library(ggplot2)
+library(plotly)
+library(GGally)
+# Count the missing values in every column
+colSums(is.na(data))
+# TODO: box plot of the per-column missing values
+# Keep only the rows without any missing value
+data_na_removed <- na.omit(data)
+# Confirm that no missing values remain
+colSums(is.na(data_na_removed))
+# TODO: box plot of the label column
+# Inspect the distinct label values
+unique(data_na_removed$label)
+
+# Recode the label as a factor: 1 - benign, 2 - malicious, 3 - outlier
+
+data_na_removed$label <- factor(
+  data_na_removed$label,
+  levels = c("benign", "malicious", "outlier"), labels = 1:3)
+summary(data_na_removed$label)
+data_na_removed
+#summary(data_na_removed)
+
+#ggpairs(data_na_removed)
+
+#options(scipen = 999)
+
+#ggplot(na.omit(data), aes(x=label, colour = label, fill = label), stat = "count") + geom_bar() +
+# ggtitle("Distibution of Labels in Dataset") +
+# labs(y = "Number of Cases", x = "Type of Label")
+
+
+#cor.test(data_na_removed$entropy, as.numeric(data_na_removed$label))
+
+#data_na_removed$label = as.numeric(data_na_removed$label)
+
+#data_na_removed = data_na_removed[, -c(12, 13)]
+
+
+#ggplot(data_na_removed, aes(x = entropy)) + geom_bar() +
+# facet_wrap(~label)
+
+
+#hist(data_na_removed$entropy, bins = 10)
+# Neural network (nnet, 5 hidden units) on a 60/40 split; seed fixed for repeatability
+set.seed(123)
+training <- createDataPartition(data_na_removed$label, p = 0.6, list = FALSE)[, 1]
+train_set <- data_na_removed[training, ]  # plain index vector, not a one-column matrix
+test_set <- data_na_removed[-training, ]
+head(train_set)
+model <- train(label ~ ., data = train_set, method = "nnet", tuneGrid = expand.grid(.size = c(5), .decay = 0.1), trControl = trainControl(method = "none", seeds = 123), MaxNWts = 100, maxit = 100)
+confusionMatrix(predict(model, newdata = train_set), train_set$label)  # predictions first, truth second
+test_set$test_pred <- predict(model, newdata = test_set[, -15])
+confusionMatrix(test_set$test_pred, test_set$label)
+
+
+#### kmeans on the non-label columns (packages are installed only when missing)
+if (!requireNamespace("ClusterR", quietly = TRUE)) install.packages("ClusterR")
+if (!requireNamespace("cluster", quietly = TRUE)) install.packages("cluster")
+library(ClusterR)
+library(cluster)
+data_na_removed
+# hclust(dist(data)) removed: result was never used; dist() ran on the raw table
+data1 <- data_na_removed[, -15]  # drops column 15, assumed to be the label
+data1
+kmeans1 <- kmeans(data1, centers = 3)
+cm <- table(data_na_removed$label, kmeans1$cluster)
+cm
+confusionMatrix(cm)  # NOTE(review): cluster ids are arbitrary, not matched to labels
+
+### cart (rpart): fit on the 60% training rows, evaluate on the held-out rows
+data_na_removed
+data1 <- data_na_removed[, c(4, 5, 10, 11, 15)]  # NOTE(review): not used by the model below
+data1
+training <- createDataPartition(data_na_removed$label, p = 0.6, list = FALSE)[, 1]
+train_set <- data_na_removed[training, ]
+test_set <- data_na_removed[-training, ]
+model <- train(label ~ ., data = train_set, method = "rpart")
+confusionMatrix(predict(model, newdata = test_set), test_set$label)
+
+## Data cleaning: filter rows, dummy-encode the endpoints, standardise numerics
+
+# timediff via plain assignment, so no dplyr/magrittr pipe has to be attached
+data_na_removed$timediff <- data_na_removed$time_end - data_na_removed$time_start
+data_na_removed$time_end <- NULL
+data_na_removed$time_start <- NULL
+# Keep only the rows with timediff >= 1
+data_na_removed <- data_na_removed[data_na_removed$timediff >= 1]
+data_na_removed
+
+# Keep only the rows whose addresses/ports are in the listed values, then turn
+# each column into a factor. NOTE(review): how the lists were chosen is not shown
+data_na_removed <- data_na_removed[data_na_removed$dest_ip %in% c(
+  786, 15169, 202425, 61337, 49453, 45899, 7713, 16276, 49505, 57172, 43350)]
+data_na_removed$dest_ip <- as.factor(data_na_removed$dest_ip)
+summary(data_na_removed$dest_ip)
+data_na_removed <- data_na_removed[data_na_removed$dest_port %in% c(
+  445, 9200, 22, 5900, 5060, 53, 23, 123, 33522, 33524, 33518, 33504, 33520,
+  33526, 3389, 33514, 33512, 60490, 60506, 60512, 60510)]
+data_na_removed$dest_port <- as.factor(data_na_removed$dest_port)
+summary(data_na_removed$dest_port)
+data_na_removed <- data_na_removed[data_na_removed$src_ip %in% c(
+  786, 45899, 202425, 7552, 7713, 49453, 8048, 18403, 16276, 43350, 213371, 4134, 34665,
+  12389, 200019, 57172, 9299, 12876, 8452, 3462, 25019, 24961, 55836, 45820, 8151, 45090,
+  45595, 9498, 45903, 47331, 4812, 9121, 6503, 9484, 4837, 8376, 15895, 9009, 6057)]
+data_na_removed$src_ip <- as.factor(data_na_removed$src_ip)
+summary(data_na_removed$src_ip)
+data_na_removed <- data_na_removed[data_na_removed$src_port %in% c(
+  9200, 33504, 33524, 33518, 33514, 33522, 60510, 33512, 60516, 33526, 60490, 33520, 60512,
+  60506, 60514, 60518, 60508, 55336, 55330, 55332, 55334, 123, 53278, 53020, 32651, 26042)]
+data_na_removed$src_port <- as.factor(data_na_removed$src_port)
+summary(data_na_removed$src_port)
+data_na_removed
+
+# Dummy-encode the four endpoint factors and drop the original columns (4, 5, 10, 11)
+dmy <- dummyVars(" ~dest_ip+dest_port+src_ip+src_port", data = data_na_removed)
+trsf <- data.frame(predict(dmy, newdata = data_na_removed))
+data_na_removed <- cbind(data_na_removed, trsf)
+data_na_removed <- data_na_removed[, c(-4, -5, -10, -11)]
+data_na_removed$timediff <- NULL
+data_na_removed
+# Standardise the numeric columns; as.numeric() stores plain vectors because
+# scale() returns a one-column matrix. NOTE(review): num_pkts_in is not scaled
+for (scaled_col in c("avg_ipt", "bytes_in", "bytes_out", "entropy",
+                     "num_pkts_out", "proto", "total_entropy", "duration")) {
+  data_na_removed[[scaled_col]] <- as.numeric(scale(data_na_removed[[scaled_col]]))
+}
+summary(data_na_removed)
+
+
+