# Network intrusion detection: train and compare H2O classifiers ----
#
# Loads the flow data set, encodes the class label, reports highly correlated
# features, then trains four H2O models (deep learning, distributed random
# forest, gradient boosting machine, naive Bayes) and saves each one to disk.

# Libraries ----
library(data.table)
library(caret)
library(h2o)

localH2O <- h2o.init()

# Directory every trained model is written to.
model_dir <- "/Users/lucifer/Documents/projects/NetworkIntrusionDetection/models"

# Load the network intrusion data set ----
dataset <- fread("2020.10.01.csv")
dataset <- na.omit(dataset)
# Drop columns 12 and 13 of the raw file before any further processing.
dataset <- dataset[, -c(12, 13)]

# Encode 'label' as a categorical variable ----
dataset$label <- factor(
  dataset$label,
  levels = c("benign", "malicious", "outlier"),
  labels = c(1, 2, 3)
)

# cor() needs numeric input, so the correlation copy carries the label as its
# integer level code. copy() keeps `dataset` itself untouched.
correlationSet <- copy(dataset)
correlationSet$label <- as.numeric(correlationSet$label)

# Remove redundant features: first find the correlated ones ----
# NOTE(review): cor() fails if any remaining column is non-numeric; confirm
# against the CSV schema.
correlationMatrix <- cor(correlationSet)
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff = 0.5)
print(highlyCorrelated)

# Predictor and response columns ----
predictors <- c(
  "num_pkts_in", "bytes_in", "num_pkts_out",
  "bytes_out", "dest_port", "total_entropy"
)
response <- "label"

# Select the modelling columns by name. The original selected them by position
# and then discarded the selection by uploading the full table instead.
model_cols <- c(predictors, response)
stopifnot(all(model_cols %in% names(dataset)))
print(head(dataset[, model_cols, with = FALSE]))
df <- as.h2o(dataset[, model_cols, with = FALSE])

# Split into train (80%) and test (20%) sets ----
# The seed makes the split, and therefore every metric below, reproducible.
df_splits <- h2o.splitFrame(data = df, ratios = 0.8, seed = 1234)
train <- df_splits[[1]]
test <- df_splits[[2]]

# Deep learning model ----
# Trained on the training split only, using the named predictors, so that the
# evaluation on `test` below is a genuine held-out estimate.
dl <- h2o.deeplearning(
  x = predictors,
  y = response,
  distribution = "multinomial",
  hidden = c(1),
  epochs = 100,
  train_samples_per_iteration = -1,
  reproducible = TRUE,
  activation = "Tanh",
  single_node_mode = FALSE,
  balance_classes = FALSE,
  force_load_balance = FALSE,
  seed = 23123,
  score_training_samples = 0,
  score_validation_samples = 0,
  training_frame = train,
  stopping_rounds = 0
)

# Evaluate the deep learning model on the held-out test set.
perf <- h2o.performance(dl, newdata = test)
print(perf)

# Generate predictions on the test set.
pred <- h2o.predict(dl, newdata = test)
print(summary(dl))

# Save the model.
dl_model <- h2o.saveModel(object = dl, path = model_dir, force = TRUE)
print(dl_model)

# Distributed random forest model ----
# NOTE(review): `calibration_frame` and `binomial_double_trees` target binary
# classification and `calibrate_model` is not enabled, so both are presumably
# ignored for this three-class label - confirm and drop if so.
drf <- h2o.randomForest(
  x = predictors,
  y = response,
  ntrees = 10,
  max_depth = 5,
  min_rows = 10,
  calibration_frame = test,
  binomial_double_trees = TRUE,
  training_frame = train,
  validation_frame = test
)

# Evaluate the random forest on the held-out test set.
print(h2o.performance(drf, newdata = test))
print(summary(drf))

# Save the model.
drf_model <- h2o.saveModel(object = drf, path = model_dir, force = TRUE)

# Gradient boosting machine model ----
# Trained on the full frame with 5-fold cross-validation.
gbm <- h2o.gbm(
  x = predictors,
  y = response,
  nfolds = 5,
  seed = 1111,
  keep_cross_validation_predictions = TRUE,
  training_frame = df
)

# Report cross-validated metrics: the model has seen every row of `df`, so
# training metrics would overstate its quality.
print(h2o.performance(gbm, xval = TRUE))
print(summary(gbm))

# Save the model.
gbm_model <- h2o.saveModel(object = gbm, path = model_dir, force = TRUE)

# Naive Bayes model ----
# Trained on the full frame with 5-fold cross-validation.
nb <- h2o.naiveBayes(
  x = predictors,
  y = response,
  training_frame = df,
  laplace = 0,
  nfolds = 5,
  seed = 1234
)

# Report cross-validated metrics for the same reason as the GBM above.
print(h2o.performance(nb, xval = TRUE))
print(summary(nb))

# Save the model.
nb_model <- h2o.saveModel(object = nb, path = model_dir, force = TRUE)