"""Preprocessing of the daily traffic CSV files (script form of
``Preprocessing/All_the_datasets.ipynb``).

Pipeline:
1. load every daily CSV into a dictionary of DataFrames,
2. print the label distribution of each file,
3. clean each DataFrame (see ``preprocess_dataframe``),
4. regroup the rows into traffic categories (see ``aggregate_classes``),
5. write one CSV per category.

The helpers are importable without side effects; the pipeline itself only
runs when the module is executed as a script (or pasted into a notebook,
where ``__name__`` is also ``"__main__"``).
"""
import os

import numpy as np
import pandas as pd

# Columns that are allowed to keep negative values during cleaning.
# 'Label' is listed because it is not numeric and must not be compared to 0.
NEGATIVE_CHECK_EXEMPT = ('Init Bwd Win Byts', 'Init Fwd Win Byts', 'Label')

# Traffic category -> labels (as spelled in the CSV files) that belong to it.
classes = {
    "BruteForce": ["FTP-BruteForce", "SSH-Bruteforce", "Brute Force -Web", "Brute Force -XSS"],
    "DoS": ["DoS attacks-GoldenEye", "DoS attacks-Slowloris", "DoS attacks-Hulk", "DoS attacks-SlowHTTPTest", "DDoS attacks-LOIC-HTTP", "DDOS attack-HOIC", "DDOS attack-LOIC-UDP"],
    "Infiltration": ["Infilteration"],
    "Bot": ["Bot"],
    "Benign": ["Benign"]
}


def load_datasets(file_names, directory='~/Datasets'):
    """Read every CSV named in *file_names* from *directory*.

    Parameters:
    - file_names: iterable of CSV file names.
    - directory: folder that contains the files.

    Returns:
    - A dictionary mapping each file name to its DataFrame.
    """
    return {name: pd.read_csv(f'{directory}/{name}') for name in file_names}


def print_label_counts(dfs):
    """
    Print how many rows each category of the 'Label' column has, per file.

    Parameters:
    - dfs: Dictionary of DataFrames.
    """
    for key, df in dfs.items():
        count = df['Label'].value_counts()
        print(f"Value counts for dataframe '{key}':\n{count}\n")


def preprocess_dataframe(df):
    """Clean *df* in place and return it.

    Steps:
    - convert 'Timestamp' (day/month/year hour:minute:second) to Unix seconds;
      values that cannot be parsed become NaN,
    - convert every non-numeric column except 'Label' to numeric; values that
      cannot be parsed become NaN,
    - turn inf / -inf into NaN,
    - turn negative values into NaN, except in 'Init Bwd Win Byts' and
      'Init Fwd Win Byts',
    - drop every row that contains a NaN,
    - drop duplicated rows.

    Parameters:
    - df: DataFrame with at least the 'Timestamp' and 'Label' columns.

    Returns:
    - The same DataFrame object, cleaned.

    Raises:
    - KeyError: if the 'Timestamp' column is missing.
    """
    parsed = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
    df['Timestamp'] = (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    for col in df.columns:
        if col != 'Label' and not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Done after the numeric conversion so that it covers both infinities that
    # were already present and any produced by the conversion.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    for col in df.columns:
        if col in NEGATIVE_CHECK_EXEMPT:
            continue
        negative = df[col] < 0
        if negative.any():
            # Series.mask upcasts integer columns explicitly; assigning NaN
            # through .loc relies on silent upcasting, which recent pandas
            # versions deprecate.
            df[col] = df[col].mask(negative)

    df.dropna(inplace=True)
    # Duplicates are removed last: a column holding mixed str/int values
    # (e.g. 80 and '80') only compares equal once it has been converted.
    df.drop_duplicates(inplace=True)
    return df


def aggregate_classes(dfs, classes):
    """
    Aggregates traffic data into separate DataFrames based on specified labels.

    Rows are ordered label by label (in the order given in *classes*) and,
    within a label, file by file (in the order of *dfs*). The result index is
    renumbered from 0.

    Parameters:
    - dfs: Dictionary of DataFrames loaded from CSV files.
    - classes: Dictionary with keys being traffic category and values being lists of labels associated with that category.

    Returns:
    - A dictionary of aggregated DataFrames for each category. A category
      gets an empty DataFrame when there is nothing to concatenate.
    """
    aggregated_data = {}

    for category, labels in classes.items():
        # Collect every slice first and concatenate once: concatenating inside
        # the loop copies the accumulated rows again on every iteration.
        pieces = [
            df[df["Label"] == label]
            for label in labels
            for df in dfs.values()
        ]
        if pieces:
            aggregated_data[category] = pd.concat(pieces, axis=0, ignore_index=True)
        else:
            aggregated_data[category] = pd.DataFrame()

    return aggregated_data


def save_categories(frames, save_directory):
    """Write each DataFrame of *frames* to ``<save_directory>/<name>.csv``.

    Parameters:
    - frames: Dictionary mapping a file stem to the DataFrame to write.
    - save_directory: Target folder; created when it does not exist yet.
    """
    os.makedirs(save_directory, exist_ok=True)
    for name, frame in frames.items():
        frame.to_csv(f'{save_directory}/{name}.csv', index=False)


if __name__ == "__main__":
    # Display an unlimited number of rows and columns.
    pd.set_option('display.max_rows', None, 'display.max_columns', None)

    file_names = ['02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv', '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv',
                  '02-28-2018.csv', '03-01-2018.csv', '03-02-2018.csv']

    # Loading all datasets in a dictionary keyed by file name.
    dfs = load_datasets(file_names)

    # Dropping ["Flow ID", "Src IP", "Src Port", "Dst IP"] columns from the 02-20-2018 file.
    dfs['02-20-2018.csv'].drop(['Flow ID', 'Src IP', 'Src Port', 'Dst IP'], axis=1, inplace=True)

    print_label_counts(dfs)

    # Preprocessing: every DataFrame is cleaned in place.
    for key, df in dfs.items():
        print(f"Dataframe: '{key}', shape before preprocessing: {df.shape}")
        preprocess_dataframe(df)
        print(f"shape after preprocessing: {df.shape}\n")

    aggregated_data = aggregate_classes(dfs, classes)

    # Access the aggregated DataFrames for each category.
    bruteforce_attacks = aggregated_data["BruteForce"]
    doS_attacks = aggregated_data["DoS"]
    infiltration_attacks = aggregated_data["Infiltration"]
    bot_attacks = aggregated_data["Bot"]
    benign = aggregated_data["Benign"]

    # Saving each traffic category.
    save_directory = 'Datasets/Preprocessed_Datasets'
    save_categories(
        {
            'bruteforce_attacks': bruteforce_attacks,
            'doS_attacks': doS_attacks,
            'infiltration_attacks': infiltration_attacks,
            'bot_attacks': bot_attacks,
            'benign': benign,
        },
        save_directory,
    )