From 23191bdaaf9156e201b2808006e16a9ce9eb9db6 Mon Sep 17 00:00:00 2001
From: Sayed Saeedi <sayed.saeedi@stud.th-deg.de>
Date: Sat, 2 Mar 2024 02:08:28 +0100
Subject: [PATCH] codes for preprocessing

---
 Preprocessing/All_the_datasets.ipynb | 209 +++++++++++++++++++++++++++
 1 file changed, 209 insertions(+)
 create mode 100644 Preprocessing/All_the_datasets.ipynb

diff --git a/Preprocessing/All_the_datasets.ipynb b/Preprocessing/All_the_datasets.ipynb
new file mode 100644
index 0000000..17455ea
--- /dev/null
+++ b/Preprocessing/All_the_datasets.ipynb
@@ -0,0 +1,209 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e35f5e6-c85e-4d41-a019-ded9fef99bae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c0aae72-18f5-48f2-a4ff-a039da52ff35",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#loading all files from the location\n",
+    "pd.set_option('display.max_rows', None, 'display.max_columns', None) # display an unlimited number of rows and columns\n",
+    "#loading files\n",
+    "file_names = ['02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv', '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv',\n",
+    "             '02-28-2018.csv', '03-01-2018.csv', '03-02-2018.csv']\n",
+    "\n",
+    "#loading all datasets into a dictionary\n",
+    "dfs={}\n",
+    "for file in file_names:\n",
+    "    df=pd.read_csv(f'~/Datasets/{file}')\n",
+    "    dfs[file]=df\n",
+    "    \n",
+    "#Dropping [\"Flow ID\", \"Src IP\", \"Src Port\", \"Dst IP\"] columns from the 02-20-2018 file\n",
+    "dfs['02-20-2018.csv'].drop(['Flow ID', 'Src IP', 'Src Port', 'Dst IP'], axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad3d8a3d-1043-4d2a-82dd-0ddd20f615f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_label_counts(dfs):\n",
+    "    \"\"\"\n",
+    "    Count the occurrences of each category in the 'Label' column of every DataFrame.\n",
+    "    Parameters:\n",
+    "    - dfs: Dictionary of DataFrames.\n",
+    "    \"\"\"\n",
+    "    for key in dfs.keys():\n",
+    "        df = dfs[key]  # Get the dataframe corresponding to the key\n",
+    "        count = df['Label'].value_counts()  # Perform value count on the 'Label' column\n",
+    "        print(f\"Value counts for dataframe '{key}':\\n{count}\\n\")\n",
+    "\n",
+    "print_label_counts(dfs)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6fc6eb2d-8451-4cc4-9ae0-029182366994",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#preprocessing\n",
+    "\"\"\"\n",
+    "-Deleting duplicates\n",
+    "-changing inf and -inf to NaN\n",
+    "-changing time to unix format\n",
+    "-changing data types to numeric except for the 'Label' column\n",
+    "-dropping NaN values\n",
+    "-dropping negative values except ['Init Bwd Win Byts', 'Init Fwd Win Byts']\n",
+    "\"\"\"\n",
+    "for key in dfs.keys():\n",
+    "    df=dfs[key]\n",
+    "    print(f\"Dataframe: '{key}', shape before preprocessing: {df.shape}\")\n",
+    "    df.drop_duplicates(inplace=True) \n",
+    "    df.replace([np.inf, -np.inf], np.nan, inplace=True) \n",
+    "    \n",
+    "    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')\n",
+    "    df['Timestamp'] = (df['Timestamp'] - pd.Timestamp(\"1970-01-01\")) // pd.Timedelta('1s') \n",
+    "    \n",
+    "    for col in df.columns: #changing to numeric if not, then to NaN\n",
+    "        if df[col].dtype == 'object' and col != 'Label':\n",
+    "            df[col] = pd.to_numeric(df[col], errors='coerce')\n",
+    "\n",
+    "\n",
+    "    df.dropna(inplace=True)\n",
+    "\n",
+    "    for col in df.columns:\n",
+    "        if col not in ['Init Bwd Win Byts', 'Init Fwd Win Byts', 'Label']:\n",
+    "            df.loc[df[col] < 0, col] = np.nan\n",
+    "\n",
+    "    df.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
+    "        \n",
+    "    df.dropna(inplace=True)\n",
+    "    print(f\"shape after preprocessing: {df.shape}\\n\") \n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c3d70f5-7052-4e6c-bdce-fe36dff94008",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def aggregate_classes(dfs, classes):\n",
+    "    \"\"\"\n",
+    "    Aggregates traffic data into separate DataFrames based on specified labels.\n",
+    "\n",
+    "    Parameters:\n",
+    "    - dfs: Dictionary of DataFrames loaded from CSV files.\n",
+    "    - classes: Dictionary with keys being traffic category and values being lists of labels associated with that category.\n",
+    "\n",
+    "    Returns:\n",
+    "    - A dictionary of aggregated DataFrames for each category.\n",
+    "    \"\"\"\n",
+    "    aggregated_data = {}\n",
+    "    \n",
+    "    for category, labels in classes.items():\n",
+    "        aggregated_data[category] = pd.DataFrame()\n",
+    "        \n",
+    "        for label in labels:\n",
+    "            # Iterating through all DataFrames to filter and aggregate the different labels\n",
+    "            for key in dfs:\n",
+    "                df = dfs[key]\n",
+    "                filtered_df = df[df[\"Label\"] == label]\n",
+    "                aggregated_data[category] = pd.concat([aggregated_data[category], filtered_df], axis=0, ignore_index=True)\n",
+    "    \n",
+    "    return aggregated_data\n",
+    "\n",
+    "\n",
+    "# the specified labels\n",
+    "classes = {\n",
+    "    \"BruteForce\": [\"FTP-BruteForce\", \"SSH-Bruteforce\", \"Brute Force -Web\", \"Brute Force -XSS\"],\n",
+    "    \"DoS\": [\"DoS attacks-GoldenEye\", \"DoS attacks-Slowloris\", \"DoS attacks-Hulk\", \"DoS attacks-SlowHTTPTest\", \"DDoS attacks-LOIC-HTTP\", \"DDOS attack-HOIC\", \"DDOS attack-LOIC-UDP\"],\n",
+    "    \"Infiltration\": [\"Infilteration\"],\n",
+    "    \"Bot\": [\"Bot\"],\n",
+    "    \"Benign\": [\"Benign\"]\n",
+    "}\n",
+    "\n",
+    "aggregated_data = aggregate_classes(dfs, classes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7062976-df28-43f5-8a86-226c87838123",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Access the aggregated DataFrames for each category\n",
+    "bruteforce_attacks = aggregated_data[\"BruteForce\"]\n",
+    "doS_attacks = aggregated_data[\"DoS\"]\n",
+    "infiltration_attacks = aggregated_data[\"Infiltration\"]\n",
+    "bot_attacks = aggregated_data[\"Bot\"]\n",
+    "benign = aggregated_data[\"Benign\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec01bd77-2f10-42fb-b48f-babe38204108",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Saving each traffic category\n",
+    "save_directory = 'Datasets/Preprocessed_Datasets'\n",
+    "\n",
+    "bruteforce_attacks.to_csv(f'{save_directory}/bruteforce_attacks.csv', index=False)\n",
+    "doS_attacks.to_csv(f'{save_directory}/doS_attacks.csv', index=False)\n",
+    "infiltration_attacks.to_csv(f'{save_directory}/infiltration_attacks.csv', index=False)\n",
+    "bot_attacks.to_csv(f'{save_directory}/bot_attacks.csv', index=False)\n",
+    "benign.to_csv(f'{save_directory}/benign.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8806c851-f3a6-4962-9c4b-0e2e226a6901",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab