Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MasterThesis
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Sayed Saeedi
MasterThesis
Commits
23191bda
Commit
23191bda
authored
1 year ago
by
Sayed Saeedi
Browse files
Options
Downloads
Patches
Plain Diff
codes for preprocessing
parent
39404018
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Preprocessing/All_the_datasets.ipynb
+209
-0
209 additions, 0 deletions
Preprocessing/All_the_datasets.ipynb
with
209 additions
and
0 deletions
Preprocessing/All_the_datasets.ipynb
0 → 100644
+
209
−
0
View file @
23191bda
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "6e35f5e6-c85e-4d41-a019-ded9fef99bae",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c0aae72-18f5-48f2-a4ff-a039da52ff35",
"metadata": {},
"outputs": [],
"source": [
"#loading all files from the location\n",
"pd.set_option('display.max_rows', None, 'display.max_columns', None) # display unlimited number of lines\n",
"#loading files\n",
"file_names = ['02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv', '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv',\n",
" '02-28-2018.csv', '03-01-2018.csv', '03-02-2018.csv']\n",
"\n",
"#loading all dataset in a dictionary\n",
"dfs={}\n",
"for file in file_names:\n",
" df=pd.read_csv(f'~/Datasets/{file}')\n",
" dfs[file]=df\n",
" \n",
"#Droping [\"Flow ID\", \"Src IP\", \"Src Port\", \"Dst IP\"] columns from 02-20-2018 file\n",
"dfs['02-20-2018.csv'].drop(['Flow ID', 'Src IP', 'Src Port', 'Dst IP'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad3d8a3d-1043-4d2a-82dd-0ddd20f615f3",
"metadata": {},
"outputs": [],
"source": [
"def print_label_counts(dfs):\n",
" \"\"\"\n",
" counting the different categories in each Label\n",
" Parameters:\n",
" - dfs: Dictionary of DataFrames.\n",
" \"\"\"\n",
" for key in dfs.keys():\n",
" df = dfs[key] # Get the dataframe corresponding to the key\n",
" count = df['Label'].value_counts() # Perform value count on the 'Label' column\n",
" print(f\"Value counts for dataframe '{key}':\\n{count}\\n\")\n",
"\n",
"print_label_counts(dfs)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fc6eb2d-8451-4cc4-9ae0-029182366994",
"metadata": {},
"outputs": [],
"source": [
"#preprocessing\n",
"\"\"\"\n",
"-Deleting duplicates\n",
"-changing inf and -inf to NaN\n",
"-changing time to unix format\n",
"-changing data types to numeric except for the 'Label' column\n",
"-dropping Na\n",
"-dropping negative values except ['Init Bwd Win Byts', 'Init Fwd Win Byts']\n",
"\"\"\"\n",
"for key in dfs.keys():\n",
" df=dfs[key]\n",
" print(f\"Dataframe: '{key}', shape before preprocessing: {df.shape}\")\n",
" df.drop_duplicates(inplace=True) \n",
" df.replace([np.inf, -np.inf], np.nan, inplace=True) \n",
" \n",
" df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')\n",
" df['Timestamp'] = (df['Timestamp'] - pd.Timestamp(\"1970-01-01\")) // pd.Timedelta('1s') \n",
" \n",
" for col in df.columns: #changing to numeric if not, then to NaN\n",
" if df[col].dtype == 'object' and col != 'Label':\n",
" df[col] = pd.to_numeric(df[col], errors='coerce')\n",
"\n",
"\n",
" df.dropna(inplace=True)\n",
"\n",
" for col in df.columns:\n",
" if col not in ['Init Bwd Win Byts', 'Init Fwd Win Byts', 'Label']:\n",
" df.loc[df[col] < 0, col] = np.nan\n",
"\n",
" df.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
" \n",
" df.dropna(inplace=True)\n",
" print(f\"shape after preprocessing: {df.shape}\\n\") \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c3d70f5-7052-4e6c-bdce-fe36dff94008",
"metadata": {},
"outputs": [],
"source": [
"def aggregate_classes(dfs, classes):\n",
" \"\"\"\n",
" Aggregates traffic data into separate DataFrames based on specified labels.\n",
"\n",
" Parameters:\n",
" - dfs: Dictionary of DataFrames loaded from CSV files.\n",
" - classes: Dictionary with keys being traffic category and values being lists of labels associated with that category.\n",
"\n",
" Returns:\n",
" - A dictionary of aggregated DataFrames for each category.\n",
" \"\"\"\n",
" aggregated_data = {}\n",
" \n",
" for category, labels in classes.items():\n",
" aggregated_data[category] = pd.DataFrame()\n",
" \n",
" for label in labels:\n",
" # Iterating through all DataFrames to filter and aggregate the different labels\n",
" for key in dfs:\n",
" df = dfs[key]\n",
" filtered_df = df[df[\"Label\"] == label]\n",
" aggregated_data[category] = pd.concat([aggregated_data[category], filtered_df], axis=0, ignore_index=True)\n",
" \n",
" return aggregated_data\n",
"\n",
"\n",
"# the specified labels\n",
"classes = {\n",
" \"BruteForce\": [\"FTP-BruteForce\", \"SSH-Bruteforce\", \"Brute Force -Web\", \"Brute Force -XSS\"],\n",
" \"DoS\": [\"DoS attacks-GoldenEye\", \"DoS attacks-Slowloris\", \"DoS attacks-Hulk\", \"DoS attacks-SlowHTTPTest\", \"DDoS attacks-LOIC-HTTP\", \"DDOS attack-HOIC\", \"DDOS attack-LOIC-UDP\"],\n",
" \"Infiltration\": [\"Infilteration\"],\n",
" \"Bot\": [\"Bot\"],\n",
" \"Benign\": [\"Benign\"]\n",
"}\n",
"\n",
"aggregated_data = aggregate_classes(dfs, classes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7062976-df28-43f5-8a86-226c87838123",
"metadata": {},
"outputs": [],
"source": [
"# Access the aggregated DataFrames for each category\n",
"bruteforce_attacks = aggregated_data[\"BruteForce\"]\n",
"doS_attacks = aggregated_data[\"DoS\"]\n",
"infiltration_attacks = aggregated_data[\"Infiltration\"]\n",
"bot_attacks = aggregated_data[\"Bot\"]\n",
"benign = aggregated_data[\"Benign\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec01bd77-2f10-42fb-b48f-babe38204108",
"metadata": {},
"outputs": [],
"source": [
"#Saving each traffic category\n",
"save_directory = 'Datasets/Preprocessed_Datasets'\n",
"\n",
"bruteforce_attacks.to_csv(f'{save_directory}/bruteforce_attacks.csv', index=False)\n",
"doS_attacks.to_csv(f'{save_directory}/doS_attacks.csv', index=False)\n",
"infiltration_attacks.to_csv(f'{save_directory}/infiltration_attacks.csv', index=False)\n",
"bot_attacks.to_csv(f'{save_directory}/bot_attacks.csv', index=False)\n",
"benign.to_csv(f'{save_directory}/benign.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8806c851-f3a6-4962-9c4b-0e2e226a6901",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:6e35f5e6-c85e-4d41-a019-ded9fef99bae tags:
```
python
import
pandas
as
pd
import
numpy
as
np
import
os
```
%% Cell type:code id:5c0aae72-18f5-48f2-a4ff-a039da52ff35 tags:
```
python
# Load every CSE-CIC-IDS2018 daily capture from the Datasets directory.
pd.set_option('display.max_rows', None, 'display.max_columns', None)  # show full frames when printed

file_names = [
    '02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv',
    '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv', '02-28-2018.csv',
    '03-01-2018.csv', '03-02-2018.csv',
]

# Read each CSV into a dictionary keyed by its file name.
dfs = {}
for name in file_names:
    dfs[name] = pd.read_csv(f'~/Datasets/{name}')

# The 02-20-2018 capture carries four extra flow-identifier columns that the
# other files do not have; drop them so every frame shares one schema.
dfs['02-20-2018.csv'].drop(['Flow ID', 'Src IP', 'Src Port', 'Dst IP'], axis=1, inplace=True)
```
%% Cell type:code id:ad3d8a3d-1043-4d2a-82dd-0ddd20f615f3 tags:
```
python
def print_label_counts(dfs):
    """Print the value counts of the 'Label' column for every DataFrame.

    Parameters:
    - dfs: Dictionary mapping file names to DataFrames.
    """
    for name, frame in dfs.items():
        counts = frame['Label'].value_counts()
        print(f"Value counts for dataframe '{name}':\n{counts}\n")

print_label_counts(dfs)
```
%% Cell type:code id:6fc6eb2d-8451-4cc4-9ae0-029182366994 tags:
```
python
# Clean every daily frame in place:
#  - drop duplicate rows
#  - map +/-inf to NaN
#  - convert 'Timestamp' to Unix seconds
#  - coerce every non-'Label' object column to numeric (failures become NaN)
#  - drop rows containing NaN
#  - blank out negative values except in the two Init*Win Byts columns, then drop again
for name, frame in dfs.items():
    print(f"Dataframe: '{name}', shape before preprocessing: {frame.shape}")

    frame.drop_duplicates(inplace=True)
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Parse the timestamps, then express them as integer seconds since the epoch.
    parsed = pd.to_datetime(frame['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
    frame['Timestamp'] = (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    # Everything except the class label should be numeric; bad entries become NaN.
    for column in frame.columns:
        if frame[column].dtype == 'object' and column != 'Label':
            frame[column] = pd.to_numeric(frame[column], errors='coerce')

    frame.dropna(inplace=True)

    # Negative readings are invalid everywhere except the TCP window-size columns.
    for column in frame.columns:
        if column not in ['Init Bwd Win Byts', 'Init Fwd Win Byts', 'Label']:
            frame.loc[frame[column] < 0, column] = np.nan

    # to_numeric may have introduced fresh infs from textual values; clear those too.
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)
    frame.dropna(inplace=True)

    print(f"shape after preprocessing: {frame.shape}\n")
```
%% Cell type:code id:1c3d70f5-7052-4e6c-bdce-fe36dff94008 tags:
```
python
def aggregate_classes(dfs, classes):
    """Aggregate traffic data into separate DataFrames based on specified labels.

    Parameters:
    - dfs: Dictionary of DataFrames loaded from CSV files.
    - classes: Dictionary mapping a traffic category to the list of 'Label'
      values that belong to it.

    Returns:
    - A dictionary mapping each category to a single DataFrame holding every
      row (from every input frame) whose 'Label' is in that category.
    """
    aggregated_data = {}

    for category, labels in classes.items():
        # Collect all matching slices first and concatenate once at the end:
        # calling pd.concat inside the loop re-copies the accumulated frame on
        # every iteration, which is quadratic in the total row count.
        pieces = []
        for label in labels:
            # Iterate through all DataFrames to filter out rows for this label.
            for df in dfs.values():
                pieces.append(df[df["Label"] == label])

        if pieces:
            aggregated_data[category] = pd.concat(pieces, axis=0, ignore_index=True)
        else:
            # No labels (or no input frames) for this category: empty result,
            # matching the original behavior.
            aggregated_data[category] = pd.DataFrame()

    return aggregated_data
# The label groupings used to bucket the raw traffic classes.
# NOTE: "Infilteration" is the dataset's own (misspelled) label — keep as-is.
classes = {
    "BruteForce": [
        "FTP-BruteForce", "SSH-Bruteforce",
        "Brute Force -Web", "Brute Force -XSS",
    ],
    "DoS": [
        "DoS attacks-GoldenEye", "DoS attacks-Slowloris", "DoS attacks-Hulk",
        "DoS attacks-SlowHTTPTest", "DDoS attacks-LOIC-HTTP",
        "DDOS attack-HOIC", "DDOS attack-LOIC-UDP",
    ],
    "Infiltration": ["Infilteration"],
    "Bot": ["Bot"],
    "Benign": ["Benign"],
}

aggregated_data = aggregate_classes(dfs, classes)
```
%% Cell type:code id:b7062976-df28-43f5-8a86-226c87838123 tags:
```
python
# Unpack the aggregated DataFrame for each category into its own variable.
bruteforce_attacks, doS_attacks, infiltration_attacks, bot_attacks, benign = (
    aggregated_data[key]
    for key in ("BruteForce", "DoS", "Infiltration", "Bot", "Benign")
)
```
%% Cell type:code id:ec01bd77-2f10-42fb-b48f-babe38204108 tags:
```
python
# Save each traffic category to its own CSV under the preprocessed directory.
save_directory = 'Datasets/Preprocessed_Datasets'
# to_csv does not create missing directories; make sure the target exists
# ('os' is imported in the first cell but was previously unused).
os.makedirs(save_directory, exist_ok=True)

# Map output file stems to their frames so each save is a single loop body.
category_frames = {
    'bruteforce_attacks': bruteforce_attacks,
    'doS_attacks': doS_attacks,
    'infiltration_attacks': infiltration_attacks,
    'bot_attacks': bot_attacks,
    'benign': benign,
}
for stem, frame in category_frames.items():
    frame.to_csv(f'{save_directory}/{stem}.csv', index=False)
```
%% Cell type:code id:8806c851-f3a6-4962-9c4b-0e2e226a6901 tags:
```
python
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment