{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-md",
   "metadata": {},
   "source": [
    "# Traffic capture preprocessing\n",
    "\n",
    "Loads the daily capture CSVs, cleans each one, groups the rows into traffic categories by `Label`, and writes one CSV per category."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e35f5e6-c85e-4d41-a019-ded9fef99bae",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render every row and column when a frame is displayed (no truncation).\n",
    "pd.set_option('display.max_rows', None, 'display.max_columns', None)\n",
    "\n",
    "# Folder holding the raw daily captures.\n",
    "DATA_DIR = os.path.expanduser('~/Datasets')\n",
    "\n",
    "# One CSV per capture day.\n",
    "file_names = ['02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv', '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv',\n",
    "              '02-28-2018.csv', '03-01-2018.csv', '03-02-2018.csv']\n",
    "\n",
    "# Identifier columns that are not kept (02-20-2018.csv contains them).\n",
    "# They are skipped at read time in any file that has them.\n",
    "EXTRA_COLUMNS = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP']\n",
    "\n",
    "# Layout of the raw 'Timestamp' strings.\n",
    "TIMESTAMP_FORMAT = '%d/%m/%Y %H:%M:%S'\n",
    "\n",
    "# Columns in which negative values are kept.\n",
    "NEGATIVE_ALLOWED = ['Init Bwd Win Byts', 'Init Fwd Win Byts']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c0aae72-18f5-48f2-a4ff-a039da52ff35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loading all datasets in a dictionary keyed by file name.\n",
    "dfs = {}\n",
    "for file in file_names:\n",
    "    dfs[file] = pd.read_csv(\n",
    "        os.path.join(DATA_DIR, file),\n",
    "        # Skip the identifier columns while parsing instead of dropping them afterwards.\n",
    "        usecols=lambda name: name not in EXTRA_COLUMNS,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-labels",
   "metadata": {},
   "source": [
    "## Label distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad3d8a3d-1043-4d2a-82dd-0ddd20f615f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_label_counts(dfs):\n",
    "    \"\"\"\n",
    "    Print how many rows carry each 'Label' value, one report per DataFrame.\n",
    "\n",
    "    Parameters:\n",
    "    - dfs: Dictionary of DataFrames; every frame needs a 'Label' column.\n",
    "    \"\"\"\n",
    "    for key, df in dfs.items():\n",
    "        count = df['Label'].value_counts()\n",
    "        print(f\"Value counts for dataframe '{key}':\\n{count}\\n\")\n",
    "\n",
    "\n",
    "print_label_counts(dfs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-preprocess",
   "metadata": {},
   "source": [
    "## Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "preprocess-def",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_in_place(df):\n",
    "    \"\"\"\n",
    "    Clean one capture DataFrame in place.\n",
    "\n",
    "    Steps:\n",
    "    - delete duplicate rows\n",
    "    - change inf and -inf to NaN\n",
    "    - change 'Timestamp' to Unix seconds (values that do not match TIMESTAMP_FORMAT become NaN)\n",
    "    - change text columns to numeric except for the 'Label' column (unparseable values become NaN)\n",
    "    - drop rows containing NaN\n",
    "    - drop rows with a negative value in any column except NEGATIVE_ALLOWED and 'Label'\n",
    "\n",
    "    Parameters:\n",
    "    - df: DataFrame with a 'Timestamp' and a 'Label' column. It is modified; nothing is returned.\n",
    "    \"\"\"\n",
    "    df.drop_duplicates(inplace=True)\n",
    "    df.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
    "\n",
    "    # Convert only while the column is still text. Re-parsing epoch seconds with the\n",
    "    # text format would coerce them to NaN and the dropna below would empty the frame,\n",
    "    # so the guard keeps this cell safe to run more than once.\n",
    "    if not pd.api.types.is_numeric_dtype(df['Timestamp']):\n",
    "        parsed = pd.to_datetime(df['Timestamp'], format=TIMESTAMP_FORMAT, errors='coerce')\n",
    "        df['Timestamp'] = (parsed - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')\n",
    "\n",
    "    # Changing to numeric; values that cannot be parsed become NaN.\n",
    "    for col in df.columns:\n",
    "        if df[col].dtype == 'object' and col != 'Label':\n",
    "            df[col] = pd.to_numeric(df[col], errors='coerce')\n",
    "\n",
    "    df.dropna(inplace=True)\n",
    "\n",
    "    # Flag offending rows with a boolean mask instead of writing NaN into the columns:\n",
    "    # assigning NaN to an integer column forces an upcast, which newer pandas deprecates.\n",
    "    negative_rows = pd.Series(False, index=df.index, dtype=bool)\n",
    "    for col in df.columns:\n",
    "        if col != 'Label' and col not in NEGATIVE_ALLOWED:\n",
    "            negative_rows |= df[col] < 0\n",
    "    df.drop(index=df.index[negative_rows.to_numpy()], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fc6eb2d-8451-4cc4-9ae0-029182366994",
   "metadata": {},
   "outputs": [],
   "source": [
    "for key, df in dfs.items():\n",
    "    print(f\"Dataframe: '{key}', shape before preprocessing: {df.shape}\")\n",
    "    preprocess_in_place(df)\n",
    "    print(f\"shape after preprocessing: {df.shape}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-aggregate",
   "metadata": {},
   "source": [
    "## Aggregate by traffic category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c3d70f5-7052-4e6c-bdce-fe36dff94008",
   "metadata": {},
   "outputs": [],
   "source": [
    "def aggregate_classes(dfs, classes):\n",
    "    \"\"\"\n",
    "    Aggregates traffic data into separate DataFrames based on specified labels.\n",
    "\n",
    "    Parameters:\n",
    "    - dfs: Dictionary of DataFrames loaded from CSV files.\n",
    "    - classes: Dictionary with keys being traffic category and values being lists of labels associated with that category.\n",
    "\n",
    "    Returns:\n",
    "    - A dictionary of aggregated DataFrames for each category. Rows are ordered by label\n",
    "      (in the order listed in `classes`), then by the order of `dfs`.\n",
    "    \"\"\"\n",
    "    aggregated_data = {}\n",
    "\n",
    "    for category, labels in classes.items():\n",
    "        # Collect every matching slice first and concatenate once per category;\n",
    "        # growing a frame with concat inside the loop re-copies all earlier rows each time.\n",
    "        matches = [df[df['Label'] == label] for label in labels for df in dfs.values()]\n",
    "        non_empty = [part for part in matches if not part.empty]\n",
    "\n",
    "        if non_empty:\n",
    "            aggregated_data[category] = pd.concat(non_empty, axis=0, ignore_index=True)\n",
    "        elif matches:\n",
    "            # Nothing matched: keep an empty frame that still carries the column header.\n",
    "            aggregated_data[category] = matches[0].iloc[0:0].reset_index(drop=True)\n",
    "        else:\n",
    "            # No input frames at all.\n",
    "            aggregated_data[category] = pd.DataFrame()\n",
    "\n",
    "    return aggregated_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aggregate-run",
   "metadata": {},
   "outputs": [],
   "source": [
    "# the specified labels\n",
    "classes = {\n",
    "    'BruteForce': ['FTP-BruteForce', 'SSH-Bruteforce', 'Brute Force -Web', 'Brute Force -XSS'],\n",
    "    'DoS': ['DoS attacks-GoldenEye', 'DoS attacks-Slowloris', 'DoS attacks-Hulk', 'DoS attacks-SlowHTTPTest', 'DDoS attacks-LOIC-HTTP', 'DDOS attack-HOIC', 'DDOS attack-LOIC-UDP'],\n",
    "    'Infiltration': ['Infilteration'],\n",
    "    'Bot': ['Bot'],\n",
    "    'Benign': ['Benign']\n",
    "}\n",
    "\n",
    "aggregated_data = aggregate_classes(dfs, classes)\n",
    "\n",
    "# Rows whose label is not listed in `classes` are absent from every aggregated frame;\n",
    "# report those labels so the omission is visible.\n",
    "mapped_labels = {label for labels in classes.values() for label in labels}\n",
    "seen_labels = set().union(*(df['Label'].unique() for df in dfs.values()))\n",
    "print(f\"Labels not assigned to any category: {sorted(seen_labels - mapped_labels)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7062976-df28-43f5-8a86-226c87838123",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Access the aggregated DataFrames for each category\n",
    "bruteforce_attacks = aggregated_data['BruteForce']\n",
    "doS_attacks = aggregated_data['DoS']\n",
    "infiltration_attacks = aggregated_data['Infiltration']\n",
    "bot_attacks = aggregated_data['Bot']\n",
    "benign = aggregated_data['Benign']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-save",
   "metadata": {},
   "source": [
    "## Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec01bd77-2f10-42fb-b48f-babe38204108",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saving each traffic category\n",
    "save_directory = 'Datasets/Preprocessed_Datasets'\n",
    "\n",
    "# to_csv does not create missing folders, so make sure the target exists first.\n",
    "os.makedirs(save_directory, exist_ok=True)\n",
    "\n",
    "# Output file stem -> frame to write.\n",
    "frames_to_save = {\n",
    "    'bruteforce_attacks': bruteforce_attacks,\n",
    "    'doS_attacks': doS_attacks,\n",
    "    'infiltration_attacks': infiltration_attacks,\n",
    "    'bot_attacks': bot_attacks,\n",
    "    'benign': benign,\n",
    "}\n",
    "for stem, frame in frames_to_save.items():\n",
    "    frame.to_csv(f'{save_directory}/{stem}.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}