{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e35f5e6-c85e-4d41-a019-ded9fef99bae",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c0aae72-18f5-48f2-a4ff-a039da52ff35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show every row and every column whenever a frame is displayed\n",
    "pd.set_option('display.max_rows', None, 'display.max_columns', None)\n",
    "\n",
    "# CSV files to load\n",
    "file_names = [\n",
    "    '02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv', '02-21-2018.csv',\n",
    "    '02-22-2018.csv', '02-23-2018.csv', '02-28-2018.csv', '03-01-2018.csv', '03-02-2018.csv',\n",
    "]\n",
    "\n",
    "# Read every file into a dictionary keyed by its file name\n",
    "dfs = {file: pd.read_csv(f'~/Datasets/{file}') for file in file_names}\n",
    "\n",
    "# Remove the four flow-identifier columns from the 02-20-2018 file\n",
    "extra_columns = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP']\n",
    "dfs['02-20-2018.csv'].drop(columns=extra_columns, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad3d8a3d-1043-4d2a-82dd-0ddd20f615f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_label_counts(dfs):\n",
    "    \"\"\"\n",
    "    Print how often each category occurs in the 'Label' column of every DataFrame.\n",
    "\n",
    "    Parameters:\n",
    "    - dfs: Dictionary of DataFrames, each with a 'Label' column.\n",
    "    \"\"\"\n",
    "    for name, frame in dfs.items():\n",
    "        label_counts = frame['Label'].value_counts()\n",
    "        print(f\"Value counts for dataframe '{name}':\\n{label_counts}\\n\")\n",
    "\n",
    "print_label_counts(dfs)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fc6eb2d-8451-4cc4-9ae0-029182366994",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing applied to every loaded file:\n",
    "# - deleting duplicates\n",
    "# - changing time to unix format\n",
    "# - changing data types to numeric except for the 'Label' column\n",
    "# - changing inf and -inf to NaN and dropping NaN\n",
    "# - dropping rows with negative values except in ['Init Bwd Win Byts', 'Init Fwd Win Byts']\n",
    "\n",
    "# Columns left out of the negative-value check ('Label' is kept as text)\n",
    "SKIP_NEGATIVE_CHECK = ['Init Bwd Win Byts', 'Init Fwd Win Byts', 'Label']\n",
    "\n",
    "\n",
    "def clean_traffic_frame(df):\n",
    "    \"\"\"\n",
    "    Clean one traffic DataFrame in place and return it.\n",
    "\n",
    "    The frame is modified in place rather than copied. Running the function\n",
    "    again on an already cleaned frame leaves it unchanged.\n",
    "\n",
    "    Parameters:\n",
    "    - df: DataFrame with a 'Timestamp' and a 'Label' column.\n",
    "\n",
    "    Returns:\n",
    "    - The same DataFrame object after cleaning.\n",
    "    \"\"\"\n",
    "    df.drop_duplicates(inplace=True)\n",
    "\n",
    "    # Parse only while the column is not numeric yet. After a first run it already\n",
    "    # holds Unix seconds; parsing those with the date format would coerce every\n",
    "    # value to NaT and the dropna below would then remove every row.\n",
    "    if not pd.api.types.is_numeric_dtype(df['Timestamp']):\n",
    "        parsed = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')\n",
    "        df['Timestamp'] = (parsed - pd.Timestamp(\"1970-01-01\")) // pd.Timedelta('1s')\n",
    "\n",
    "    for col in df.columns:  # changing to numeric if not, then to NaN\n",
    "        if df[col].dtype == 'object' and col != 'Label':\n",
    "            df[col] = pd.to_numeric(df[col], errors='coerce')\n",
    "\n",
    "    # Replace inf after the numeric conversion so infinities parsed from text are caught too\n",
    "    df.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
    "    df.dropna(inplace=True)\n",
    "\n",
    "    # Drop rows with a negative value through a boolean mask. Writing NaN into the\n",
    "    # offending cells instead would upcast integer columns to float.\n",
    "    has_negative = np.zeros(len(df), dtype=bool)\n",
    "    for col in df.columns:\n",
    "        if col not in SKIP_NEGATIVE_CHECK:\n",
    "            has_negative |= (df[col] < 0).to_numpy()\n",
    "    df.drop(index=df.index[has_negative], inplace=True)\n",
    "\n",
    "    return df\n",
    "\n",
    "\n",
    "for key, df in dfs.items():\n",
    "    print(f\"Dataframe: '{key}', shape before preprocessing: {df.shape}\")\n",
    "    clean_traffic_frame(df)\n",
    "    print(f\"shape after preprocessing: {df.shape}\\n\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c3d70f5-7052-4e6c-bdce-fe36dff94008",
   "metadata": {},
   "outputs": [],
   "source": [
    "def aggregate_classes(dfs, classes):\n",
    "    \"\"\"\n",
    "    Aggregates traffic data into separate DataFrames based on specified labels.\n",
    "\n",
    "    Rows are ordered label by label (in the order given in `classes`) and, within\n",
    "    a label, file by file (in the order of `dfs`). The result index is renumbered\n",
    "    from 0.\n",
    "\n",
    "    Parameters:\n",
    "    - dfs: Dictionary of DataFrames loaded from CSV files, each with a 'Label' column.\n",
    "    - classes: Dictionary with keys being traffic category and values being lists of labels associated with that category.\n",
    "\n",
    "    Returns:\n",
    "    - A dictionary of aggregated DataFrames for each category. A category with no\n",
    "      labels (or an empty `dfs`) maps to an empty DataFrame.\n",
    "    \"\"\"\n",
    "    aggregated_data = {}\n",
    "\n",
    "    for category, labels in classes.items():\n",
    "        # Collect the matching rows of every file first ...\n",
    "        pieces = [df[df[\"Label\"] == label] for label in labels for df in dfs.values()]\n",
    "        # ... then concatenate once, instead of re-copying a growing frame on every step\n",
    "        if pieces:\n",
    "            aggregated_data[category] = pd.concat(pieces, axis=0, ignore_index=True)\n",
    "        else:\n",
    "            aggregated_data[category] = pd.DataFrame()\n",
    "\n",
    "    return aggregated_data\n",
    "\n",
    "\n",
    "# the specified labels\n",
    "classes = {\n",
    "    \"BruteForce\": [\"FTP-BruteForce\", \"SSH-Bruteforce\", \"Brute Force -Web\", \"Brute Force -XSS\"],\n",
    "    \"DoS\": [\"DoS attacks-GoldenEye\", \"DoS attacks-Slowloris\", \"DoS attacks-Hulk\", \"DoS attacks-SlowHTTPTest\", \"DDoS attacks-LOIC-HTTP\", \"DDOS attack-HOIC\", \"DDOS attack-LOIC-UDP\"],\n",
    "    \"Infiltration\": [\"Infilteration\"],\n",
    "    \"Bot\": [\"Bot\"],\n",
    "    \"Benign\": [\"Benign\"]\n",
    "}\n",
    "\n",
    "aggregated_data = aggregate_classes(dfs, classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7062976-df28-43f5-8a86-226c87838123",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pull the aggregated DataFrame of each category out into its own variable\n",
    "bruteforce_attacks, doS_attacks, infiltration_attacks, bot_attacks, benign = (\n",
    "    aggregated_data[category]\n",
    "    for category in (\"BruteForce\", \"DoS\", \"Infiltration\", \"Bot\", \"Benign\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec01bd77-2f10-42fb-b48f-babe38204108",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Saving each traffic category\n",
    "# NOTE(review): the raw files are read from '~/Datasets' but this path is relative\n",
    "# to the working directory - confirm that this is the intended location.\n",
    "save_directory = 'Datasets/Preprocessed_Datasets'\n",
    "\n",
    "# to_csv does not create missing folders, so make sure the target exists first\n",
    "os.makedirs(save_directory, exist_ok=True)\n",
    "\n",
    "# Output file name (without extension) -> DataFrame to write\n",
    "frames_to_save = {\n",
    "    'bruteforce_attacks': bruteforce_attacks,\n",
    "    'doS_attacks': doS_attacks,\n",
    "    'infiltration_attacks': infiltration_attacks,\n",
    "    'bot_attacks': bot_attacks,\n",
    "    'benign': benign,\n",
    "}\n",
    "\n",
    "for name, frame in frames_to_save.items():\n",
    "    frame.to_csv(f'{save_directory}/{name}.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8806c851-f3a6-4962-9c4b-0e2e226a6901",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}