From 5afcba295fc73eaba3ffaf676077ad38dbbda0ca Mon Sep 17 00:00:00 2001
From: Sayed Saeedi <sayed.saeedi@stud.th-deg.de>
Date: Sat, 2 Mar 2024 02:10:16 +0100
Subject: [PATCH] Code for combining synthetic data and evaluating

---
 Models/Combining_everything.ipynb | 230 ++++++++++++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 Models/Combining_everything.ipynb

diff --git a/Models/Combining_everything.ipynb b/Models/Combining_everything.ipynb
new file mode 100644
index 0000000..4d9fcf0
--- /dev/null
+++ b/Models/Combining_everything.ipynb
@@ -0,0 +1,230 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "afdb8408-8328-49ec-95ad-1ad1b45217f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os\n",
+    "from sdv.single_table import CTGANSynthesizer\n",
+    "from sdv.metadata import SingleTableMetadata\n",
+    "from sdmetrics.reports.single_table import QualityReport\n",
+    "from sdmetrics.reports.single_table import DiagnosticReport\n",
+    "from table_evaluator import TableEvaluator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1eb44865-7c03-46df-9780-aca27a6d0494",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#creating a combination of real data\n",
+    "\n",
+    "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')\n",
+    "# brute = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')\n",
+    "# goldenEye = real_data[real_data.Label=='DoS attacks-GoldenEye']\n",
+    "# slowloris = real_data[real_data.Label=='DoS attacks-Slowloris']\n",
+    "# hulk = real_data[real_data.Label=='DoS attacks-Hulk']\n",
+    "# hulk = hulk.iloc[:300000, :]\n",
+    "# slowHTTPtest = real_data[real_data.Label=='DoS attacks-SlowHTTPTest']\n",
+    "# loicHTTp = real_data[real_data.Label=='DDoS attacks-LOIC-HTTP']\n",
+    "# loicHTTp = loicHTTp.iloc[:300000, :]\n",
+    "# hoic = real_data[real_data.Label=='DDOS attack-HOIC']\n",
+    "# hoic = hoic.iloc[:300000, :]\n",
+    "# bot = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')\n",
+    "# infilteration = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')\n",
+    "# benign = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')\n",
+    "# benign = benign.iloc[:300000, :]\n",
+    "# ftp = brute[brute.Label=='FTP-BruteForce']\n",
+    "# ssh = brute[brute.Label=='SSH-Bruteforce']\n",
+    "\n",
+    "# real_all_in_one = pd.concat([goldenEye, slowloris, hulk, slowHTTPtest, loicHTTp, hoic, bot, infilteration, benign, ftp, ssh], ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79233ab4-949f-40bb-ae7b-f338d5076a13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reading all csv files in the directory\n",
+    "\n",
+    "folder_path = 'RTVAE_Results'\n",
+    "csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]\n",
+    "dfs = []\n",
+    "for file in csv_files:\n",
+    "    file_path = os.path.join(folder_path, file)\n",
+    "    dfs.append(pd.read_csv(file_path))\n",
+    "    \n",
+    "synthetic_all_in_one = pd.concat(dfs, ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40d00c4b-31a6-4a88-8b53-9d212d4f807e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#savign real and synthetic data\n",
+    "# real_all_in_one.to_csv('TVAE_Results/real_all_in_one.csv', index=False)\n",
+    "synthetic_all_in_one.to_csv('RTVAE_Results/RTVAE_synthetic_all.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7e4c7ea-8b6d-4537-83ed-cda57173a704",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#ensuring that everthing went well\n",
+    "\n",
+    "synthetic_data = synthetic_all_in_one \n",
+    "print(synthetic_data.shape)\n",
+    "synthetic_data.dropna(inplace=True)\n",
+    "print(synthetic_data.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae9bd1d9-52d7-4703-890c-a2588a15b417",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Loading real and sytnehtic data for evaluation\n",
+    "\n",
+    "real_data = pd.read_csv('TVAE_Results/real_all_in_one.csv')\n",
+    "synthetic_data = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "549bce4a-e6ea-457d-91a9-dfa9f95fe073",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_data_info(df):\n",
+    "    \"\"\"Crates the categorical columns, continuous columns, and metadata of a dataframe.\n",
+    "\n",
+    "    Args:\n",
+    "        df (pandas.Dataframe): The input dataframe containing continuous and categorical values.\n",
+    "\n",
+    "    Returns:\n",
+    "        list: the list of categorical column names. Specifically, columns with only 4 uniques values\n",
+    "        list: The list of continuous column names.\n",
+    "        metadata: The metadata of the dataframe. for more informatin visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json\n",
+    "    \"\"\"\n",
+    "    #createing \n",
+    "    categorical_columns = ['Label']\n",
+    "    continuous_columns = []\n",
+    "    for i in df.columns:\n",
+    "        if i not in categorical_columns:\n",
+    "            continuous_columns.append(i)\n",
+    "    \n",
+    "    #creating metadat\n",
+    "    metadata = SingleTableMetadata()\n",
+    "    metadata.detect_from_dataframe(df)\n",
+    "    \n",
+    "    for column in categorical_columns:\n",
+    "        metadata.update_column(\n",
+    "            column_name = column,\n",
+    "            sdtype = 'categorical'\n",
+    "        )\n",
+    "    \n",
+    "    for column in continuous_columns:\n",
+    "        metadata.update_column(\n",
+    "            column_name = column,\n",
+    "            sdtype = 'numerical'  \n",
+    "        )\n",
+    "    # validating metadata\n",
+    "    metadata.validate()\n",
+    "    metadata.validate_data(data=real_data)\n",
+    "    \n",
+    "    return categorical_columns, continuous_columns, metadata\n",
+    "\n",
+    "\n",
+    "categorical_columns, continuous_columns, metadata = get_data_info(real_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc6db1ff-e2fc-4817-87d1-22743468b7b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# evaluating synthetic data with table_evaluator cumulative sum per features and distribution\n",
+    "table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)\n",
+    "table_evaluator.visual_evaluation()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bc79e6f-0232-4e2b-9b19-b2fd8825a687",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#saving and visualizing column pair trend and column shapes\n",
+    "metadata = metadata.to_dict()\n",
+    "my_report = QualityReport()\n",
+    "my_report.generate(real_data, synthetic_data, metadata)\n",
+    "my_report.save(filepath='RTVAE_Results/quality.pkl')\n",
+    "my_report.get_visualization(property_name='Column Pair Trends')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9332967a-f289-454a-9e20-ff73c7f9bbcf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#saving and visualiztation data validity\n",
+    "my_report = DiagnosticReport()\n",
+    "my_report.generate(real_data, synthetic_data, metadata)\n",
+    "my_report.save(filepath='RTVAE_Results/diagnostic.pkl')\n",
+    "my_report.get_visualization('Data Validity')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad3689d9-2361-4dfa-8efb-8d72d2859b7a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab