From 5afcba295fc73eaba3ffaf676077ad38dbbda0ca Mon Sep 17 00:00:00 2001 From: Sayed Saeedi <sayed.saeedi@stud.th-deg.de> Date: Sat, 2 Mar 2024 02:10:16 +0100 Subject: [PATCH] Code for combining synthetic data and evaluating --- Models/Combining_everything.ipynb | 230 ++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 Models/Combining_everything.ipynb diff --git a/Models/Combining_everything.ipynb b/Models/Combining_everything.ipynb new file mode 100644 index 0000000..4d9fcf0 --- /dev/null +++ b/Models/Combining_everything.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "afdb8408-8328-49ec-95ad-1ad1b45217f8", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "from sdv.single_table import CTGANSynthesizer\n", + "from sdv.metadata import SingleTableMetadata\n", + "from sdmetrics.reports.single_table import QualityReport\n", + "from sdmetrics.reports.single_table import DiagnosticReport\n", + "from table_evaluator import TableEvaluator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1eb44865-7c03-46df-9780-aca27a6d0494", + "metadata": {}, + "outputs": [], + "source": [ + "#creating a combination of real data\n", + "\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')\n", + "# brute = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')\n", + "# goldenEye = real_data[real_data.Label=='DoS attacks-GoldenEye']\n", + "# slowloris = real_data[real_data.Label=='DoS attacks-Slowloris']\n", + "# hulk = real_data[real_data.Label=='DoS attacks-Hulk']\n", + "# hulk = hulk.iloc[:300000, :]\n", + "# slowHTTPtest = real_data[real_data.Label=='DoS attacks-SlowHTTPTest']\n", + "# loicHTTp = real_data[real_data.Label=='DDoS attacks-LOIC-HTTP']\n", + "# loicHTTp = loicHTTp.iloc[:300000, :]\n", + "# hoic = real_data[real_data.Label=='DDOS attack-HOIC']\n", + "# 
# (tail of the commented-out cell that builds the combined real dataset)
# hoic = hoic.iloc[:300000, :]
# bot = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')
# infilteration = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')
# benign = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')
# benign = benign.iloc[:300000, :]
# ftp = brute[brute.Label=='FTP-BruteForce']
# ssh = brute[brute.Label=='SSH-Bruteforce']

# real_all_in_one = pd.concat([goldenEye, slowloris, hulk, slowHTTPtest, loicHTTp, hoic, bot, infilteration, benign, ftp, ssh], ignore_index=True)

# %%
# Read every synthetic CSV in the results folder and stack them into one
# dataframe.

folder_path = 'RTVAE_Results'
combined_csv_name = 'RTVAE_synthetic_all.csv'

# The combined file is written back into the folder that is scanned here, so
# it must be excluded; otherwise a second run would read its own previous
# output and duplicate every row.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the row order of the combined dataframe reproducible.
csv_files = sorted(
    file
    for file in os.listdir(folder_path)
    if file.endswith('.csv') and file != combined_csv_name
)
if not csv_files:
    # pd.concat([]) would fail with the less helpful
    # "No objects to concatenate"; keep the exception type, improve the text.
    raise ValueError(f"No input CSV files found in '{folder_path}'")

dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
synthetic_all_in_one = pd.concat(dfs, ignore_index=True)

# %%
# Saving real and synthetic data
# real_all_in_one.to_csv('TVAE_Results/real_all_in_one.csv', index=False)
synthetic_all_in_one.to_csv(os.path.join(folder_path, combined_csv_name), index=False)

# %%
# Ensuring that everything went well: compare the shape before and after
# dropping rows that contain missing values.
#
# NOTE: synthetic_data is the same object as synthetic_all_in_one, so the
# in-place dropna also changes synthetic_all_in_one. The CSV written above is
# not affected because it was saved before this cell runs.
synthetic_data = synthetic_all_in_one
print(synthetic_data.shape)
synthetic_data.dropna(inplace=True)
print(synthetic_data.shape)
# %%
# Loading real and synthetic data for evaluation

real_data = pd.read_csv('TVAE_Results/real_all_in_one.csv')
synthetic_data = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')


# %%
def get_data_info(df):
    """Create the column lists and the SDV metadata of a dataframe.

    The 'Label' column is treated as the only categorical column; every other
    column of ``df`` is declared numerical.

    Args:
        df (pandas.DataFrame): The input dataframe. It is expected to contain
            a 'Label' column, since that column is explicitly updated in the
            metadata.

    Returns:
        tuple: ``(categorical_columns, continuous_columns, metadata)`` where

        * categorical_columns (list): the categorical column names, i.e.
          ``['Label']``.
        * continuous_columns (list): all remaining column names of ``df``,
          in dataframe order.
        * metadata (SingleTableMetadata): metadata detected from ``df`` with
          the column types above applied. For more information visit
          https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json
    """
    # Column roles: 'Label' is categorical, everything else is continuous.
    categorical_columns = ['Label']
    continuous_columns = [
        column for column in df.columns if column not in categorical_columns
    ]

    # Let SDV auto-detect the metadata first, then override the detected
    # sdtypes with the explicit roles chosen above.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)

    for column in categorical_columns:
        metadata.update_column(column_name=column, sdtype='categorical')

    for column in continuous_columns:
        metadata.update_column(column_name=column, sdtype='numerical')

    # Validate the metadata itself and against the dataframe it was built
    # from. This must be ``df`` (not the global ``real_data``) so the function
    # gives correct results for any dataframe passed in.
    metadata.validate()
    metadata.validate_data(data=df)

    return categorical_columns, continuous_columns, metadata


categorical_columns, continuous_columns, metadata = get_data_info(real_data)
# %%
# Evaluating synthetic data with table_evaluator: cumulative sums per feature
# and distributions.
table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols=categorical_columns)
table_evaluator.visual_evaluation()

# %%
# Saving and visualizing column pair trends and column shapes.
#
# The dict form of the metadata is kept in its own name instead of rebinding
# `metadata`: overwriting the SDV metadata object with a dict made this cell
# fail on a second run ('dict' object has no attribute 'to_dict').
#
# NOTE(review): the reports are written to 'RTVAE_Results/' although the
# synthetic data being evaluated is loaded from 'TabFairGAN_Results/'.
# Confirm that this output folder is intended.
metadata_dict = metadata.to_dict()
my_report = QualityReport()
my_report.generate(real_data, synthetic_data, metadata_dict)
my_report.save(filepath='RTVAE_Results/quality.pkl')
my_report.get_visualization(property_name='Column Pair Trends')

# %%
# Saving and visualizing data validity.
#
# The dict is rebuilt here so this cell does not depend on the quality-report
# cell having been executed first.
metadata_dict = metadata.to_dict()
my_report = DiagnosticReport()
my_report.generate(real_data, synthetic_data, metadata_dict)
my_report.save(filepath='RTVAE_Results/diagnostic.pkl')
my_report.get_visualization('Data Validity')

# %%

# Notebook metadata: kernel "Python 3 (ipykernel)" (python3), Python 3.8.10,
# nbformat 4.5.