{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "afdb8408-8328-49ec-95ad-1ad1b45217f8", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "from sdv.single_table import CTGANSynthesizer\n", "from sdv.metadata import SingleTableMetadata\n", "from sdmetrics.reports.single_table import QualityReport\n", "from sdmetrics.reports.single_table import DiagnosticReport\n", "from table_evaluator import TableEvaluator" ] }, { "cell_type": "code", "execution_count": null, "id": "1eb44865-7c03-46df-9780-aca27a6d0494", "metadata": {}, "outputs": [], "source": [ "#creating a combination of real data\n", "\n", "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')\n", "# brute = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')\n", "# goldenEye = real_data[real_data.Label=='DoS attacks-GoldenEye']\n", "# slowloris = real_data[real_data.Label=='DoS attacks-Slowloris']\n", "# hulk = real_data[real_data.Label=='DoS attacks-Hulk']\n", "# hulk = hulk.iloc[:300000, :]\n", "# slowHTTPtest = real_data[real_data.Label=='DoS attacks-SlowHTTPTest']\n", "# loicHTTp = real_data[real_data.Label=='DDoS attacks-LOIC-HTTP']\n", "# loicHTTp = loicHTTp.iloc[:300000, :]\n", "# hoic = real_data[real_data.Label=='DDOS attack-HOIC']\n", "# hoic = hoic.iloc[:300000, :]\n", "# bot = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')\n", "# infilteration = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')\n", "# benign = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')\n", "# benign = benign.iloc[:300000, :]\n", "# ftp = brute[brute.Label=='FTP-BruteForce']\n", "# ssh = brute[brute.Label=='SSH-Bruteforce']\n", "\n", "# real_all_in_one = pd.concat([goldenEye, slowloris, hulk, slowHTTPtest, loicHTTp, hoic, bot, infilteration, benign, ftp, ssh], ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "79233ab4-949f-40bb-ae7b-f338d5076a13", "metadata": {}, "outputs": [], "source": [ "# reading all csv files in the directory\n", "\n", "folder_path = 'RTVAE_Results'\n", "csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]\n", "dfs = []\n", "for file in csv_files:\n", " file_path = os.path.join(folder_path, file)\n", " dfs.append(pd.read_csv(file_path))\n", " \n", "synthetic_all_in_one = pd.concat(dfs, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "40d00c4b-31a6-4a88-8b53-9d212d4f807e", "metadata": {}, "outputs": [], "source": [ "#savign real and synthetic data\n", "# real_all_in_one.to_csv('TVAE_Results/real_all_in_one.csv', index=False)\n", "synthetic_all_in_one.to_csv('RTVAE_Results/RTVAE_synthetic_all.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "a7e4c7ea-8b6d-4537-83ed-cda57173a704", "metadata": {}, "outputs": [], "source": [ "#ensuring that everthing went well\n", "\n", "synthetic_data = synthetic_all_in_one \n", "print(synthetic_data.shape)\n", "synthetic_data.dropna(inplace=True)\n", "print(synthetic_data.shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "ae9bd1d9-52d7-4703-890c-a2588a15b417", "metadata": {}, "outputs": [], "source": [ "#Loading real and sytnehtic data for evaluation\n", "\n", "real_data = pd.read_csv('TVAE_Results/real_all_in_one.csv')\n", "synthetic_data = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "549bce4a-e6ea-457d-91a9-dfa9f95fe073", "metadata": {}, "outputs": [], "source": [ "def get_data_info(df):\n", " \"\"\"Crates the categorical columns, continuous columns, and metadata of a dataframe.\n", "\n", " Args:\n", " df (pandas.Dataframe): The input dataframe containing continuous and categorical values.\n", "\n", " Returns:\n", " list: the list of categorical column names. Specifically, columns with only 4 uniques values\n", " list: The list of continuous column names.\n", " metadata: The metadata of the dataframe. for more informatin visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json\n", " \"\"\"\n", " #createing \n", " categorical_columns = ['Label']\n", " continuous_columns = []\n", " for i in df.columns:\n", " if i not in categorical_columns:\n", " continuous_columns.append(i)\n", " \n", " #creating metadat\n", " metadata = SingleTableMetadata()\n", " metadata.detect_from_dataframe(df)\n", " \n", " for column in categorical_columns:\n", " metadata.update_column(\n", " column_name = column,\n", " sdtype = 'categorical'\n", " )\n", " \n", " for column in continuous_columns:\n", " metadata.update_column(\n", " column_name = column,\n", " sdtype = 'numerical' \n", " )\n", " # validating metadata\n", " metadata.validate()\n", " metadata.validate_data(data=real_data)\n", " \n", " return categorical_columns, continuous_columns, metadata\n", "\n", "\n", "categorical_columns, continuous_columns, metadata = get_data_info(real_data)" ] }, { "cell_type": "code", "execution_count": null, "id": "fc6db1ff-e2fc-4817-87d1-22743468b7b8", "metadata": {}, "outputs": [], "source": [ "# evaluating synthetic data with table_evaluator cumulative sum per features and distribution\n", "table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)\n", "table_evaluator.visual_evaluation()" ] }, { "cell_type": "code", "execution_count": null, "id": "7bc79e6f-0232-4e2b-9b19-b2fd8825a687", "metadata": {}, "outputs": [], "source": [ "#saving and visualizing column pair trend and column shapes\n", "metadata = metadata.to_dict()\n", "my_report = QualityReport()\n", "my_report.generate(real_data, synthetic_data, metadata)\n", "my_report.save(filepath='RTVAE_Results/quality.pkl')\n", "my_report.get_visualization(property_name='Column Pair Trends')" ] }, { "cell_type": "code", "execution_count": null, "id": "9332967a-f289-454a-9e20-ff73c7f9bbcf", "metadata": {}, "outputs": [], "source": [ "#saving and visualiztation data validity\n", "my_report = DiagnosticReport()\n", "my_report.generate(real_data, synthetic_data, metadata)\n", "my_report.save(filepath='RTVAE_Results/diagnostic.pkl')\n", "my_report.get_visualization('Data Validity')" ] }, { "cell_type": "code", "execution_count": null, "id": "ad3689d9-2361-4dfa-8efb-8d72d2859b7a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }