From d7ddec2583e67577c3d84e78248dfd4e55738a7d Mon Sep 17 00:00:00 2001 From: Sayed Saeedi <sayed.saeedi@stud.th-deg.de> Date: Sat, 2 Mar 2024 02:06:03 +0100 Subject: [PATCH] codes for CTGAN, CopulaGAN, and TVAE --- Models/SDV.ipynb | 288 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 Models/SDV.ipynb diff --git a/Models/SDV.ipynb b/Models/SDV.ipynb new file mode 100644 index 0000000..2c0a534 --- /dev/null +++ b/Models/SDV.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aea43308-a588-4730-a10e-48156b0d1aa5", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sdv.single_table import CTGANSynthesizer\n", + "from sdv.single_table import TVAESynthesizer \n", + "from sdv.single_table import CopulaGANSynthesizer\n", + "from sdv.metadata import SingleTableMetadata\n", + "from sdmetrics.reports.single_table import QualityReport\n", + "from sdmetrics.reports.single_table import DiagnosticReport\n", + "from table_evaluator import TableEvaluator\n", + "import matplotlib.pyplot as plt\n", + "from sdmetrics.single_column import StatisticSimilarity\n", + "import math\n", + "from sdmetrics.single_column import RangeCoverage\n", + "from sdmetrics.visualization import get_column_plot\n", + "import os\n", + "import plotly.io as py\n", + "import string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6e0f097-891b-42fc-9e20-85145c8d24ac", + "metadata": {}, + "outputs": [], + "source": [ + "#loading the preprocessed datasets \n", + "\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')\n", + "\n", + "print(real_data.shape)\n", + "print(real_data.Label.unique())\n", + "\n", + "# if bruteforce_attack or dos_attacks are used then uncomment the below line and change the name of the dataset accordingly\n", + "#real_data=real_data[real_data.Label=='DoS attacks-Hulk'] # change according to the dataset\n", + "real_data = real_data.iloc[:300000, :]\n", + "print(real_data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc6915d1-0c8a-4d7e-8b87-0c56ae9c6431", + "metadata": {}, + "outputs": [], + "source": [ + "# Categorical columns & Continuous columns\n", + "def get_data_info(df):\n", + " \"\"\"Crates the categorical columns, continuous columns, and metadata of a dataframe.\n", + "\n", + " Args:\n", + " df (pandas.Dataframe): The input dataframe containing continuous and categorical values.\n", + "\n", + " Returns:\n", + " list: the list of categorical column names. Specifically, columns with only 4 uniques values\n", + " list: The list of continuous column names.\n", + " metadata: The metadata of the dataframe. for more informatin visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json\n", + " \"\"\"\n", + " #createing \n", + " categorical_columns = ['Label']\n", + " continuous_columns = []\n", + "\n", + " for i in df.columns:\n", + " if i not in categorical_columns:\n", + " continuous_columns.append(i)\n", + " \n", + " #creating metadat\n", + " metadata = SingleTableMetadata()\n", + " metadata.detect_from_dataframe(df)\n", + " \n", + " for column in categorical_columns:\n", + " metadata.update_column(\n", + " column_name = column,\n", + " sdtype = 'categorical'\n", + " )\n", + " \n", + " for column in continuous_columns:\n", + " metadata.update_column(\n", + " column_name = column,\n", + " sdtype = 'numerical' \n", + " )\n", + " # validating metadata\n", + " metadata.validate()\n", + " metadata.validate_data(data=real_data)\n", + " \n", + " return categorical_columns, continuous_columns, metadata\n", + "\n", + "\n", + "categorical_columns, continuous_columns, metadata = get_data_info(real_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd625bf1-52ed-4bda-9e69-b62009c0422f", + "metadata": {}, + "outputs": [], + "source": [ + "#fiting the synthesizer\n", + "# for more info check https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers\n", + "#availabel options CTGANSynthesizer, CopulaGANSynthesizer, and TVAESynthesizer\n", + "\n", + "#uncomment below for CTGANSynthesizer and CopulaGANSynthesizer\n", + "# synthesizer = CTGANSynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=128, generator_dim=(256, 256), \n", + "# discriminator_dim=(256, 256), generator_lr=0.000001, generator_decay=0.000001, epochs=500, discriminator_lr=0.000001, \n", + "# discriminator_decay=0.000001, batch_size=300, discriminator_steps=3, log_frequency=True, verbose=True, pac=10)\n", + "\n", + "\n", + "#uncommnet below for TVAESynthesizer\n", + "# synthesizer = TVAESynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=100, compress_dims=(128, 128), \n", + "# decompress_dims=(128, 128), l2scale=0.000001, batch_size=500, epochs=500, loss_factor=2, cuda=True)\n", + "\n", + "\n", + "\n", + "synthesizer.fit(real_data)\n", + "synthesizer.save(filepath='CopulaGAN_Results/Hulk/CopulaGAN.pkl') # change the path accordingly\n", + "synthetic_data = synthesizer.sample(300000) # change the instances you want to be genereated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60019781-cf26-4a21-a1b5-0fd099f07972", + "metadata": {}, + "outputs": [], + "source": [ + "# evaluating synthetic data with table_evaluator cumulative sum per features and distribution\n", + "table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)\n", + "table_evaluator.visual_evaluation()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "273ea89c-161b-4c73-ac96-412effb4e8bb", + "metadata": {}, + "outputs": [], + "source": [ + "#saving and visualizing column pair trend and column shapes\n", + "metadata = metadata.to_dict()\n", + "my_report = QualityReport()\n", + "my_report.generate(real_data, synthetic_data, metadata)\n", + "my_report.save(filepath='CopulaGAN_Results/Hulk/quality.pkl')\n", + "my_report.get_visualization(property_name='Column Pair Trends')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "169f33e3-f1e3-4677-b092-742db80d9aa6", + "metadata": {}, + "outputs": [], + "source": [ + "#saving and visualiztation data validity\n", + "my_report = DiagnosticReport()\n", + "my_report.generate(real_data, synthetic_data, metadata)\n", + "my_report.save(filepath='CopulaGAN_Results/Hulk/diagnostic.pkl')\n", + "my_report.get_visualization('Data Validity')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a46d7cdd-d907-4a49-bb8a-cac585c1776b", + "metadata": {}, + "outputs": [], + "source": [ + "#statistical similarity metric\n", + "sstest=[]\n", + "for i in real_data.columns:\n", + " y=StatisticSimilarity.compute(\n", + " real_data=real_data[i],\n", + " synthetic_data=synthetic_data[i],\n", + " statistic='median'\n", + " )\n", + " sstest.append(y)\n", + "\n", + "df = pd.DataFrame(sstest, columns=['SS Test'])\n", + "\n", + "print(df['SS Test'].mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c23e2399-5b2c-40aa-9ce7-1bc36ccbe119", + "metadata": {}, + "outputs": [], + "source": [ + "#range coverage metric\n", + "range_coverage=[]\n", + "for i in real_data.columns:\n", + " \n", + " y=RangeCoverage.compute(\n", + " real_data=real_data[i],\n", + " synthetic_data=synthetic_data[i]\n", + " )\n", + " range_coverage.append(y)\n", + "df = pd.DataFrame(range_coverage, columns=['Range Coverage'])\n", + "\n", + "print(df['Range Coverage'].mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7e00eb7-5133-4374-a7dc-1d131ddc387f", + "metadata": {}, + "outputs": [], + "source": [ + "# checking the number of unique synthetic data instances\n", + "df = pd.concat([real_data, synthetic_data], axis=0)\n", + "print(df.shape)\n", + "df.dropna(inplace=True)\n", + "df.drop_duplicates(inplace=True)\n", + "print(df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85b532bf-91f1-4cb5-8fee-7a74d71a000c", + "metadata": {}, + "outputs": [], + "source": [ + "#Saving the distribution of each column\n", + "def sanitize_column_name(column_name):\n", + " valid_chars = \"-_.() %s%s\" % (string.ascii_letters, string.digits)\n", + " return ''.join(c for c in column_name if c in valid_chars)\n", + "\n", + "for i in real_data.columns:\n", + " fig = get_column_plot(\n", + " real_data=real_data,\n", + " synthetic_data=synthetic_data,\n", + " column_name=i,\n", + " plot_type='bar'\n", + " )\n", + "\n", + " sanitized_column_name = sanitize_column_name(i)\n", + "\n", + " # Save the figure in the 'Pics' directory, change the location accordingly\n", + " py.write_image(fig, os.path.join('CopulaGAN_Results/Hulk/Pics', f\"{sanitized_column_name}.png\")) \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edf6097f-6274-4a32-8b13-c91cea49166f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab