Skip to content
Snippets Groups Projects
Commit d7ddec25 authored by Sayed Saeedi's avatar Sayed Saeedi
Browse files

codes for CTGAN, CopulaGAN, and TVAE

parent 9b2f5dff
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id:aea43308-a588-4730-a10e-48156b0d1aa5 tags:
``` python
import pandas as pd
import numpy as np
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CopulaGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from table_evaluator import TableEvaluator
import matplotlib.pyplot as plt
from sdmetrics.single_column import StatisticSimilarity
import math
from sdmetrics.single_column import RangeCoverage
from sdmetrics.visualization import get_column_plot
import os
import plotly.io as py
import string
```
%% Cell type:code id:f6e0f097-891b-42fc-9e20-85145c8d24ac tags:
``` python
# Loading one of the preprocessed datasets.
# BUG FIX: previously every read_csv line was commented out, so `real_data`
# was referenced before assignment. Select the dataset via DATASET_PATH:
#   Datasets/Preprocessed_Datasets/benign.csv
#   Datasets/Preprocessed_Datasets/bot_attacks.csv
#   Datasets/Preprocessed_Datasets/bruteforce_attacks.csv
#   Datasets/Preprocessed_Datasets/doS_attacks.csv
#   Datasets/Preprocessed_Datasets/infilteration_attacks.csv
DATASET_PATH = 'Datasets/Preprocessed_Datasets/benign.csv'
real_data = pd.read_csv(DATASET_PATH)
print(real_data.shape)
print(real_data.Label.unique())

# If bruteforce_attacks or doS_attacks are used, uncomment the line below and
# change the label to the attack class you want to model.
# real_data = real_data[real_data.Label == 'DoS attacks-Hulk']

# Cap the number of training rows to bound GAN fitting time/memory.
real_data = real_data.iloc[:300000, :]
print(real_data.shape)
```
%% Cell type:code id:bc6915d1-0c8a-4d7e-8b87-0c56ae9c6431 tags:
``` python
# Categorical columns & continuous columns
def get_data_info(df):
    """Derive the categorical columns, continuous columns, and SDV metadata of a dataframe.

    Args:
        df (pandas.DataFrame): Input dataframe containing continuous and
            categorical values. Must contain a 'Label' column.

    Returns:
        list: Names of the categorical columns (currently only 'Label').
        list: Names of all remaining (continuous) columns.
        SingleTableMetadata: Validated metadata for ``df``. For more information visit
            https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json

    Raises:
        InvalidMetadataError: If the detected/updated metadata is inconsistent
            or does not match ``df``.
    """
    categorical_columns = ['Label']
    continuous_columns = [col for col in df.columns if col not in categorical_columns]

    # Auto-detect metadata, then force the sdtypes we know to be correct.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)
    for column in categorical_columns:
        metadata.update_column(
            column_name=column,
            sdtype='categorical'
        )
    for column in continuous_columns:
        metadata.update_column(
            column_name=column,
            sdtype='numerical'
        )

    # Validate the metadata itself, then check it against the input dataframe.
    # BUG FIX: previously validated against the global `real_data` instead of
    # the `df` parameter, silently coupling this function to notebook state.
    metadata.validate()
    metadata.validate_data(data=df)
    return categorical_columns, continuous_columns, metadata

categorical_columns, continuous_columns, metadata = get_data_info(real_data)
```
%% Cell type:code id:bd625bf1-52ed-4bda-9e69-b62009c0422f tags:
``` python
# Fitting the synthesizer.
# For more info see https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers
# Available options: CTGANSynthesizer, CopulaGANSynthesizer, TVAESynthesizer.
# BUG FIX: previously every instantiation was commented out, so `synthesizer`
# was referenced before assignment. Pick the model here instead:
MODEL = 'CopulaGAN'  # one of 'CTGAN', 'CopulaGAN', 'TVAE'

# CTGAN and CopulaGAN share the same GAN hyper-parameters.
_gan_classes = {'CTGAN': CTGANSynthesizer, 'CopulaGAN': CopulaGANSynthesizer}
if MODEL in _gan_classes:
    synthesizer = _gan_classes[MODEL](
        metadata, enforce_min_max_values=True, enforce_rounding=True,
        embedding_dim=128, generator_dim=(256, 256),
        discriminator_dim=(256, 256), generator_lr=0.000001,
        generator_decay=0.000001, epochs=500, discriminator_lr=0.000001,
        discriminator_decay=0.000001, batch_size=300, discriminator_steps=3,
        log_frequency=True, verbose=True, pac=10)
else:
    synthesizer = TVAESynthesizer(
        metadata, enforce_min_max_values=True, enforce_rounding=True,
        embedding_dim=100, compress_dims=(128, 128),
        decompress_dims=(128, 128), l2scale=0.000001, batch_size=500,
        epochs=500, loss_factor=2, cuda=True)

synthesizer.fit(real_data)
synthesizer.save(filepath='CopulaGAN_Results/Hulk/CopulaGAN.pkl')  # change the path accordingly
synthetic_data = synthesizer.sample(300000)  # change the number of instances you want generated
```
%% Cell type:code id:60019781-cf26-4a21-a1b5-0fd099f07972 tags:
``` python
# Visual evaluation with table_evaluator: cumulative sums per feature
# and real-vs-synthetic distribution plots.
evaluator = TableEvaluator(real_data, synthetic_data, cat_cols=categorical_columns)
evaluator.visual_evaluation()
```
%% Cell type:code id:273ea89c-161b-4c73-ac96-412effb4e8bb tags:
``` python
# Save and visualize column-pair trends and column shapes.
# NOTE: `metadata` is rebound to its dict form here; the diagnostic cell
# below relies on the dict representation as well.
metadata = metadata.to_dict()
quality_report = QualityReport()
quality_report.generate(real_data, synthetic_data, metadata)
quality_report.save(filepath='CopulaGAN_Results/Hulk/quality.pkl')
quality_report.get_visualization(property_name='Column Pair Trends')
```
%% Cell type:code id:169f33e3-f1e3-4677-b092-742db80d9aa6 tags:
``` python
# Save and visualize the data-validity diagnostics.
diagnostic_report = DiagnosticReport()
diagnostic_report.generate(real_data, synthetic_data, metadata)
diagnostic_report.save(filepath='CopulaGAN_Results/Hulk/diagnostic.pkl')
diagnostic_report.get_visualization('Data Validity')
```
%% Cell type:code id:a46d7cdd-d907-4a49-bb8a-cac585c1776b tags:
``` python
# Statistical-similarity metric (median), averaged over all columns.
similarity_scores = [
    StatisticSimilarity.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data[column],
        statistic='median'
    )
    for column in real_data.columns
]
df = pd.DataFrame(similarity_scores, columns=['SS Test'])
print(df['SS Test'].mean())
```
%% Cell type:code id:c23e2399-5b2c-40aa-9ce7-1bc36ccbe119 tags:
``` python
# Range-coverage metric, averaged over all columns.
coverage_scores = [
    RangeCoverage.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data[column]
    )
    for column in real_data.columns
]
df = pd.DataFrame(coverage_scores, columns=['Range Coverage'])
print(df['Range Coverage'].mean())
```
%% Cell type:code id:d7e00eb7-5133-4374-a7dc-1d131ddc387f tags:
``` python
# Checking how many synthetic instances are unique: stack real + synthetic,
# then drop NaNs and duplicates — the shrink in row count shows how many
# synthetic rows duplicate real (or other synthetic) rows.
df = pd.concat([real_data, synthetic_data], axis=0)
print(df.shape)
df = df.dropna().drop_duplicates()
print(df.shape)
```
%% Cell type:code id:85b532bf-91f1-4cb5-8fee-7a74d71a000c tags:
``` python
# Saving the distribution of each column
def sanitize_column_name(column_name):
    """Return *column_name* with any character unsafe for filenames removed.

    Keeps ASCII letters, digits, and the characters ``-_.() `` (including
    the space); everything else is dropped.
    """
    allowed = set(string.ascii_letters) | set(string.digits) | set("-_.() ")
    return ''.join(ch for ch in column_name if ch in allowed)
# Save the real-vs-synthetic distribution plot of every column as a PNG.
out_dir = 'CopulaGAN_Results/Hulk/Pics'  # change the location accordingly
# FIX: create the output directory up front so write_image cannot fail
# with a missing-path error on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)
for column in real_data.columns:
    fig = get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name=column,
        plot_type='bar'
    )
    sanitized_column_name = sanitize_column_name(column)
    py.write_image(fig, os.path.join(out_dir, f"{sanitized_column_name}.png"))
```
%% Cell type:code id:edf6097f-6274-4a32-8b13-c91cea49166f tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment