In [None]:
import pandas as pd
import numpy as np
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer 
from sdv.single_table import CopulaGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from table_evaluator import TableEvaluator
import matplotlib.pyplot as plt
from sdmetrics.single_column import StatisticSimilarity
import math
from sdmetrics.single_column import RangeCoverage
from sdmetrics.visualization import get_column_plot
import os
import plotly.io as py
import string

In [None]:
# Load one of the preprocessed datasets: uncomment exactly one of the lines below.

# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')

# Quick sanity check: dimensions of the frame and the labels it contains.
print(real_data.shape)
print(real_data['Label'].unique())

# For the bruteforce or DoS datasets, uncomment the filter below and set the label to keep.
#real_data=real_data[real_data.Label=='DoS attacks-Hulk'] # change according to the dataset

# Keep only the first 300000 rows (all columns).
real_data = real_data.head(300000)
print(real_data.shape)

In [None]:
# Categorical columns & Continuous columns
def get_data_info(df):
    """Split a dataframe's columns into categorical/continuous and build its SDV metadata.

    The 'Label' column is the only column treated as categorical; every other
    column of ``df`` is treated as continuous (numerical).

    Args:
        df (pandas.DataFrame): The input dataframe. It must contain a 'Label' column.

    Returns:
        list: The categorical column names (always ``['Label']``).
        list: The continuous column names, i.e. all columns of ``df`` except 'Label',
            in their original order.
        SingleTableMetadata: The metadata detected from ``df`` with the sdtypes above
            applied. For more information visit
            https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json
    """
    # Column groups: 'Label' is categorical, everything else is continuous.
    categorical_columns = ['Label']
    continuous_columns = [column for column in df.columns if column not in categorical_columns]

    # Start from the auto-detected metadata, then force the sdtypes we want.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)

    for column in categorical_columns:
        metadata.update_column(column_name=column, sdtype='categorical')

    for column in continuous_columns:
        metadata.update_column(column_name=column, sdtype='numerical')

    # Validate the metadata itself, then validate it against the dataframe that was
    # passed in (not the notebook-level `real_data` global), so the function works
    # for any frame it is called with.
    metadata.validate()
    metadata.validate_data(data=df)

    return categorical_columns, continuous_columns, metadata


# Derive the column groups and the SDV metadata for the loaded dataset.
categorical_columns, continuous_columns, metadata = get_data_info(real_data)

In [None]:
# Fitting the synthesizer
# For more info check https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers
# Available options: CTGANSynthesizer, CopulaGANSynthesizer, and TVAESynthesizer.
# Exactly one `synthesizer = ...` definition below must be uncommented before running this cell.

# Uncomment below for CTGANSynthesizer and CopulaGANSynthesizer
# synthesizer = CTGANSynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=128, generator_dim=(256, 256), 
#                                discriminator_dim=(256, 256), generator_lr=0.000001, generator_decay=0.000001, epochs=500, discriminator_lr=0.000001, 
#                                discriminator_decay=0.000001, batch_size=300, discriminator_steps=3, log_frequency=True, verbose=True, pac=10)


# Uncomment below for TVAESynthesizer
# synthesizer = TVAESynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=100, compress_dims=(128, 128), 
#                               decompress_dims=(128, 128), l2scale=0.000001, batch_size=500, epochs=500, loss_factor=2, cuda=True)


model_path = 'CopulaGAN_Results/Hulk/CopulaGAN.pkl'  # change the path accordingly

# Create the output directory *before* training so that a missing folder cannot
# make the save step fail after a long fit and lose the trained model.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

synthesizer.fit(real_data)
synthesizer.save(filepath=model_path)
synthetic_data = synthesizer.sample(300000)  # change the number of instances you want to be generated

In [None]:
# evaluating synthetic data with table_evaluator cumulative sum per features and distribution
table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)
table_evaluator.visual_evaluation()

In [None]:
# Saving the quality report and visualizing the column pair trends
# (the report also scores column shapes).

# The report is given the metadata as a dictionary. Convert only when `metadata`
# is not a dict yet, so re-running this cell does not fail on `dict.to_dict()`.
if not isinstance(metadata, dict):
    metadata = metadata.to_dict()

quality_report_path = 'CopulaGAN_Results/Hulk/quality.pkl'
# Make sure the output directory exists before saving the report.
os.makedirs(os.path.dirname(quality_report_path), exist_ok=True)

my_report = QualityReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath=quality_report_path)
my_report.get_visualization(property_name='Column Pair Trends')

In [None]:
# Saving the diagnostic report and visualizing data validity

# The report is given the metadata as a dictionary. Convert locally when `metadata`
# is still a metadata object, so this cell does not depend on another cell having
# already replaced `metadata` with its dictionary form.
metadata_dict = metadata if isinstance(metadata, dict) else metadata.to_dict()

diagnostic_report_path = 'CopulaGAN_Results/Hulk/diagnostic.pkl'
# Make sure the output directory exists before saving the report.
os.makedirs(os.path.dirname(diagnostic_report_path), exist_ok=True)

my_report = DiagnosticReport()
my_report.generate(real_data, synthetic_data, metadata_dict)
my_report.save(filepath=diagnostic_report_path)
my_report.get_visualization('Data Validity')

In [None]:
# Statistical similarity metric (median), computed per continuous column.
# StatisticSimilarity is a numerical/datetime metric, so the categorical 'Label'
# column is skipped by iterating over `continuous_columns` instead of all columns.
sstest = [
    StatisticSimilarity.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data[column],
        statistic='median'
    )
    for column in continuous_columns
]

# One score per column, indexed by column name so individual scores can be inspected.
df = pd.DataFrame(sstest, columns=['SS Test'], index=continuous_columns)

# Average similarity over all continuous columns.
print(df['SS Test'].mean())

In [None]:
# Range coverage metric, computed per continuous column.
# RangeCoverage is a numerical/datetime metric, so the categorical 'Label'
# column is skipped by iterating over `continuous_columns` instead of all columns.
range_coverage = [
    RangeCoverage.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data[column]
    )
    for column in continuous_columns
]

# One score per column, indexed by column name so individual scores can be inspected.
df = pd.DataFrame(range_coverage, columns=['Range Coverage'], index=continuous_columns)

# Average coverage over all continuous columns.
print(df['Range Coverage'].mean())

In [None]:
# Stack real and synthetic rows, then report how many rows remain once rows with
# missing values and duplicated rows have been removed.
df = pd.concat((real_data, synthetic_data))
print(df.shape)
df = df.dropna().drop_duplicates()
print(df.shape)

In [None]:
#Saving the distribution of each column
def sanitize_column_name(column_name):
    """Return ``column_name`` with every non-whitelisted character removed.

    Only ASCII letters, digits, spaces and the characters ``-_.()`` are kept;
    any other character is dropped (not replaced).
    """
    allowed = set("-_.() " + string.ascii_letters + string.digits)
    kept = [character for character in column_name if character in allowed]
    return ''.join(kept)

# Render a real-vs-synthetic bar plot for every column and save each one as a PNG.
pics_dir = 'CopulaGAN_Results/Hulk/Pics'  # change the location accordingly

# Create the target directory up front so the image export does not fail
# when the folder has not been created yet.
os.makedirs(pics_dir, exist_ok=True)

for i in real_data.columns:
    fig = get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name=i,
        plot_type='bar'
    )

    # Column names may contain characters that are not valid in file names.
    sanitized_column_name = sanitize_column_name(i)

    # Save the figure in the 'Pics' directory
    py.write_image(fig, os.path.join(pics_dir, f"{sanitized_column_name}.png"))
