Skip to content
Snippets Groups Projects
Commit 5afcba29 authored by Sayed Saeedi's avatar Sayed Saeedi
Browse files

Code for combining synthetic data and evaluating

parent d737673d
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id:afdb8408-8328-49ec-95ad-1ad1b45217f8 tags:
``` python
import pandas as pd
import numpy as np
import os
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from table_evaluator import TableEvaluator
```
%% Cell type:code id:1eb44865-7c03-46df-9780-aca27a6d0494 tags:
``` python
#creating a combination of real data
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')
# brute = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')
# goldenEye = real_data[real_data.Label=='DoS attacks-GoldenEye']
# slowloris = real_data[real_data.Label=='DoS attacks-Slowloris']
# hulk = real_data[real_data.Label=='DoS attacks-Hulk']
# hulk = hulk.iloc[:300000, :]
# slowHTTPtest = real_data[real_data.Label=='DoS attacks-SlowHTTPTest']
# loicHTTp = real_data[real_data.Label=='DDoS attacks-LOIC-HTTP']
# loicHTTp = loicHTTp.iloc[:300000, :]
# hoic = real_data[real_data.Label=='DDOS attack-HOIC']
# hoic = hoic.iloc[:300000, :]
# bot = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')
# infilteration = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')
# benign = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')
# benign = benign.iloc[:300000, :]
# ftp = brute[brute.Label=='FTP-BruteForce']
# ssh = brute[brute.Label=='SSH-Bruteforce']
# real_all_in_one = pd.concat([goldenEye, slowloris, hulk, slowHTTPtest, loicHTTp, hoic, bot, infilteration, benign, ftp, ssh], ignore_index=True)
```
%% Cell type:code id:79233ab4-949f-40bb-ae7b-f338d5076a13 tags:
``` python
# Read every per-model CSV in the results folder and combine them into one frame.
folder_path = 'RTVAE_Results'
# The combined output ('RTVAE_synthetic_all.csv') is saved into this same folder,
# so it is skipped here; otherwise re-running this cell would read it back in and
# duplicate every row. sorted() makes the row order reproducible, because
# os.listdir returns entries in arbitrary order.
csv_files = sorted(
    file for file in os.listdir(folder_path)
    if file.endswith('.csv') and file != 'RTVAE_synthetic_all.csv'
)
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in '{folder_path}'")
dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
synthetic_all_in_one = pd.concat(dfs, ignore_index=True)
```
%% Cell type:code id:40d00c4b-31a6-4a88-8b53-9d212d4f807e tags:
``` python
# Persist the combined synthetic data; the real-data export below stays disabled.
# real_all_in_one.to_csv('TVAE_Results/real_all_in_one.csv', index=False)
synthetic_all_in_one.to_csv(
    path_or_buf='RTVAE_Results/RTVAE_synthetic_all.csv',
    index=False,
)
```
%% Cell type:code id:a7e4c7ea-8b6d-4537-83ed-cda57173a704 tags:
``` python
# Sanity check: print the shape before and after dropping rows with missing values.
# synthetic_data is an alias (not a copy), so the in-place drop also affects
# synthetic_all_in_one.
synthetic_data = synthetic_all_in_one
print(synthetic_data.shape)
synthetic_data.dropna(axis=0, inplace=True)
print(synthetic_data.shape)
```
%% Cell type:code id:ae9bd1d9-52d7-4703-890c-a2588a15b417 tags:
``` python
# Load the real and synthetic datasets that the evaluation cells compare.
real_data, synthetic_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'TVAE_Results/real_all_in_one.csv',
        'TabFairGAN_Results/TabFairGAN_synthetic_all.csv',
    )
)
```
%% Cell type:code id:549bce4a-e6ea-457d-91a9-dfa9f95fe073 tags:
``` python
def get_data_info(df):
    """Create the categorical columns, continuous columns, and metadata of a dataframe.

    Args:
        df (pandas.DataFrame): The input dataframe containing continuous and
            categorical values. It must contain a 'Label' column.

    Returns:
        list: The categorical column names. This is fixed to ``['Label']``.
        list: The continuous column names, i.e. every other column of ``df``
            in its original order.
        SingleTableMetadata: The metadata detected from ``df``, with the column
            sdtypes overridden as above and validated against ``df``. For more
            information visit
            https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json
    """
    categorical_columns = ['Label']
    continuous_columns = [
        column for column in df.columns if column not in categorical_columns
    ]

    # Start from auto-detected metadata, then force the sdtype of every column so
    # the result does not depend on what detection guessed.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)
    for column in categorical_columns:
        metadata.update_column(column_name=column, sdtype='categorical')
    for column in continuous_columns:
        metadata.update_column(column_name=column, sdtype='numerical')

    # Validate the metadata itself, then validate it against the dataframe that
    # was passed in (not against a global), so the function works for any input.
    metadata.validate()
    metadata.validate_data(data=df)
    return categorical_columns, continuous_columns, metadata
# Derive the column lists and the metadata from the real dataset.
categorical_columns, continuous_columns, metadata = get_data_info(df=real_data)
```
%% Cell type:code id:fc6db1ff-e2fc-4817-87d1-22743468b7b8 tags:
``` python
# Compare real and synthetic data visually with table_evaluator
# (per-feature cumulative sums and distributions).
table_evaluator = TableEvaluator(
    real_data,
    synthetic_data,
    cat_cols=categorical_columns,
)
table_evaluator.visual_evaluation()
```
%% Cell type:code id:7bc79e6f-0232-4e2b-9b19-b2fd8825a687 tags:
``` python
# Generate and save the quality report, then visualize the column pair trends.
# The report is given the metadata as a plain dict. Convert only when it is not
# already one, so re-running this cell does not fail calling .to_dict() on a dict.
if not isinstance(metadata, dict):
    metadata = metadata.to_dict()
my_report = QualityReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath='RTVAE_Results/quality.pkl')
my_report.get_visualization(property_name='Column Pair Trends')
```
%% Cell type:code id:9332967a-f289-454a-9e20-ff73c7f9bbcf tags:
``` python
# Generate and save the diagnostic report, then visualize data validity.
# The report is given the metadata as a plain dict. Convert here if that has not
# happened yet, so this cell does not depend on another cell having run first.
if not isinstance(metadata, dict):
    metadata = metadata.to_dict()
my_report = DiagnosticReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath='RTVAE_Results/diagnostic.pkl')
my_report.get_visualization('Data Validity')
```
%% Cell type:code id:ad3689d9-2361-4dfa-8efb-8d72d2859b7a tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment