Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MasterThesis - AryanSaeedi
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Andreas Urmann
MasterThesis - AryanSaeedi
Commits
d7ddec25
Commit
d7ddec25
authored
1 year ago
by
Sayed Saeedi
Browse files
Options
Downloads
Patches
Plain Diff
codes for CTGAN, CopulaGAN, and TVAE
parent
9b2f5dff
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Models/SDV.ipynb
+288
-0
288 additions, 0 deletions
Models/SDV.ipynb
with
288 additions
and
0 deletions
Models/SDV.ipynb
0 → 100644
+
288
−
0
View file @
d7ddec25
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "aea43308-a588-4730-a10e-48156b0d1aa5",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sdv.single_table import CTGANSynthesizer\n",
"from sdv.single_table import TVAESynthesizer \n",
"from sdv.single_table import CopulaGANSynthesizer\n",
"from sdv.metadata import SingleTableMetadata\n",
"from sdmetrics.reports.single_table import QualityReport\n",
"from sdmetrics.reports.single_table import DiagnosticReport\n",
"from table_evaluator import TableEvaluator\n",
"import matplotlib.pyplot as plt\n",
"from sdmetrics.single_column import StatisticSimilarity\n",
"import math\n",
"from sdmetrics.single_column import RangeCoverage\n",
"from sdmetrics.visualization import get_column_plot\n",
"import os\n",
"import plotly.io as py\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6e0f097-891b-42fc-9e20-85145c8d24ac",
"metadata": {},
"outputs": [],
"source": [
"#loading the preprocessed datasets \n",
"\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')\n",
"\n",
"print(real_data.shape)\n",
"print(real_data.Label.unique())\n",
"\n",
"# if bruteforce_attack or dos_attacks are used then uncomment the below line and change the name of the dataset accordingly\n",
"#real_data=real_data[real_data.Label=='DoS attacks-Hulk'] # change according to the dataset\n",
"real_data = real_data.iloc[:300000, :]\n",
"print(real_data.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc6915d1-0c8a-4d7e-8b87-0c56ae9c6431",
"metadata": {},
"outputs": [],
"source": [
"# Categorical columns & Continuous columns\n",
"def get_data_info(df):\n",
" \"\"\"Creates the categorical columns, continuous columns, and metadata of a dataframe.\n",
"\n",
" Args:\n",
" df (pandas.Dataframe): The input dataframe containing continuous and categorical values.\n",
"\n",
" Returns:\n",
" list: The list of categorical column names (currently only 'Label').\n",
" list: The list of continuous column names.\n",
" metadata: The metadata of the dataframe. For more information visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json\n",
" \"\"\"\n",
" # creating the categorical and continuous column lists\n",
" categorical_columns = ['Label']\n",
" continuous_columns = []\n",
"\n",
" for i in df.columns:\n",
" if i not in categorical_columns:\n",
" continuous_columns.append(i)\n",
" \n",
" #creating metadata\n",
" metadata = SingleTableMetadata()\n",
" metadata.detect_from_dataframe(df)\n",
" \n",
" for column in categorical_columns:\n",
" metadata.update_column(\n",
" column_name = column,\n",
" sdtype = 'categorical'\n",
" )\n",
" \n",
" for column in continuous_columns:\n",
" metadata.update_column(\n",
" column_name = column,\n",
" sdtype = 'numerical' \n",
" )\n",
" # validating metadata\n",
" metadata.validate()\n",
" metadata.validate_data(data=real_data)\n",
" \n",
" return categorical_columns, continuous_columns, metadata\n",
"\n",
"\n",
"categorical_columns, continuous_columns, metadata = get_data_info(real_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd625bf1-52ed-4bda-9e69-b62009c0422f",
"metadata": {},
"outputs": [],
"source": [
"#fitting the synthesizer\n",
"# for more info check https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers\n",
"#available options CTGANSynthesizer, CopulaGANSynthesizer, and TVAESynthesizer\n",
"\n",
"#uncomment below for CTGANSynthesizer and CopulaGANSynthesizer\n",
"# synthesizer = CTGANSynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=128, generator_dim=(256, 256), \n",
"# discriminator_dim=(256, 256), generator_lr=0.000001, generator_decay=0.000001, epochs=500, discriminator_lr=0.000001, \n",
"# discriminator_decay=0.000001, batch_size=300, discriminator_steps=3, log_frequency=True, verbose=True, pac=10)\n",
"\n",
"\n",
"#uncomment below for TVAESynthesizer\n",
"# synthesizer = TVAESynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=100, compress_dims=(128, 128), \n",
"# decompress_dims=(128, 128), l2scale=0.000001, batch_size=500, epochs=500, loss_factor=2, cuda=True)\n",
"\n",
"\n",
"\n",
"synthesizer.fit(real_data)\n",
"synthesizer.save(filepath='CopulaGAN_Results/Hulk/CopulaGAN.pkl') # change the path accordingly\n",
"synthetic_data = synthesizer.sample(300000) # change the instances you want to be genereated"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60019781-cf26-4a21-a1b5-0fd099f07972",
"metadata": {},
"outputs": [],
"source": [
"# evaluating synthetic data with table_evaluator cumulative sum per features and distribution\n",
"table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)\n",
"table_evaluator.visual_evaluation()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "273ea89c-161b-4c73-ac96-412effb4e8bb",
"metadata": {},
"outputs": [],
"source": [
"#saving and visualizing column pair trend and column shapes\n",
"metadata = metadata.to_dict()\n",
"my_report = QualityReport()\n",
"my_report.generate(real_data, synthetic_data, metadata)\n",
"my_report.save(filepath='CopulaGAN_Results/Hulk/quality.pkl')\n",
"my_report.get_visualization(property_name='Column Pair Trends')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "169f33e3-f1e3-4677-b092-742db80d9aa6",
"metadata": {},
"outputs": [],
"source": [
"#saving and visualizing data validity\n",
"my_report = DiagnosticReport()\n",
"my_report.generate(real_data, synthetic_data, metadata)\n",
"my_report.save(filepath='CopulaGAN_Results/Hulk/diagnostic.pkl')\n",
"my_report.get_visualization('Data Validity')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a46d7cdd-d907-4a49-bb8a-cac585c1776b",
"metadata": {},
"outputs": [],
"source": [
"#statistical similarity metric\n",
"sstest=[]\n",
"for i in real_data.columns:\n",
" y=StatisticSimilarity.compute(\n",
" real_data=real_data[i],\n",
" synthetic_data=synthetic_data[i],\n",
" statistic='median'\n",
" )\n",
" sstest.append(y)\n",
"\n",
"df = pd.DataFrame(sstest, columns=['SS Test'])\n",
"\n",
"print(df['SS Test'].mean())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c23e2399-5b2c-40aa-9ce7-1bc36ccbe119",
"metadata": {},
"outputs": [],
"source": [
"#range coverage metric\n",
"range_coverage=[]\n",
"for i in real_data.columns:\n",
" \n",
" y=RangeCoverage.compute(\n",
" real_data=real_data[i],\n",
" synthetic_data=synthetic_data[i]\n",
" )\n",
" range_coverage.append(y)\n",
"df = pd.DataFrame(range_coverage, columns=['Range Coverage'])\n",
"\n",
"print(df['Range Coverage'].mean())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7e00eb7-5133-4374-a7dc-1d131ddc387f",
"metadata": {},
"outputs": [],
"source": [
"# checking the number of unique synthetic data instances\n",
"df = pd.concat([real_data, synthetic_data], axis=0)\n",
"print(df.shape)\n",
"df.dropna(inplace=True)\n",
"df.drop_duplicates(inplace=True)\n",
"print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85b532bf-91f1-4cb5-8fee-7a74d71a000c",
"metadata": {},
"outputs": [],
"source": [
"#Saving the distribution of each column\n",
"def sanitize_column_name(column_name):\n",
" valid_chars = \"-_.() %s%s\" % (string.ascii_letters, string.digits)\n",
" return ''.join(c for c in column_name if c in valid_chars)\n",
"\n",
"for i in real_data.columns:\n",
" fig = get_column_plot(\n",
" real_data=real_data,\n",
" synthetic_data=synthetic_data,\n",
" column_name=i,\n",
" plot_type='bar'\n",
" )\n",
"\n",
" sanitized_column_name = sanitize_column_name(i)\n",
"\n",
" # Save the figure in the 'Pics' directory, change the location accordingly\n",
" py.write_image(fig, os.path.join('CopulaGAN_Results/Hulk/Pics', f\"{sanitized_column_name}.png\")) \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "edf6097f-6274-4a32-8b13-c91cea49166f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:aea43308-a588-4730-a10e-48156b0d1aa5 tags:
```
python
import
pandas
as
pd
import
numpy
as
np
from
sdv.single_table
import
CTGANSynthesizer
from
sdv.single_table
import
TVAESynthesizer
from
sdv.single_table
import
CopulaGANSynthesizer
from
sdv.metadata
import
SingleTableMetadata
from
sdmetrics.reports.single_table
import
QualityReport
from
sdmetrics.reports.single_table
import
DiagnosticReport
from
table_evaluator
import
TableEvaluator
import
matplotlib.pyplot
as
plt
from
sdmetrics.single_column
import
StatisticSimilarity
import
math
from
sdmetrics.single_column
import
RangeCoverage
from
sdmetrics.visualization
import
get_column_plot
import
os
import
plotly.io
as
py
import
string
```
%% Cell type:code id:f6e0f097-891b-42fc-9e20-85145c8d24ac tags:
```
python
# Loading the preprocessed datasets: uncomment exactly one of the lines below
# so that `real_data` is defined before the rest of this cell runs.
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')

# Show the size of the loaded data and the labels it contains.
print(real_data.shape)
print(real_data['Label'].unique())

# if bruteforce_attack or dos_attacks are used then uncomment the below line and change the name of the dataset accordingly
#real_data=real_data[real_data.Label=='DoS attacks-Hulk'] # change according to the dataset

# Keep at most the first 300000 rows (all columns).
real_data = real_data.iloc[:300000]
print(real_data.shape)
```
%% Cell type:code id:bc6915d1-0c8a-4d7e-8b87-0c56ae9c6431 tags:
```
python
# Categorical columns & Continuous columns
def get_data_info(df):
    """Build the column lists and the SDV metadata for a dataframe.

    ``Label`` is hard-coded as the only categorical column; every other
    column of ``df`` is treated as continuous (numerical).

    Args:
        df (pandas.DataFrame): The input dataframe containing continuous and
            categorical values.

    Returns:
        tuple: ``(categorical_columns, continuous_columns, metadata)``.
            The first two items are lists of column names. ``metadata`` is
            the ``SingleTableMetadata`` detected from ``df`` with the sdtype
            of each column overridden as described above. For more
            information visit
            https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json
    """
    categorical_columns = ['Label']
    continuous_columns = [
        column for column in df.columns if column not in categorical_columns
    ]

    # Start from the automatically detected metadata, then force the sdtypes.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)

    for column in categorical_columns:
        metadata.update_column(column_name=column, sdtype='categorical')

    for column in continuous_columns:
        metadata.update_column(column_name=column, sdtype='numerical')

    # Validate the metadata, then validate it against the dataframe that was
    # passed in (not a module-level global), so the function works for any
    # dataframe it is given.
    metadata.validate()
    metadata.validate_data(data=df)

    return categorical_columns, continuous_columns, metadata


categorical_columns, continuous_columns, metadata = get_data_info(real_data)
```
%% Cell type:code id:bd625bf1-52ed-4bda-9e69-b62009c0422f tags:
```
python
# Fitting the synthesizer
# for more info check https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers
# Available options: CTGANSynthesizer, CopulaGANSynthesizer, and TVAESynthesizer

# Uncomment below for CTGANSynthesizer and CopulaGANSynthesizer
# synthesizer = CTGANSynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=128, generator_dim=(256, 256),
# discriminator_dim=(256, 256), generator_lr=0.000001, generator_decay=0.000001, epochs=500, discriminator_lr=0.000001,
# discriminator_decay=0.000001, batch_size=300, discriminator_steps=3, log_frequency=True, verbose=True, pac=10)

# Uncomment below for TVAESynthesizer
# synthesizer = TVAESynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=True, embedding_dim=100, compress_dims=(128, 128),
# decompress_dims=(128, 128), l2scale=0.000001, batch_size=500, epochs=500, loss_factor=2, cuda=True)

# `synthesizer` must have been created by one of the commented-out lines above.
synthesizer.fit(real_data)
synthesizer.save(filepath='CopulaGAN_Results/Hulk/CopulaGAN.pkl')  # change the path accordingly
synthetic_data = synthesizer.sample(300000)  # change the number of instances you want to be generated
```
%% Cell type:code id:60019781-cf26-4a21-a1b5-0fd099f07972 tags:
```
python
# Evaluating the synthetic data with table_evaluator
# (cumulative sum per feature and distribution plots).
table_evaluator = TableEvaluator(
    real_data,
    synthetic_data,
    cat_cols=categorical_columns,
)
table_evaluator.visual_evaluation()
```
%% Cell type:code id:273ea89c-161b-4c73-ac96-412effb4e8bb tags:
```
python
# Saving and visualizing column pair trends and column shapes.
# The report is generated from the dictionary form of the metadata. Convert
# only when `metadata` is not already a dict, so that re-running this cell
# does not fail with an AttributeError on an already-converted value.
if not isinstance(metadata, dict):
    metadata = metadata.to_dict()

my_report = QualityReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath='CopulaGAN_Results/Hulk/quality.pkl')
my_report.get_visualization(property_name='Column Pair Trends')
```
%% Cell type:code id:169f33e3-f1e3-4677-b092-742db80d9aa6 tags:
```
python
# Saving and visualizing data validity.
# Accept `metadata` either as a dict or as an object exposing `to_dict()`,
# so this cell works whether or not `metadata` has already been converted.
report_metadata = metadata if isinstance(metadata, dict) else metadata.to_dict()

my_report = DiagnosticReport()
my_report.generate(real_data, synthetic_data, report_metadata)
my_report.save(filepath='CopulaGAN_Results/Hulk/diagnostic.pkl')
my_report.get_visualization('Data Validity')
```
%% Cell type:code id:a46d7cdd-d907-4a49-bb8a-cac585c1776b tags:
```
python
# Statistical similarity metric: one score per column (using the median),
# then the mean of those scores.
# NOTE(review): this iterates every column of real_data, including the
# categorical 'Label' column - confirm the metric accepts it.
sstest = [
    StatisticSimilarity.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data[column],
        statistic='median',
    )
    for column in real_data.columns
]

df = pd.DataFrame(sstest, columns=['SS Test'])

print(df['SS Test'].mean())
```
%% Cell type:code id:c23e2399-5b2c-40aa-9ce7-1bc36ccbe119 tags:
```
python
# Range coverage metric: one score per column, then the mean of those scores.
# NOTE(review): this iterates every column of real_data, including the
# categorical 'Label' column - confirm the metric accepts it.
range_coverage = [
    RangeCoverage.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data[column],
    )
    for column in real_data.columns
]

df = pd.DataFrame(range_coverage, columns=['Range Coverage'])

print(df['Range Coverage'].mean())
```
%% Cell type:code id:d7e00eb7-5133-4374-a7dc-1d131ddc387f tags:
```
python
# Checking how many rows remain once the real and synthetic data are stacked
# and rows with missing values or exact duplicates are removed.
df = pd.concat([real_data, synthetic_data], axis=0)
print(df.shape)

df = df.dropna().drop_duplicates()
print(df.shape)
```
%% Cell type:code id:85b532bf-91f1-4cb5-8fee-7a74d71a000c tags:
```
python
#Saving the distribution of each column
def sanitize_column_name(column_name):
    """Return ``column_name`` with every character outside the allow-list removed.

    The allow-list is ASCII letters, digits, the space character and the
    characters ``-_.()``. The result is used as a file name stem.

    Args:
        column_name (str): The column name to clean up.

    Returns:
        str: The kept characters, in their original order.
    """
    allowed = "-_.() %s%s" % (string.ascii_letters, string.digits)
    kept_chars = [char for char in column_name if char in allowed]
    return ''.join(kept_chars)
# Directory the per-column figures are written to; change the location accordingly.
output_dir = 'CopulaGAN_Results/Hulk/Pics'
# Make sure the output directory exists before any image is written to it.
os.makedirs(output_dir, exist_ok=True)

for i in real_data.columns:
    # Bar plot comparing the real and synthetic values of this column.
    fig = get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name=i,
        plot_type='bar',
    )

    # Column names may contain characters that are not valid in file names.
    sanitized_column_name = sanitize_column_name(i)

    py.write_image(fig, os.path.join(output_dir, f"{sanitized_column_name}.png"))
```
%% Cell type:code id:edf6097f-6274-4a32-8b13-c91cea49166f tags:
```
python
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment