Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MasterThesis - AryanSaeedi
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Andreas Urmann
MasterThesis - AryanSaeedi
Commits
1a322564
Commit
1a322564
authored
1 year ago
by
Sayed Saeedi
Browse files
Options
Downloads
Patches
Plain Diff
Codes for ADSGAN and RTVAE
parent
47c8160a
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Models/Synthcity.ipynb
+281
-0
281 additions, 0 deletions
Models/Synthcity.ipynb
with
281 additions
and
0 deletions
Models/Synthcity.ipynb
0 → 100644
+
281
−
0
View file @
1a322564
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "299dec31",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sdv.metadata import SingleTableMetadata\n",
"from sdmetrics.reports.single_table import QualityReport\n",
"from sdmetrics.reports.single_table import DiagnosticReport\n",
"from table_evaluator import TableEvaluator\n",
"import matplotlib.pyplot as plt\n",
"from sdmetrics.single_column import StatisticSimilarity\n",
"import math\n",
"from sdmetrics.single_column import RangeCoverage\n",
"from sdmetrics.visualization import get_column_plot\n",
"import os\n",
"import plotly.io as py\n",
"import string\n",
"\n",
"from synthcity.plugins import Plugins\n",
"\n",
"#Plugins(categories=[\"generic\", \"privacy\"]).list() #uncomment to see a list of model for generating data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6127a704",
"metadata": {},
"outputs": [],
"source": [
"#loading the preprocessed datasets \n",
"\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')\n",
"\n",
"print(real_data.shape)\n",
"print(real_data.Label.unique())\n",
"\n",
"# if bruteforce_attack or dos_attacks are used then uncomment the below line and change the name of the dataset accordingly\n",
"#real_data=real_data[real_data.Label=='SSH-Bruteforce'] # change according to the dataset\n",
"real_data = real_data.iloc[:300000, :]\n",
"print(real_data.shape) # "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c41b506-aa5c-419c-8a49-f246de3ee6ae",
"metadata": {},
"outputs": [],
"source": [
"# imported from https://github.com/vanderschaarlab/synthcity/tree/main\n",
"#if using ADSGAN just change \"rtvae\" to \"adsgan\"\n",
"\n",
"syn_model = Plugins().get(\"rtvae\", n_iter= 500, lr=0.0001, batch_size= 300, decoder_n_layers_hidden=4, encoder_n_layers_hidden=4)\n",
"\n",
"syn_model.fit(real_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2510624-8cfc-480b-88ee-e2234117cb50",
"metadata": {},
"outputs": [],
"source": [
"# generating synthetic data and saving the file\n",
"synthetic_data=syn_model.generate(300000).dataframe()\n",
"synthetic_data.to_csv('RTVAE_Results/LOICHTTP.csv', index=False) #similar to the loaded dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d895ace2-e2d9-4ee0-886a-742aebcbd6c0",
"metadata": {},
"outputs": [],
"source": [
"def get_data_info(df):\n",
" \"\"\"Creates the categorical columns, continuous columns, and metadata of a dataframe.\n",
"\n",
" Args:\n",
" df (pandas.Dataframe): The input dataframe containing continuous and categorical values.\n",
"\n",
" Returns:\n",
" list: The list of categorical column names. Specifically, columns with only 4 unique values\n",
" list: The list of continuous column names.\n",
" metadata: The metadata of the dataframe. For more information visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json\n",
" \"\"\"\n",
" #createing \n",
" categorical_columns = ['Label']\n",
" continuous_columns = []\n",
" for i in df.columns:\n",
" if i not in categorical_columns:\n",
" continuous_columns.append(i)\n",
" \n",
" #creating metadata\n",
" metadata = SingleTableMetadata()\n",
" metadata.detect_from_dataframe(df)\n",
" \n",
" for column in categorical_columns:\n",
" metadata.update_column(\n",
" column_name = column,\n",
" sdtype = 'categorical'\n",
" )\n",
" \n",
" for column in continuous_columns:\n",
" metadata.update_column(\n",
" column_name = column,\n",
" sdtype = 'numerical' \n",
" )\n",
" # validating metadata\n",
" metadata.validate()\n",
" metadata.validate_data(data=real_data)\n",
" \n",
" return categorical_columns, continuous_columns, metadata\n",
"\n",
"\n",
"categorical_columns, continuous_columns, metadata = get_data_info(real_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90330684-4fed-4571-9026-4cb04250e475",
"metadata": {},
"outputs": [],
"source": [
"# evaluating synthetic data with table_evaluator cumulative sum per features and distribution\n",
"table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)\n",
"table_evaluator.visual_evaluation()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8833c4f-ec42-413f-96ec-34516401ec8b",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"#saving and visualizing column pair trend and column shapes\n",
"metadata = metadata.to_dict()\n",
"my_report = QualityReport()\n",
"my_report.generate(real_data, synthetic_data, metadata)\n",
"my_report.save(filepath='RTVAE_Results/LOICHTTP/quality.pkl')\n",
"my_report.get_visualization(property_name='Column Pair Trends')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a510048-8949-45a4-b9d7-543b211fc710",
"metadata": {},
"outputs": [],
"source": [
"#saving and visualizing data validity\n",
"#metadata = metadata.to_dict()\n",
"my_report = DiagnosticReport()\n",
"my_report.generate(real_data, synthetic_data, metadata)\n",
"my_report.save(filepath='RTVAE_Results/LOICHTTP.csv/diagnostic.pkl')\n",
"#my_report.get_visualization('Data Validity')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "668f18aa-5a36-4a72-8c47-5c549a7a5a86",
"metadata": {},
"outputs": [],
"source": [
"#statistical similarity metric\n",
"sstest=[]\n",
"for i in real_data.columns:\n",
" y=StatisticSimilarity.compute(\n",
" real_data=real_data[i],\n",
" synthetic_data=synthetic_data[i],\n",
" statistic='median'\n",
" )\n",
" sstest.append(y)\n",
"\n",
"df = pd.DataFrame(sstest, columns=['SS Test'])\n",
"\n",
"print(df['SS Test'].mean())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "142c89a4-7977-49a0-8aed-69edf12ea07b",
"metadata": {},
"outputs": [],
"source": [
"#range coverage metric\n",
"range_coverage=[]\n",
"for i in real_data.columns:\n",
" \n",
" y=RangeCoverage.compute(\n",
" real_data=real_data[i],\n",
" synthetic_data=synthetic_data[i]\n",
" )\n",
" range_coverage.append(y)\n",
"df = pd.DataFrame(range_coverage, columns=['Range Coverage'])\n",
"\n",
"print(df['Range Coverage'].mean())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62ec2b27-5262-4906-a1b3-eb755c7dc0da",
"metadata": {},
"outputs": [],
"source": [
"# checking the number of unique synthetic data instances\n",
"df = pd.concat([real_data, synthetic_data], axis=0)\n",
"print(df.shape)\n",
"df.dropna(inplace=True)\n",
"df.drop_duplicates(inplace=True)\n",
"print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53ff9fbd-1632-4d5f-84ce-f64d13305a9b",
"metadata": {},
"outputs": [],
"source": [
"#Saving the distribution of each column\n",
"def sanitize_column_name(column_name):\n",
" valid_chars = \"-_.() %s%s\" % (string.ascii_letters, string.digits)\n",
" return ''.join(c for c in column_name if c in valid_chars)\n",
"\n",
"for i in real_data.columns:\n",
" fig = get_column_plot(\n",
" real_data=real_data,\n",
" synthetic_data=synthetic_data,\n",
" column_name=i,\n",
" plot_type='bar'\n",
" )\n",
"\n",
" sanitized_column_name = sanitize_column_name(i)\n",
"\n",
" # Save the figure in the 'Pics' directory, change the location accordingly\n",
" py.write_image(fig, os.path.join('RTVAE_Results/LOICHTTP/Pics', f\"{sanitized_column_name}.png\")) \n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:299dec31 tags:
```
python
import
pandas
as
pd
import
numpy
as
np
from
sdv.metadata
import
SingleTableMetadata
from
sdmetrics.reports.single_table
import
QualityReport
from
sdmetrics.reports.single_table
import
DiagnosticReport
from
table_evaluator
import
TableEvaluator
import
matplotlib.pyplot
as
plt
from
sdmetrics.single_column
import
StatisticSimilarity
import
math
from
sdmetrics.single_column
import
RangeCoverage
from
sdmetrics.visualization
import
get_column_plot
import
os
import
plotly.io
as
py
import
string
from
synthcity.plugins
import
Plugins
#Plugins(categories=["generic", "privacy"]).list() #uncomment to see a list of model for generating data
```
%% Cell type:code id:6127a704 tags:
```
python
# Load one of the preprocessed datasets: uncomment exactly one of the lines below.

# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')

# Report the size of the loaded table and the labels it contains.
print(real_data.shape)
print(real_data['Label'].unique())

# If the bruteforce or DoS dataset is used, uncomment the filter below and
# change the label according to the dataset.
# real_data = real_data[real_data.Label == 'SSH-Bruteforce']

# Keep only the first 300000 rows.
real_data = real_data.head(300000)
print(real_data.shape)
```
%% Cell type:code id:9c41b506-aa5c-419c-8a49-f246de3ee6ae tags:
```
python
# Plugin taken from https://github.com/vanderschaarlab/synthcity/tree/main
# To use ADSGAN instead, change "rtvae" to "adsgan".
syn_model = Plugins().get(
    "rtvae",
    n_iter=500,
    lr=1e-4,
    batch_size=300,
    decoder_n_layers_hidden=4,
    encoder_n_layers_hidden=4,
)

# Train the generator on the real data.
syn_model.fit(real_data)
```
%% Cell type:code id:c2510624-8cfc-480b-88ee-e2234117cb50 tags:
```
python
# Generate the synthetic data and save it as a CSV file.
# The file name should be similar to the dataset that was loaded.
os.makedirs('RTVAE_Results', exist_ok=True)  # to_csv does not create a missing output folder
synthetic_data = syn_model.generate(300000).dataframe()
synthetic_data.to_csv('RTVAE_Results/LOICHTTP.csv', index=False)
```
%% Cell type:code id:d895ace2-e2d9-4ee0-886a-742aebcbd6c0 tags:
```
python
def get_data_info(df):
    """Create the categorical columns, continuous columns, and metadata of a dataframe.

    Args:
        df (pandas.DataFrame): The input dataframe containing continuous and categorical values.

    Returns:
        list: The categorical column names (fixed to ``['Label']``).
        list: The continuous column names, i.e. every other column of ``df``.
        metadata: The ``SingleTableMetadata`` of the dataframe. For more information visit
            https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json
    """
    # Only the label is treated as categorical; all remaining columns are continuous.
    categorical_columns = ['Label']
    continuous_columns = [column for column in df.columns if column not in categorical_columns]

    # Auto-detect the metadata, then force the intended sdtype on every column.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)

    for column in categorical_columns:
        metadata.update_column(column_name=column, sdtype='categorical')

    for column in continuous_columns:
        metadata.update_column(column_name=column, sdtype='numerical')

    # Validate the metadata on its own and against the dataframe it was built
    # from (the argument, not the notebook-level real_data).
    metadata.validate()
    metadata.validate_data(data=df)

    return categorical_columns, continuous_columns, metadata


categorical_columns, continuous_columns, metadata = get_data_info(real_data)
```
%% Cell type:code id:90330684-4fed-4571-9026-4cb04250e475 tags:
```
python
# Evaluate the synthetic data with table_evaluator
# (cumulative sum per feature and distributions).
table_evaluator = TableEvaluator(
    real_data,
    synthetic_data,
    cat_cols=categorical_columns,
)
table_evaluator.visual_evaluation()
```
%% Cell type:code id:d8833c4f-ec42-413f-96ec-34516401ec8b tags:
```
python
# Save the quality report and visualize the column pair trends.
# The report is given the metadata as a plain dict. The isinstance guard keeps
# this cell safe to re-run after `metadata` has already been converted.
if not isinstance(metadata, dict):
    metadata = metadata.to_dict()

# Make sure the output folder exists before saving the report.
os.makedirs('RTVAE_Results/LOICHTTP', exist_ok=True)

my_report = QualityReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath='RTVAE_Results/LOICHTTP/quality.pkl')
my_report.get_visualization(property_name='Column Pair Trends')
```
%% Cell type:code id:9a510048-8949-45a4-b9d7-543b211fc710 tags:
```
python
# Save the diagnostic report (data validity).
# Convert the metadata to a plain dict if that has not been done yet, so this
# cell does not depend on an earlier cell having rebound `metadata`.
if not isinstance(metadata, dict):
    metadata = metadata.to_dict()

# Make sure the output folder exists before saving the report.
os.makedirs('RTVAE_Results/LOICHTTP', exist_ok=True)

my_report = DiagnosticReport()
my_report.generate(real_data, synthetic_data, metadata)
# 'RTVAE_Results/LOICHTTP.csv' is the synthetic-data CSV file, so it cannot be
# used as a folder; the report goes into 'RTVAE_Results/LOICHTTP/' instead.
my_report.save(filepath='RTVAE_Results/LOICHTTP/diagnostic.pkl')
# my_report.get_visualization('Data Validity')
```
%% Cell type:code id:668f18aa-5a36-4a72-8c47-5c549a7a5a86 tags:
```
python
# Statistical similarity metric (median), averaged over the continuous columns.
# The categorical 'Label' column is skipped: a median is not defined for it and
# StatisticSimilarity is meant for numerical/datetime columns.
sstest = [
    StatisticSimilarity.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data[column],
        statistic='median',
    )
    for column in continuous_columns
]

df = pd.DataFrame(sstest, columns=['SS Test'])

print(df['SS Test'].mean())
```
%% Cell type:code id:142c89a4-7977-49a0-8aed-69edf12ea07b tags:
```
python
# Range coverage metric, averaged over the continuous columns.
# The categorical 'Label' column is skipped: RangeCoverage is meant for
# numerical/datetime columns.
range_coverage = [
    RangeCoverage.compute(
        real_data=real_data[column],
        synthetic_data=synthetic_data[column],
    )
    for column in continuous_columns
]

df = pd.DataFrame(range_coverage, columns=['Range Coverage'])

print(df['Range Coverage'].mean())
```
%% Cell type:code id:62ec2b27-5262-4906-a1b3-eb755c7dc0da tags:
```
python
# Check how many unique rows remain: stack the real and synthetic rows, then
# drop rows with missing values and duplicated rows.
df = pd.concat([real_data, synthetic_data], axis=0)
print(df.shape)
df = df.dropna().drop_duplicates()
print(df.shape)
```
%% Cell type:code id:53ff9fbd-1632-4d5f-84ce-f64d13305a9b tags:
```
python
#Saving the distribution of each column
# Characters allowed in a saved file name; built once instead of on every call.
_VALID_FILENAME_CHARS = frozenset("-_.() " + string.ascii_letters + string.digits)


def sanitize_column_name(column_name):
    """Return ``column_name`` reduced to characters that are safe in a file name.

    Args:
        column_name: The column label. Non-string labels (e.g. integers) are
            converted with ``str`` first.

    Returns:
        str: The label with every character other than ASCII letters, digits,
        spaces and ``-_.()`` removed.
    """
    return ''.join(c for c in str(column_name) if c in _VALID_FILENAME_CHARS)
# Make sure the target folder exists before writing any image into it.
os.makedirs('RTVAE_Results/LOICHTTP/Pics', exist_ok=True)

for i in real_data.columns:
    # Bar plot comparing the real and synthetic distribution of this column.
    fig = get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name=i,
        plot_type='bar',
    )

    # Column names can contain characters that are not valid in file names.
    sanitized_column_name = sanitize_column_name(i)

    # Save the figure in the 'Pics' directory, change the location accordingly
    py.write_image(fig, os.path.join('RTVAE_Results/LOICHTTP/Pics', f"{sanitized_column_name}.png"))
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment