Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MasterThesis - AryanSaeedi
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Andreas Urmann
MasterThesis - AryanSaeedi
Commits
5afcba29
Commit
5afcba29
authored
1 year ago
by
Sayed Saeedi
Browse files
Options
Downloads
Patches
Plain Diff
Code for combining synthetic data and evaluating
parent
d737673d
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Models/Combining_everything.ipynb
+230
-0
230 additions, 0 deletions
Models/Combining_everything.ipynb
with
230 additions
and
0 deletions
Models/Combining_everything.ipynb
0 → 100644
+
230
−
0
View file @
5afcba29
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "afdb8408-8328-49ec-95ad-1ad1b45217f8",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from sdv.single_table import CTGANSynthesizer\n",
"from sdv.metadata import SingleTableMetadata\n",
"from sdmetrics.reports.single_table import QualityReport\n",
"from sdmetrics.reports.single_table import DiagnosticReport\n",
"from table_evaluator import TableEvaluator"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1eb44865-7c03-46df-9780-aca27a6d0494",
"metadata": {},
"outputs": [],
"source": [
"#creating a combination of real data\n",
"\n",
"# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')\n",
"# brute = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')\n",
"# goldenEye = real_data[real_data.Label=='DoS attacks-GoldenEye']\n",
"# slowloris = real_data[real_data.Label=='DoS attacks-Slowloris']\n",
"# hulk = real_data[real_data.Label=='DoS attacks-Hulk']\n",
"# hulk = hulk.iloc[:300000, :]\n",
"# slowHTTPtest = real_data[real_data.Label=='DoS attacks-SlowHTTPTest']\n",
"# loicHTTp = real_data[real_data.Label=='DDoS attacks-LOIC-HTTP']\n",
"# loicHTTp = loicHTTp.iloc[:300000, :]\n",
"# hoic = real_data[real_data.Label=='DDOS attack-HOIC']\n",
"# hoic = hoic.iloc[:300000, :]\n",
"# bot = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')\n",
"# infilteration = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')\n",
"# benign = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')\n",
"# benign = benign.iloc[:300000, :]\n",
"# ftp = brute[brute.Label=='FTP-BruteForce']\n",
"# ssh = brute[brute.Label=='SSH-Bruteforce']\n",
"\n",
"# real_all_in_one = pd.concat([goldenEye, slowloris, hulk, slowHTTPtest, loicHTTp, hoic, bot, infilteration, benign, ftp, ssh], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79233ab4-949f-40bb-ae7b-f338d5076a13",
"metadata": {},
"outputs": [],
"source": [
"# reading all csv files in the directory\n",
"\n",
"folder_path = 'RTVAE_Results'\n",
"csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]\n",
"dfs = []\n",
"for file in csv_files:\n",
" file_path = os.path.join(folder_path, file)\n",
" dfs.append(pd.read_csv(file_path))\n",
" \n",
"synthetic_all_in_one = pd.concat(dfs, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40d00c4b-31a6-4a88-8b53-9d212d4f807e",
"metadata": {},
"outputs": [],
"source": [
"#saving real and synthetic data\n",
"# real_all_in_one.to_csv('TVAE_Results/real_all_in_one.csv', index=False)\n",
"synthetic_all_in_one.to_csv('RTVAE_Results/RTVAE_synthetic_all.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7e4c7ea-8b6d-4537-83ed-cda57173a704",
"metadata": {},
"outputs": [],
"source": [
"#ensuring that everything went well\n",
"\n",
"synthetic_data = synthetic_all_in_one \n",
"print(synthetic_data.shape)\n",
"synthetic_data.dropna(inplace=True)\n",
"print(synthetic_data.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae9bd1d9-52d7-4703-890c-a2588a15b417",
"metadata": {},
"outputs": [],
"source": [
"#Loading real and synthetic data for evaluation\n",
"\n",
"real_data = pd.read_csv('TVAE_Results/real_all_in_one.csv')\n",
"synthetic_data = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "549bce4a-e6ea-457d-91a9-dfa9f95fe073",
"metadata": {},
"outputs": [],
"source": [
"def get_data_info(df):\n",
" \"\"\"Creates the categorical columns, continuous columns, and metadata of a dataframe.\n",
"\n",
" Args:\n",
" df (pandas.DataFrame): The input dataframe containing continuous and categorical values.\n",
"\n",
" Returns:\n",
" list: the list of categorical column names. Specifically, columns with only 4 unique values\n",
" list: The list of continuous column names.\n",
" metadata: The metadata of the dataframe. For more information visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json\n",
" \"\"\"\n",
" #creating the categorical and continuous column lists\n",
" categorical_columns = ['Label']\n",
" continuous_columns = []\n",
" for i in df.columns:\n",
" if i not in categorical_columns:\n",
" continuous_columns.append(i)\n",
" \n",
" #creating metadata\n",
" metadata = SingleTableMetadata()\n",
" metadata.detect_from_dataframe(df)\n",
" \n",
" for column in categorical_columns:\n",
" metadata.update_column(\n",
" column_name = column,\n",
" sdtype = 'categorical'\n",
" )\n",
" \n",
" for column in continuous_columns:\n",
" metadata.update_column(\n",
" column_name = column,\n",
" sdtype = 'numerical' \n",
" )\n",
" # validating metadata\n",
" metadata.validate()\n",
" metadata.validate_data(data=real_data)\n",
" \n",
" return categorical_columns, continuous_columns, metadata\n",
"\n",
"\n",
"categorical_columns, continuous_columns, metadata = get_data_info(real_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc6db1ff-e2fc-4817-87d1-22743468b7b8",
"metadata": {},
"outputs": [],
"source": [
"# evaluating synthetic data with table_evaluator cumulative sum per features and distribution\n",
"table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)\n",
"table_evaluator.visual_evaluation()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7bc79e6f-0232-4e2b-9b19-b2fd8825a687",
"metadata": {},
"outputs": [],
"source": [
"#saving and visualizing column pair trend and column shapes\n",
"metadata = metadata.to_dict()\n",
"my_report = QualityReport()\n",
"my_report.generate(real_data, synthetic_data, metadata)\n",
"my_report.save(filepath='RTVAE_Results/quality.pkl')\n",
"my_report.get_visualization(property_name='Column Pair Trends')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9332967a-f289-454a-9e20-ff73c7f9bbcf",
"metadata": {},
"outputs": [],
"source": [
"#saving and visualizing data validity\n",
"my_report = DiagnosticReport()\n",
"my_report.generate(real_data, synthetic_data, metadata)\n",
"my_report.save(filepath='RTVAE_Results/diagnostic.pkl')\n",
"my_report.get_visualization('Data Validity')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad3689d9-2361-4dfa-8efb-8d72d2859b7a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:afdb8408-8328-49ec-95ad-1ad1b45217f8 tags:
```
python
import
pandas
as
pd
import
numpy
as
np
import
os
from
sdv.single_table
import
CTGANSynthesizer
from
sdv.metadata
import
SingleTableMetadata
from
sdmetrics.reports.single_table
import
QualityReport
from
sdmetrics.reports.single_table
import
DiagnosticReport
from
table_evaluator
import
TableEvaluator
```
%% Cell type:code id:1eb44865-7c03-46df-9780-aca27a6d0494 tags:
```
python
#creating a combination of real data
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')
# brute = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')
# goldenEye = real_data[real_data.Label=='DoS attacks-GoldenEye']
# slowloris = real_data[real_data.Label=='DoS attacks-Slowloris']
# hulk = real_data[real_data.Label=='DoS attacks-Hulk']
# hulk = hulk.iloc[:300000, :]
# slowHTTPtest = real_data[real_data.Label=='DoS attacks-SlowHTTPTest']
# loicHTTp = real_data[real_data.Label=='DDoS attacks-LOIC-HTTP']
# loicHTTp = loicHTTp.iloc[:300000, :]
# hoic = real_data[real_data.Label=='DDOS attack-HOIC']
# hoic = hoic.iloc[:300000, :]
# bot = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')
# infilteration = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')
# benign = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')
# benign = benign.iloc[:300000, :]
# ftp = brute[brute.Label=='FTP-BruteForce']
# ssh = brute[brute.Label=='SSH-Bruteforce']
# real_all_in_one = pd.concat([goldenEye, slowloris, hulk, slowHTTPtest, loicHTTp, hoic, bot, infilteration, benign, ftp, ssh], ignore_index=True)
```
%% Cell type:code id:79233ab4-949f-40bb-ae7b-f338d5076a13 tags:
```
python
# reading all csv files in the directory
folder_path
=
'
RTVAE_Results
'
csv_files
=
[
file
for
file
in
os
.
listdir
(
folder_path
)
if
file
.
endswith
(
'
.csv
'
)]
dfs
=
[]
for
file
in
csv_files
:
file_path
=
os
.
path
.
join
(
folder_path
,
file
)
dfs
.
append
(
pd
.
read_csv
(
file_path
))
synthetic_all_in_one
=
pd
.
concat
(
dfs
,
ignore_index
=
True
)
```
%% Cell type:code id:40d00c4b-31a6-4a88-8b53-9d212d4f807e tags:
```
python
#saving real and synthetic data
# real_all_in_one.to_csv('TVAE_Results/real_all_in_one.csv', index=False)
synthetic_all_in_one
.
to_csv
(
'
RTVAE_Results/RTVAE_synthetic_all.csv
'
,
index
=
False
)
```
%% Cell type:code id:a7e4c7ea-8b6d-4537-83ed-cda57173a704 tags:
```
python
#ensuring that everything went well
synthetic_data
=
synthetic_all_in_one
print
(
synthetic_data
.
shape
)
synthetic_data
.
dropna
(
inplace
=
True
)
print
(
synthetic_data
.
shape
)
```
%% Cell type:code id:ae9bd1d9-52d7-4703-890c-a2588a15b417 tags:
```
python
#Loading real and synthetic data for evaluation
real_data
=
pd
.
read_csv
(
'
TVAE_Results/real_all_in_one.csv
'
)
synthetic_data
=
pd
.
read_csv
(
'
TabFairGAN_Results/TabFairGAN_synthetic_all.csv
'
)
```
%% Cell type:code id:549bce4a-e6ea-457d-91a9-dfa9f95fe073 tags:
```
python
def
get_data_info
(
df
):
"""
Creates the categorical columns, continuous columns, and metadata of a dataframe.
Args:
df (pandas.DataFrame): The input dataframe containing continuous and categorical values.
Returns:
list: the list of categorical column names. Specifically, columns with only 4 unique values
list: The list of continuous column names.
metadata: The metadata of the dataframe. For more information visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json
"""
#creating the categorical and continuous column lists
categorical_columns
=
[
'
Label
'
]
continuous_columns
=
[]
for
i
in
df
.
columns
:
if
i
not
in
categorical_columns
:
continuous_columns
.
append
(
i
)
#creating metadata
metadata
=
SingleTableMetadata
()
metadata
.
detect_from_dataframe
(
df
)
for
column
in
categorical_columns
:
metadata
.
update_column
(
column_name
=
column
,
sdtype
=
'
categorical
'
)
for
column
in
continuous_columns
:
metadata
.
update_column
(
column_name
=
column
,
sdtype
=
'
numerical
'
)
# validating metadata
metadata
.
validate
()
metadata
.
validate_data
(
data
=
real_data
)
return
categorical_columns
,
continuous_columns
,
metadata
categorical_columns
,
continuous_columns
,
metadata
=
get_data_info
(
real_data
)
```
%% Cell type:code id:fc6db1ff-e2fc-4817-87d1-22743468b7b8 tags:
```
python
# evaluating synthetic data with table_evaluator cumulative sum per features and distribution
table_evaluator
=
TableEvaluator
(
real_data
,
synthetic_data
,
cat_cols
=
categorical_columns
)
table_evaluator
.
visual_evaluation
()
```
%% Cell type:code id:7bc79e6f-0232-4e2b-9b19-b2fd8825a687 tags:
```
python
#saving and visualizing column pair trend and column shapes
metadata
=
metadata
.
to_dict
()
my_report
=
QualityReport
()
my_report
.
generate
(
real_data
,
synthetic_data
,
metadata
)
my_report
.
save
(
filepath
=
'
RTVAE_Results/quality.pkl
'
)
my_report
.
get_visualization
(
property_name
=
'
Column Pair Trends
'
)
```
%% Cell type:code id:9332967a-f289-454a-9e20-ff73c7f9bbcf tags:
```
python
#saving and visualizing data validity
my_report
=
DiagnosticReport
()
my_report
.
generate
(
real_data
,
synthetic_data
,
metadata
)
my_report
.
save
(
filepath
=
'
RTVAE_Results/diagnostic.pkl
'
)
my_report
.
get_visualization
(
'
Data Validity
'
)
```
%% Cell type:code id:ad3689d9-2361-4dfa-8efb-8d72d2859b7a tags:
```
python
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment