Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MasterThesis - AryanSaeedi
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Andreas Urmann
MasterThesis - AryanSaeedi
Commits
e6cc509d
Commit
e6cc509d
authored
1 year ago
by
Sayed Saeedi
Browse files
Options
Downloads
Patches
Plain Diff
TGAN
parent
6d584a59
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
TGAN.ipynb
+226
-0
226 additions, 0 deletions
TGAN.ipynb
with
226 additions
and
0 deletions
TGAN.ipynb
0 → 100644
+
226
−
0
View file @
e6cc509d
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "fefebe4b",
"metadata": {},
"outputs": [],
"source": [
"#https://github.com/sdv-dev/TGAN/tree/master\n",
"import pandas as pd\n",
"from tgan.data import load_demo_data\n",
"from tgan.model import TGANModel\n",
"import tensorflow as tf\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a345cefc",
"metadata": {},
"outputs": [],
"source": [
"#loading datasets\n",
"def load_data(location):\n",
" data = pd.read_csv(location)\n",
" data_columns = data.columns\n",
" data = preprocessing(data)\n",
" return data, data_columns\n",
"\n",
"\n",
"#Dataset preprocessing\n",
"def preprocessing(data):\n",
" \n",
" \"\"\"\n",
" dropping duplicate values\n",
" changing the time format to d/m/Y H:M:S and then to Unix format, which counts seconds from 1970-01-01 00:00:00\n",
" making sure that numeric columns only have numeric values and if not numeric then to NaN\n",
" dropping all NaN values \n",
" \"\"\"\n",
" print(\"Shape of data before preprocessing:\", data.shape)\n",
" data.drop_duplicates(inplace=True) #dropping duplicated\n",
" data.replace([np.inf, -np.inf], np.nan, inplace=True)# changing inf and -inf to nan\n",
"\n",
" data['Timestamp'] = pd.to_datetime(data['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')\n",
" data['Timestamp'] = (data['Timestamp'] - pd.Timestamp(\"1970-01-01\")) // pd.Timedelta('1s') \n",
"\n",
" for col in data.columns: #changing columns to numeric if not, then to NaN\n",
" if data[col].dtype == 'object' and col != 'Label':\n",
" data[col] = pd.to_numeric(data[col], errors='coerce')\n",
" \n",
" \n",
" \n",
" data.dropna(inplace=True) #droping Na\n",
" \n",
" print(\"Shape of data after preprocessing:\", data.shape)\n",
" \n",
" return data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41dfaeff",
"metadata": {},
"outputs": [],
"source": [
"def tGAN(data, continuous_columns,max_epoch=5, steps_per_epoch=10000, batch_size=200, z_dim=200, noise=0.2, l2norm=0.00001, \n",
" learning_rate=0.001, num_gen_rnn=100, num_gen_feature=100, num_dis_layers=1, num_dis_hidden=100, \n",
" optimizer='AdamOptimizer'):\n",
" \n",
" \"\"\"\n",
" Required arguments to be passed:\n",
" -data: dataframe with rows and columns\n",
" -continuous_columns: a list containing all the columns that are continuous \n",
" \"\"\"\n",
"\n",
" print(data.shape)\n",
"\n",
" tgan = TGANModel(continuous_columns=continuous_columns, max_epoch=max_epoch, steps_per_epoch=steps_per_epoch, \n",
" batch_size=batch_size, z_dim=z_dim, noise=noise, l2norm=l2norm, learning_rate=learning_rate, \n",
" num_gen_rnn=num_gen_rnn, num_gen_feature=num_gen_feature, num_dis_layers=num_dis_layers, \n",
" num_dis_hidden=num_dis_hidden, optimizer=optimizer)\n",
" \n",
" tgan.fit(data)\n",
"\n",
" return tgan"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07714e0c",
"metadata": {},
"outputs": [],
"source": [
"continuous_columns = [2, 3, 17, 18, 20, 21, 38, 39]\n",
"\n",
"data, data_columns = load_data(\"C:\\\\Users\\\\sayed\\\\Desktop\\\\Dataset\\\\02-14-2018.csv\")\n",
"\n",
"data = data[data[\"Label\"] == \"FTP-BruteForce\"]\n",
"data.columns = [None] * len(data.columns) # removing column names"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95f316b5",
"metadata": {},
"outputs": [],
"source": [
"#fitting the TGAN model\n",
"tgan= tGAN(data=data, continuous_columns=continuous_columns, batch_size=150, max_epoch = 15)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0b5a40e",
"metadata": {},
"outputs": [],
"source": [
"#Saving the model\n",
"model_path = 'C:\\\\Users\\\\sayed\\\\Desktop\\\\Dataset\\\\models\\\\tGAN_model_firstrun.pkl'\n",
"tgan.save(model_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7343348",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"num_samples = 8000\n",
"new_tgan = TGANModel.load(model_path)\n",
"samples = new_tgan.sample(num_samples)\n",
"samples.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20667896",
"metadata": {},
"outputs": [],
"source": [
"#assigning back the column names\n",
"samples.columns = data_columns\n",
"data.columns = data_columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bdf2761",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "8423e0de",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ae7fce1",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2fd753c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "06a0123c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebf1c5cd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:fefebe4b tags:
```
python
#https://github.com/sdv-dev/TGAN/tree/master
import
pandas
as
pd
from
tgan.data
import
load_demo_data
from
tgan.model
import
TGANModel
import
tensorflow
as
tf
import
numpy
as
np
```
%% Cell type:code id:a345cefc tags:
```
python
#loading datasets
def load_data(location):
    """Read the CSV file at ``location`` and run it through ``preprocessing``.

    Returns:
        A ``(data, data_columns)`` tuple: the frame returned by
        ``preprocessing`` and the column index captured right after reading,
        before any preprocessing step runs.
    """
    raw_frame = pd.read_csv(location)
    header = raw_frame.columns  # remembered so the names can be restored later
    cleaned_frame = preprocessing(raw_frame)
    return cleaned_frame, header
#Dataset preprocessing
def preprocessing(data):
    """Return a cleaned version of ``data``; the frame passed in is not modified.

    Steps, in order:

    1. Drop duplicate rows.
    2. Replace ``inf`` / ``-inf`` with NaN.
    3. Parse ``Timestamp`` with the format ``%d/%m/%Y %H:%M:%S`` and convert
       it to whole seconds since 1970-01-01 00:00:00 (Unix time). Values that
       do not match the format end up missing and are removed in step 5.
    4. Convert every object-dtype column except ``Label`` to numeric, turning
       non-numeric entries into NaN.
    5. Drop every row that contains a NaN.

    The frame's shape is printed before and after cleaning.

    Args:
        data: DataFrame that contains a ``Timestamp`` column.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: if ``data`` has no ``Timestamp`` column.
    """
    print("Shape of data before preprocessing:", data.shape)

    # Non-inplace calls hand back new frames, so the caller's frame stays
    # intact and the column assignments below never write into it.
    data = data.drop_duplicates()
    data = data.replace([np.inf, -np.inf], np.nan)

    parsed = pd.to_datetime(data['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
    data['Timestamp'] = (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    # Decide which columns to convert before reassigning any of them.
    text_columns = [col for col in data.columns
                    if col != 'Label' and data[col].dtype == 'object']
    for col in text_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    data = data.dropna()

    print("Shape of data after preprocessing:", data.shape)

    return data
```
%% Cell type:code id:41dfaeff tags:
```
python
def tGAN(data, continuous_columns, max_epoch=5, steps_per_epoch=10000, batch_size=200, z_dim=200, noise=0.2,
         l2norm=0.00001, learning_rate=0.001, num_gen_rnn=100, num_gen_feature=100, num_dis_layers=1,
         num_dis_hidden=100, optimizer='AdamOptimizer'):
    """Build a ``TGANModel`` from the given settings, fit it on ``data`` and return it.

    Required arguments:
      - data: dataframe with rows and columns
      - continuous_columns: a list containing all the columns that are continuous

    Every other keyword argument is forwarded unchanged to ``TGANModel``.
    The shape of ``data`` is printed before the model is created.
    """
    print(data.shape)

    # Gather the constructor arguments in one place, then unpack them.
    model_settings = dict(
        continuous_columns=continuous_columns,
        max_epoch=max_epoch,
        steps_per_epoch=steps_per_epoch,
        batch_size=batch_size,
        z_dim=z_dim,
        noise=noise,
        l2norm=l2norm,
        learning_rate=learning_rate,
        num_gen_rnn=num_gen_rnn,
        num_gen_feature=num_gen_feature,
        num_dis_layers=num_dis_layers,
        num_dis_hidden=num_dis_hidden,
        optimizer=optimizer,
    )
    model = TGANModel(**model_settings)
    model.fit(data)

    return model
```
%% Cell type:code id:07714e0c tags:
```
python
# Column positions passed to TGAN as the continuous columns.
continuous_columns = [2, 3, 17, 18, 20, 21, 38, 39]

data, data_columns = load_data("C:\\Users\\sayed\\Desktop\\Dataset\\02-14-2018.csv")

# Keep only the rows labelled FTP-BruteForce.
data = data.loc[data["Label"] == "FTP-BruteForce"]
data.columns = [None for _ in data.columns]  # removing column names
```
%% Cell type:code id:95f316b5 tags:
```
python
# Fit the TGAN model on the filtered frame (batch size 150, 15 epochs).
tgan = tGAN(
    data=data,
    continuous_columns=continuous_columns,
    max_epoch=15,
    batch_size=150,
)
```
%% Cell type:code id:d0b5a40e tags:
```
python
# Save the fitted model to model_path.
model_path = 'C:\\Users\\sayed\\Desktop\\Dataset\\models\\tGAN_model_firstrun.pkl'
tgan.save(model_path)
```
%% Cell type:code id:d7343348 tags:
```
python
# Reload the saved model from model_path and draw num_samples samples from it.
num_samples = 8000
new_tgan = TGANModel.load(model_path)
samples = new_tgan.sample(num_samples)
samples.head()  # last expression of the cell, so the notebook displays it
```
%% Cell type:code id:20667896 tags:
```
python
# Assign the original column names (returned by load_data) back to both frames.
samples.columns = data_columns
data.columns = data_columns
```
%% Cell type:code id:4bdf2761 tags:
```
python
```
%% Cell type:code id:8423e0de tags:
```
python
```
%% Cell type:code id:9ae7fce1 tags:
```
python
```
%% Cell type:code id:d2fd753c tags:
```
python
```
%% Cell type:code id:06a0123c tags:
```
python
```
%% Cell type:code id:ebf1c5cd tags:
```
python
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment