TGAN

e6cc509d · Sayed Saeedi · 6d584a59 · e6cc509d
Commit e6cc509d authored 1 year ago by Sayed Saeedi
--- a/TGAN.ipynb
+++ b/TGAN.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fefebe4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#https://github.com/sdv-dev/TGAN/tree/master\n",
+    "import pandas as pd\n",
+    "from tgan.data import load_demo_data\n",
+    "from tgan.model import TGANModel\n",
+    "import tensorflow as tf\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a345cefc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#loading datasets\n",
+    "def load_data(location):\n",
+    "    data = pd.read_csv(location)\n",
+    "    data_columns = data.columns\n",
+    "    data = preprocessing(data)\n",
+    "    return data, data_columns\n",
+    "\n",
+    "\n",
+    "#Dataset preprocessing\n",
+    "def preprocessing(data):\n",
+    "    \n",
+    "    \"\"\"\n",
+    "    dropping duplicate values\n",
+    "    parsing timestamps given as d/m/Y H:M:S and converting them to Unix format (seconds since 1970/1/1 00:00:00)\n",
+    "    making sure that numeric columns only have numeric values and if not numeric then to NaN\n",
+    "    dropping all NaN values    \n",
+    "    \"\"\"\n",
+    "    print(\"Shape of data before preprocessing:\", data.shape)\n",
+    "    data.drop_duplicates(inplace=True) #dropping duplicated\n",
+    "    data.replace([np.inf, -np.inf], np.nan, inplace=True)# changing inf and -inf to nan\n",
+    "\n",
+    "    data['Timestamp'] = pd.to_datetime(data['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')\n",
+    "    data['Timestamp'] = (data['Timestamp'] - pd.Timestamp(\"1970-01-01\")) // pd.Timedelta('1s') \n",
+    "\n",
+    "    for col in data.columns: #changing columns to numeric if not, then to NaN\n",
+    "        if data[col].dtype == 'object' and col != 'Label':\n",
+    "            data[col] = pd.to_numeric(data[col], errors='coerce')\n",
+    "    \n",
+    "    \n",
+    "    \n",
+    "    data.dropna(inplace=True) #dropping NaN\n",
+    "    \n",
+    "    print(\"Shape of data after preprocessing:\", data.shape)\n",
+    "    \n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41dfaeff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tGAN(data, continuous_columns,max_epoch=5, steps_per_epoch=10000, batch_size=200, z_dim=200, noise=0.2, l2norm=0.00001, \n",
+    "         learning_rate=0.001, num_gen_rnn=100, num_gen_feature=100, num_dis_layers=1, num_dis_hidden=100, \n",
+    "         optimizer='AdamOptimizer'):\n",
+    "    \n",
+    "    \"\"\"\n",
+    "    Required arguments to be passed:\n",
+    "        -data: dataframe with rows and columns\n",
+    "        -continuous_columns: a list containing all the columns that are continuous \n",
+    "    \"\"\"\n",
+    "\n",
+    "    print(data.shape)\n",
+    "\n",
+    "    tgan = TGANModel(continuous_columns=continuous_columns, max_epoch=max_epoch, steps_per_epoch=steps_per_epoch, \n",
+    "                     batch_size=batch_size, z_dim=z_dim, noise=noise, l2norm=l2norm, learning_rate=learning_rate, \n",
+    "                     num_gen_rnn=num_gen_rnn, num_gen_feature=num_gen_feature, num_dis_layers=num_dis_layers, \n",
+    "                     num_dis_hidden=num_dis_hidden, optimizer=optimizer)\n",
+    "    \n",
+    "    tgan.fit(data)\n",
+    "\n",
+    "    return tgan"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "07714e0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "continuous_columns = [2, 3, 17, 18, 20, 21, 38, 39]\n",
+    "\n",
+    "data, data_columns = load_data(\"C:\\\\Users\\\\sayed\\\\Desktop\\\\Dataset\\\\02-14-2018.csv\")\n",
+    "\n",
+    "data = data[data[\"Label\"] == \"FTP-BruteForce\"]\n",
+    "data.columns = [None] * len(data.columns) # removing column names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95f316b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#fitting the TGAN model\n",
+    "tgan= tGAN(data=data, continuous_columns=continuous_columns, batch_size=150,  max_epoch = 15)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0b5a40e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Saving the model\n",
+    "model_path = 'C:\\\\Users\\\\sayed\\\\Desktop\\\\Dataset\\\\models\\\\tGAN_model_firstrun.pkl'\n",
+    "tgan.save(model_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7343348",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "num_samples = 8000\n",
+    "new_tgan = TGANModel.load(model_path)\n",
+    "samples = new_tgan.sample(num_samples)\n",
+    "samples.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20667896",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#assigning back the column names\n",
+    "samples.columns = data_columns\n",
+    "data.columns = data_columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bdf2761",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8423e0de",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ae7fce1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2fd753c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06a0123c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebf1c5cd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:code id:fefebe4b tags:
+
+``` python
+#https://github.com/sdv-dev/TGAN/tree/master
+import pandas as pd
+from tgan.data import load_demo_data
+from tgan.model import TGANModel
+import tensorflow as tf
+import numpy as np
+```
+
+%% Cell type:code id:a345cefc tags:
+
+``` python
+#loading datasets
def load_data(location):
    """Read the CSV at ``location`` and return ``(cleaned_frame, header)``.

    The header (the frame's column index) is captured right after reading,
    before the frame is handed to ``preprocessing``.
    """
    raw_frame = pd.read_csv(location)
    original_columns = raw_frame.columns
    return preprocessing(raw_frame), original_columns
+
+
+#Dataset preprocessing
def preprocessing(data):
    """Clean a raw DataFrame in place and return it.

    Steps, in order:
      1. Drop duplicated rows.
      2. Parse ``Timestamp`` with the format ``%d/%m/%Y %H:%M:%S`` and
         convert it to whole seconds since 1970-01-01 00:00:00; values that
         cannot be parsed become NaN.
      3. Coerce every text column except ``Label`` to numeric; entries that
         are not numbers become NaN.
      4. Replace +inf / -inf with NaN.
      5. Drop every row that still contains a NaN.

    Args:
        data: DataFrame containing a ``Timestamp`` column. It is modified
            in place.

    Returns:
        The same DataFrame object after cleaning.

    Raises:
        KeyError: if ``data`` has no ``Timestamp`` column.
    """
    print("Shape of data before preprocessing:", data.shape)
    data.drop_duplicates(inplace=True)

    parsed = pd.to_datetime(data['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
    data['Timestamp'] = (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    for col in data.columns:
        if col == 'Label':
            continue
        column = data[col]
        # Text columns may be stored as ``object`` or as a pandas string dtype.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            data[col] = pd.to_numeric(column, errors='coerce')

    # Must run AFTER the numeric coercion: pd.to_numeric turns strings such as
    # "inf" into real infinities, which an earlier replacement would miss and
    # dropna() would then keep.
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(inplace=True)

    print("Shape of data after preprocessing:", data.shape)

    return data
+```
+
+%% Cell type:code id:41dfaeff tags:
+
+``` python
def tGAN(
    data,
    continuous_columns,
    max_epoch=5,
    steps_per_epoch=10000,
    batch_size=200,
    z_dim=200,
    noise=0.2,
    l2norm=0.00001,
    learning_rate=0.001,
    num_gen_rnn=100,
    num_gen_feature=100,
    num_dis_layers=1,
    num_dis_hidden=100,
    optimizer='AdamOptimizer',
):
    """Create a ``TGANModel``, fit it on ``data`` and return the fitted model.

    Required arguments:
        -data: dataframe with rows and columns
        -continuous_columns: a list containing all the columns that are continuous

    Every other argument is forwarded unchanged to the ``TGANModel``
    constructor under the same keyword name.
    """
    # Log the size of the training frame before the fit starts.
    print(data.shape)

    hyperparameters = dict(
        continuous_columns=continuous_columns,
        max_epoch=max_epoch,
        steps_per_epoch=steps_per_epoch,
        batch_size=batch_size,
        z_dim=z_dim,
        noise=noise,
        l2norm=l2norm,
        learning_rate=learning_rate,
        num_gen_rnn=num_gen_rnn,
        num_gen_feature=num_gen_feature,
        num_dis_layers=num_dis_layers,
        num_dis_hidden=num_dis_hidden,
        optimizer=optimizer,
    )

    model = TGANModel(**hyperparameters)
    model.fit(data)
    return model
+```
+
+%% Cell type:code id:07714e0c tags:
+
+``` python
# Columns handed to TGAN as continuous (presumably positional indices -- confirm
# against the dataset header).
continuous_columns = [2, 3, 17, 18, 20, 21, 38, 39]

data, data_columns = load_data("C:\\Users\\sayed\\Desktop\\Dataset\\02-14-2018.csv")

# Keep only the FTP brute-force rows, then blank out the header; the real
# column names stay available in data_columns.
data = data.loc[data["Label"] == "FTP-BruteForce"]
data.columns = [None for _ in data.columns]
+```
+
+%% Cell type:code id:95f316b5 tags:
+
+``` python
# Train the TGAN model on the filtered frame
tgan = tGAN(
    data=data,
    continuous_columns=continuous_columns,
    max_epoch=15,
    batch_size=150,
)
+```
+
+%% Cell type:code id:d0b5a40e tags:
+
+``` python
# Save the fitted model at model_path so it can be loaded again later
model_path = 'C:\\Users\\sayed\\Desktop\\Dataset\\models\\tGAN_model_firstrun.pkl'
tgan.save(model_path)
+```
+
+%% Cell type:code id:d7343348 tags:
+
+``` python
# Load the saved model back and draw synthetic rows from it
new_tgan = TGANModel.load(model_path)
num_samples = 8000
samples = new_tgan.sample(num_samples)
samples.head()
+```
+
+%% Cell type:code id:20667896 tags:
+
+``` python
# Put the original header back on the generated frame and on the real frame
samples.columns = data.columns = data_columns
+```
+
+%% Cell type:code id:4bdf2761 tags:
+
+``` python
+```
+
+%% Cell type:code id:8423e0de tags:
+
+``` python
+```
+
+%% Cell type:code id:9ae7fce1 tags:
+
+``` python
+```
+
+%% Cell type:code id:d2fd753c tags:
+
+``` python
+```
+
+%% Cell type:code id:06a0123c tags:
+
+``` python
+```
+
+%% Cell type:code id:ebf1c5cd tags:
+
+``` python
+```