"""Train a TGAN model on one label class of a CSV dataset and sample from it.

Script form of the ``TGAN.ipynb`` notebook (Python 3.7 kernel). The ``# %%``
markers keep the original notebook cell boundaries.
"""

# %%
# https://github.com/sdv-dev/TGAN/tree/master
import pandas as pd
from tgan.data import load_demo_data
from tgan.model import TGANModel
import tensorflow as tf
import numpy as np


# %%
def load_data(location):
    """Read the CSV at ``location`` and clean it with ``preprocessing``.

    Args:
        location: path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        A ``(data, data_columns)`` tuple: the cleaned DataFrame and the
        column index exactly as read from the file, captured before cleaning
        so the names can be restored later.
    """
    data = pd.read_csv(location)
    data_columns = data.columns
    data = preprocessing(data)
    return data, data_columns


def preprocessing(data):
    """Clean ``data`` in place and return the same DataFrame.

    Steps, in order:

    1. Drop duplicated rows.
    2. Parse ``Timestamp`` with the format ``%d/%m/%Y %H:%M:%S`` and convert
       it to whole seconds since 1970-01-01 00:00:00; values that do not
       match the format become missing.
    3. Coerce every object-dtype column except ``Label`` to numeric; values
       that are not numeric become NaN.
    4. Replace +inf / -inf with NaN.
    5. Drop every row that contains a missing value.

    The shape of the frame is printed before and after cleaning.

    Args:
        data: DataFrame with at least a ``Timestamp`` column. It is modified
            in place.

    Returns:
        The cleaned DataFrame (the same object that was passed in).

    Raises:
        KeyError: if ``data`` has no ``Timestamp`` column.
    """
    print("Shape of data before preprocessing:", data.shape)
    data.drop_duplicates(inplace=True)

    parsed = pd.to_datetime(
        data['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce'
    )
    data['Timestamp'] = (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    for col in data.columns:
        if data[col].dtype == 'object' and col != 'Label':
            data[col] = pd.to_numeric(data[col], errors='coerce')

    # The infinity replacement must come AFTER the numeric coercion: text such
    # as "Infinity" in an object column only turns into a real inf once
    # pd.to_numeric has run, and dropna() does not treat inf as missing.
    data.replace([np.inf, -np.inf], np.nan, inplace=True)

    data.dropna(inplace=True)

    print("Shape of data after preprocessing:", data.shape)

    return data


# %%
def tGAN(data, continuous_columns, max_epoch=5, steps_per_epoch=10000,
         batch_size=200, z_dim=200, noise=0.2, l2norm=0.00001,
         learning_rate=0.001, num_gen_rnn=100, num_gen_feature=100,
         num_dis_layers=1, num_dis_hidden=100, optimizer='AdamOptimizer'):
    """Build a ``TGANModel`` with the given hyper-parameters and fit it.

    Args:
        data: DataFrame of training rows.
        continuous_columns: list of the columns that are continuous; every
            other column is left for TGAN to handle as non-continuous.
        max_epoch, steps_per_epoch, batch_size, z_dim, noise, l2norm,
        learning_rate, num_gen_rnn, num_gen_feature, num_dis_layers,
        num_dis_hidden, optimizer: forwarded unchanged to ``TGANModel``;
            see the TGAN documentation for their meaning.

    Returns:
        The fitted ``TGANModel``.
    """
    print(data.shape)

    tgan = TGANModel(
        continuous_columns=continuous_columns,
        max_epoch=max_epoch,
        steps_per_epoch=steps_per_epoch,
        batch_size=batch_size,
        z_dim=z_dim,
        noise=noise,
        l2norm=l2norm,
        learning_rate=learning_rate,
        num_gen_rnn=num_gen_rnn,
        num_gen_feature=num_gen_feature,
        num_dis_layers=num_dis_layers,
        num_dis_hidden=num_dis_hidden,
        optimizer=optimizer,
    )

    tgan.fit(data)

    return tgan


# %%
# Positions (0-based) of the continuous columns in the CSV.
continuous_columns = [2, 3, 17, 18, 20, 21, 38, 39]

data, data_columns = load_data(r"C:\Users\sayed\Desktop\Dataset\02-14-2018.csv")

# Train on a single label class only.
data = data[data["Label"] == "FTP-BruteForce"]

# Swap the column names for 0-based integer labels so the integer entries of
# `continuous_columns` actually identify columns. (Setting every label to
# None made all columns share one label, so no index could match.) The real
# names are restored from `data_columns` further down.
data.columns = range(len(data.columns))

# %%
# Fitting the TGAN model
tgan = tGAN(data=data, continuous_columns=continuous_columns, batch_size=150, max_epoch=15)

# %%
# Saving the model
model_path = r'C:\Users\sayed\Desktop\Dataset\models\tGAN_model_firstrun.pkl'
tgan.save(model_path)

# %%
# Reload the saved model and draw synthetic rows from it.
num_samples = 8000
new_tgan = TGANModel.load(model_path)
samples = new_tgan.sample(num_samples)
samples.head()

# %%
# Assigning back the column names
samples.columns = data_columns
data.columns = data_columns