Andreas Urmann / MasterThesis - AryanSaeedi · Commits

Commit 9b2f5dff
Authored 1 year ago by Sayed Saeedi

Code of TabFairGAN

parent 1a322564

Showing 1 changed file: Models/TabFairGAN.ipynb (new file, 0 → 100644, +649 additions, −0 deletions)
%% Cell type:code id:0e235976-62c1-4d28-b7a8-bae9e4ab7c8d tags:

```python
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from table_evaluator import TableEvaluator
import matplotlib.pyplot as plt
from sdmetrics.single_column import StatisticSimilarity
import math
from sdmetrics.single_column import RangeCoverage
from sdmetrics.visualization import get_column_plot
import os
import plotly.io as py
import string
```
%% Cell type:code id:1822d860-da20-4bf3-942f-dab5b7ec6050 tags:

```python
# Loading the preprocessed datasets: uncomment the one to use

# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')

print(real_data.shape)
print(real_data.Label.unique())

# If the bruteforce or DoS dataset is used, uncomment the line below and
# change the label value to match the chosen dataset
# real_data = real_data[real_data.Label == 'SSH-Bruteforce']
real_data = real_data.iloc[:300000, :]
print(real_data.shape)
```
%% Cell type:code id:e35c1b44-91c7-4b03-a381-b9092540a2bf tags:

```python
# Manually set hyperparameters (stands in for the argparse namespace
# that the original TabFairGAN script builds in the next cell)
class Args:
    command = 'no_fairness'  # the code below branches on args.command
    num_epochs = 700
    batch_size = 100
    fake_name = 'TabFairGAN_Results/test.csv'  # output location; change accordingly
    size_of_fake_data = 300000  # number of instances to be generated

args = Args()
```
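To exercise the fairness-aware branch instead, `Args` would also need the fields declared by the commented-out `with_fairness` argparse sub-command in the next cell. A minimal sketch, with placeholder column names and values that would have to be adapted to the dataset at hand:

```python
# Hypothetical sketch: attribute names mirror the 'with_fairness' argparse
# spec in the next cell; the values are placeholders, not real columns or
# categories of these datasets.
class FairArgs(Args):
    command = 'with_fairness'
    df_name = 'Datasets/Preprocessed_Datasets/benign.csv'  # assumed path
    S = 'ProtectedColumn'             # protected attribute (placeholder)
    Y = 'Label'                       # decision column
    underprivileged_value = 'groupA'  # placeholder category of S
    desirable_value = 'Benign'        # placeholder desired label
    num_fair_epochs = 10
    lambda_val = 0.5

# args = FairArgs()  # would switch the cells below to fairness training
```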
%% Cell type:code id:1811d74a-b27d-4f1f-96a8-85fe54f2cc41 tags:

```python
# TabFairGAN code copied directly from the GitHub repo
# https://github.com/amirarsalan90/TabFairGAN

import torch
import torch.nn.functional as f
from torch import nn
import pandas as pd
import numpy as np
from collections import OrderedDict

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
import argparse


'''parser = argparse.ArgumentParser()
subparser = parser.add_subparsers(dest='command')
with_fairness = subparser.add_parser('with_fairness')
no_fairness = subparser.add_parser('no_fairness')

with_fairness.add_argument("df_name", help="Reference dataframe", type=str)
with_fairness.add_argument("S", help="Protected attribute", type=str)
with_fairness.add_argument("Y", help="Label (decision)", type=str)
with_fairness.add_argument("underprivileged_value", help="Value for underprivileged group", type=str)
with_fairness.add_argument("desirable_value", help="Desired label (decision)", type=str)
with_fairness.add_argument("num_epochs", help="Total number of epochs", type=int)
with_fairness.add_argument("batch_size", help="the batch size", type=int)
with_fairness.add_argument("num_fair_epochs", help="number of fair training epochs", type=int)
with_fairness.add_argument("lambda_val", help="lambda parameter", type=float)
with_fairness.add_argument("fake_name", help="name of the produced csv file", type=str)
with_fairness.add_argument("size_of_fake_data", help="how many data records to generate", type=int)


no_fairness.add_argument("df_name", help="Reference dataframe", type=str)
no_fairness.add_argument("num_epochs", help="Total number of epochs", type=int)
no_fairness.add_argument("batch_size", help="the batch size", type=int)
no_fairness.add_argument("fake_name", help="name of the produced csv file", type=str)
no_fairness.add_argument("size_of_fake_data", help="how many data records to generate", type=int)

args = parser.parse_args()'''

if args.command == 'with_fairness':
    S = args.S
    Y = args.Y
    S_under = args.underprivileged_value
    Y_desire = args.desirable_value

    df = pd.read_csv(args.df_name)

    df[S] = df[S].astype(object)
    df[Y] = df[Y].astype(object)

elif args.command == 'no_fairness':
    df = real_data


if args.command == "with_fairness":
    def get_ohe_data(df):
        df_int = df.select_dtypes(['float', 'integer']).values
        continuous_columns_list = list(df.select_dtypes(['float', 'integer']).columns)
        ##############################################################
        scaler = QuantileTransformer(n_quantiles=2000, output_distribution='uniform')
        df_int = scaler.fit_transform(df_int)

        df_cat = df.select_dtypes('object')
        df_cat_names = list(df.select_dtypes('object').columns)
        numerical_array = df_int
        ohe = OneHotEncoder()
        ohe_array = ohe.fit_transform(df_cat)

        cat_lens = [i.shape[0] for i in ohe.categories_]
        discrete_columns_ordereddict = OrderedDict(zip(df_cat_names, cat_lens))

        S_start_index = len(continuous_columns_list) + sum(
            list(discrete_columns_ordereddict.values())[:list(discrete_columns_ordereddict.keys()).index(S)])
        Y_start_index = len(continuous_columns_list) + sum(
            list(discrete_columns_ordereddict.values())[:list(discrete_columns_ordereddict.keys()).index(Y)])

        if ohe.categories_[list(discrete_columns_ordereddict.keys()).index(S)][0] == S_under:
            underpriv_index = 0
            priv_index = 1
        else:
            underpriv_index = 1
            priv_index = 0
        if ohe.categories_[list(discrete_columns_ordereddict.keys()).index(Y)][0] == Y_desire:
            desire_index = 0
            undesire_index = 1
        else:
            desire_index = 1
            undesire_index = 0

        final_array = np.hstack((numerical_array, ohe_array.toarray()))
        return ohe, scaler, discrete_columns_ordereddict, continuous_columns_list, final_array, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index

elif args.command == "no_fairness":
    def get_ohe_data(df):
        df_int = df.select_dtypes(['float', 'integer']).values
        continuous_columns_list = list(df.select_dtypes(['float', 'integer']).columns)
        ##############################################################
        scaler = QuantileTransformer(n_quantiles=2000, output_distribution='uniform')
        df_int = scaler.fit_transform(df_int)

        df_cat = df.select_dtypes('object')
        df_cat_names = list(df.select_dtypes('object').columns)
        numerical_array = df_int
        ohe = OneHotEncoder()
        ohe_array = ohe.fit_transform(df_cat)

        cat_lens = [i.shape[0] for i in ohe.categories_]
        discrete_columns_ordereddict = OrderedDict(zip(df_cat_names, cat_lens))

        final_array = np.hstack((numerical_array, ohe_array.toarray()))
        return ohe, scaler, discrete_columns_ordereddict, continuous_columns_list, final_array


def get_original_data(df_transformed, df_orig, ohe, scaler):
    df_ohe_int = df_transformed[:, :df_orig.select_dtypes(['float', 'integer']).shape[1]]
    df_ohe_int = scaler.inverse_transform(df_ohe_int)
    df_ohe_cats = df_transformed[:, df_orig.select_dtypes(['float', 'integer']).shape[1]:]
    df_ohe_cats = ohe.inverse_transform(df_ohe_cats)
    df_int = pd.DataFrame(df_ohe_int, columns=df_orig.select_dtypes(['float', 'integer']).columns)
    df_cat = pd.DataFrame(df_ohe_cats, columns=df_orig.select_dtypes('object').columns)
    return pd.concat([df_int, df_cat], axis=1)


if args.command == "with_fairness":
    def prepare_data(df, batch_size):
        ohe, scaler, discrete_columns, continuous_columns, df_transformed, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index = get_ohe_data(df)
        input_dim = 100
        X_train, X_test = train_test_split(df_transformed, test_size=0.1, shuffle=True)
        data_train = X_train.copy()
        data_test = X_test.copy()

        from torch.utils.data import TensorDataset
        from torch.utils.data import DataLoader
        data = torch.from_numpy(data_train).float()

        train_ds = TensorDataset(data)
        train_dl = DataLoader(train_ds, batch_size=batch_size, drop_last=True)
        return ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index

elif args.command == "no_fairness":
    def prepare_data(df, batch_size):
        # df = pd.concat([df_train, df_test], axis=0)

        ohe, scaler, discrete_columns, continuous_columns, df_transformed = get_ohe_data(df)

        input_dim = df_transformed.shape[1]

        # from sklearn.model_selection import train_test_split
        #################
        X_train, X_test = train_test_split(df_transformed, test_size=0.1, shuffle=True)  # random_state=10)
        # X_train = df_transformed[:df_train.shape[0],:]
        # X_test = df_transformed[df_train.shape[0]:,:]

        data_train = X_train.copy()
        data_test = X_test.copy()

        from torch.utils.data import TensorDataset
        from torch.utils.data import DataLoader
        data = torch.from_numpy(data_train).float()

        train_ds = TensorDataset(data)
        train_dl = DataLoader(train_ds, batch_size=batch_size, drop_last=True)
        return ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test


class Generator(nn.Module):
    def __init__(self, input_dim, continuous_columns, discrete_columns):
        super(Generator, self).__init__()
        self._input_dim = input_dim
        self._discrete_columns = discrete_columns
        self._num_continuous_columns = len(continuous_columns)

        self.lin1 = nn.Linear(self._input_dim, self._input_dim)
        self.lin_numerical = nn.Linear(self._input_dim, self._num_continuous_columns)

        self.lin_cat = nn.ModuleDict()
        for key, value in self._discrete_columns.items():
            self.lin_cat[key] = nn.Linear(self._input_dim, value)

    def forward(self, x):
        x = torch.relu(self.lin1(x))
        # x = f.leaky_relu(self.lin1(x))
        # x_numerical = f.leaky_relu(self.lin_numerical(x))
        x_numerical = f.relu(self.lin_numerical(x))
        x_cat = []
        for key in self.lin_cat:
            x_cat.append(f.gumbel_softmax(self.lin_cat[key](x), tau=0.2))
        x_final = torch.cat((x_numerical, *x_cat), 1)
        return x_final


class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self._input_dim = input_dim
        # self.dense1 = nn.Linear(109, 256)
        self.dense1 = nn.Linear(self._input_dim, self._input_dim)
        self.dense2 = nn.Linear(self._input_dim, self._input_dim)
        # self.dense3 = nn.Linear(256, 1)
        # self.drop = nn.Dropout(p=0.2)
        # self.activation = nn.Sigmoid()

    def forward(self, x):
        x = f.leaky_relu(self.dense1(x))
        # x = self.drop(x)
        # x = f.leaky_relu(self.dense2(x))
        x = f.leaky_relu(self.dense2(x))
        # x = self.drop(x)
        return x


class FairLossFunc(nn.Module):
    def __init__(self, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index):
        super(FairLossFunc, self).__init__()
        self._S_start_index = S_start_index
        self._Y_start_index = Y_start_index
        self._underpriv_index = underpriv_index
        self._priv_index = priv_index
        self._undesire_index = undesire_index
        self._desire_index = desire_index

    def forward(self, x, crit_fake_pred, lamda):
        G = x[:, self._S_start_index:self._S_start_index + 2]
        # print(x[0,64])
        I = x[:, self._Y_start_index:self._Y_start_index + 2]
        # disp = (torch.mean(G[:,1]*I[:,1])/(x[:,65].sum())) - (torch.mean(G[:,0]*I[:,0])/(x[:,64].sum()))
        # disp = -1.0 * torch.tanh(torch.mean(G[:,0]*I[:,1])/(x[:,64].sum()) - torch.mean(G[:,1]*I[:,1])/(x[:,65].sum()))
        # gen_loss = -1.0 * torch.mean(crit_fake_pred)
        disp = -1.0 * lamda * (torch.mean(G[:, self._underpriv_index] * I[:, self._desire_index]) / (
            x[:, self._S_start_index + self._underpriv_index].sum()) - torch.mean(
            G[:, self._priv_index] * I[:, self._desire_index]) / (
            x[:, self._S_start_index + self._priv_index].sum())) - 1.0 * torch.mean(
            crit_fake_pred)
        # print(disp)
        return disp

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def get_gradient(crit, real, fake, epsilon):
    mixed_data = real * epsilon + fake * (1 - epsilon)

    mixed_scores = crit(mixed_data)

    gradient = torch.autograd.grad(
        inputs=mixed_data,
        outputs=mixed_scores,
        grad_outputs=torch.ones_like(mixed_scores),
        create_graph=True,
        retain_graph=True,
    )[0]
    return gradient

def gradient_penalty(gradient):
    gradient = gradient.view(len(gradient), -1)
    gradient_norm = gradient.norm(2, dim=1)

    penalty = torch.mean((gradient_norm - 1) ** 2)
    return penalty


def get_gen_loss(crit_fake_pred):
    gen_loss = -1. * torch.mean(crit_fake_pred)

    return gen_loss


def get_crit_loss(crit_fake_pred, crit_real_pred, gp, c_lambda):
    crit_loss = torch.mean(crit_fake_pred) - torch.mean(crit_real_pred) + c_lambda * gp

    return crit_loss


display_step = 50


def train(df, epochs=500, batch_size=64, fair_epochs=10, lamda=0.5):
    if args.command == "with_fairness":
        ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index = prepare_data(df, batch_size)
    elif args.command == "no_fairness":
        ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test = prepare_data(df, batch_size)

    generator = Generator(input_dim, continuous_columns, discrete_columns).to(device)
    critic = Critic(input_dim).to(device)
    if args.command == "with_fairness":
        second_critic = FairLossFunc(S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index).to(device)

    gen_optimizer = torch.optim.Adam(generator.parameters(), lr=0.00001, betas=(0.5, 0.999))  # original lr=0.002, betas=(0.5, 0.999)
    gen_optimizer_fair = torch.optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999))
    crit_optimizer = torch.optim.Adam(critic.parameters(), lr=0.00001, betas=(0.5, 0.999))  # original lr=0.002, betas=(0.5, 0.999)

    # loss = nn.BCELoss()
    critic_losses = []
    cur_step = 0
    for i in range(epochs):
        # j = 0
        print("epoch {}".format(i + 1))
        ############################
        if i + 1 <= (epochs - fair_epochs):
            print("training for accuracy")
        if i + 1 > (epochs - fair_epochs):
            print("training for fairness")
        for data in train_dl:
            data[0] = data[0].to(device)
            crit_repeat = 4
            mean_iteration_critic_loss = 0
            for k in range(crit_repeat):
                # training the critic
                crit_optimizer.zero_grad()
                fake_noise = torch.randn(size=(batch_size, input_dim), device=device).float()
                fake = generator(fake_noise)

                crit_fake_pred = critic(fake.detach())
                crit_real_pred = critic(data[0])

                epsilon = torch.rand(batch_size, input_dim, device=device, requires_grad=True)
                gradient = get_gradient(critic, data[0], fake.detach(), epsilon)
                gp = gradient_penalty(gradient)

                crit_loss = get_crit_loss(crit_fake_pred, crit_real_pred, gp, c_lambda=10)  # original c_lambda = 10

                mean_iteration_critic_loss += crit_loss.item() / crit_repeat
                crit_loss.backward(retain_graph=True)
                crit_optimizer.step()
            #############################
            if cur_step > 50:
                critic_losses += [mean_iteration_critic_loss]

            #############################
            if i + 1 <= (epochs - fair_epochs):
                # training the generator for accuracy
                gen_optimizer.zero_grad()
                fake_noise_2 = torch.randn(size=(batch_size, input_dim), device=device).float()
                fake_2 = generator(fake_noise_2)
                crit_fake_pred = critic(fake_2)

                gen_loss = get_gen_loss(crit_fake_pred)
                gen_loss.backward()

                # Update the weights
                gen_optimizer.step()

            ###############################
            if i + 1 > (epochs - fair_epochs):
                # training the generator for fairness
                gen_optimizer_fair.zero_grad()
                fake_noise_2 = torch.randn(size=(batch_size, input_dim), device=device).float()
                fake_2 = generator(fake_noise_2)

                crit_fake_pred = critic(fake_2)

                gen_fair_loss = second_critic(fake_2, crit_fake_pred, lamda)
                gen_fair_loss.backward()
                gen_optimizer_fair.step()
            cur_step += 1

    return generator, critic, ohe, scaler, data_train, data_test, input_dim


def train_plot(df, epochs, batchsize, fair_epochs, lamda):
    generator, critic, ohe, scaler, data_train, data_test, input_dim = train(df, epochs, batchsize, fair_epochs, lamda)
    return generator, critic, ohe, scaler, data_train, data_test, input_dim


if args.command == "with_fairness":
    generator, critic, ohe, scaler, data_train, data_test, input_dim = train_plot(df, args.num_epochs, args.batch_size, args.num_fair_epochs, args.lambda_val)
elif args.command == "no_fairness":
    generator, critic, ohe, scaler, data_train, data_test, input_dim = train_plot(df, args.num_epochs, args.batch_size, 0, 0)
fake_numpy_array = generator(torch.randn(size=(args.size_of_fake_data, input_dim), device=device)).cpu().detach().numpy()
fake_df = get_original_data(fake_numpy_array, df, ohe, scaler)
fake_df = fake_df[df.columns]
fake_df.to_csv(args.fake_name, index=False)
```
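A quick way to sanity-check the WGAN-GP helpers above: for a bias-free linear critic f(x) = w·x, the input gradient is w at every interpolation point, so `gradient_penalty` should return exactly (‖w‖₂ − 1)². A minimal sketch (not part of the original notebook):

```python
# Sanity check for get_gradient / gradient_penalty with a linear critic.
# For f(x) = w.x the gradient w.r.t. x is w everywhere, so the penalty
# must equal (||w|| - 1)^2 regardless of real, fake and epsilon.
torch.manual_seed(0)
lin_critic = nn.Linear(8, 1, bias=False)
real_batch, fake_batch = torch.randn(32, 8), torch.randn(32, 8)
eps = torch.rand(32, 1, requires_grad=True)  # per-sample mixing weight
grad = get_gradient(lin_critic, real_batch, fake_batch, eps)
expected = (lin_critic.weight.norm() - 1) ** 2
print(torch.allclose(gradient_penalty(grad), expected, atol=1e-5))  # True
```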
%% Cell type:code id:8ae0ef89-6190-4f33-927f-3f1e79206786 tags:

```python
# Load the generated file
synthetic_data = pd.read_csv('TabFairGAN_Results/test.csv')  # change location accordingly
```
%% Cell type:code id:cf298876-d496-45ed-b083-01bb162dec27 tags:

```python
def get_data_info(df):
    """Creates the categorical columns, continuous columns, and metadata of a dataframe.

    Args:
        df (pandas.DataFrame): The input dataframe containing continuous and categorical values.

    Returns:
        list: The list of categorical column names (hard-coded to ['Label']).
        list: The list of continuous column names.
        metadata: The metadata of the dataframe. For more information visit
            https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json
    """
    # creating the categorical and continuous column lists
    categorical_columns = ['Label']
    continuous_columns = []
    for i in df.columns:
        if i not in categorical_columns:
            continuous_columns.append(i)

    # creating metadata
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)

    for column in categorical_columns:
        metadata.update_column(
            column_name = column,
            sdtype = 'categorical'
        )

    for column in continuous_columns:
        metadata.update_column(
            column_name = column,
            sdtype = 'numerical'
        )
    # validating metadata
    metadata.validate()
    metadata.validate_data(data=real_data)

    return categorical_columns, continuous_columns, metadata


categorical_columns, continuous_columns, metadata = get_data_info(real_data)
```
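Since the reports below compare both tables under the same schema, it can help to validate the synthetic table against this metadata as well before generating them. A small optional follow-up (not in the original notebook); note it may flag dtype drift introduced by the CSV round-trip:

```python
# Optional: the synthetic table should satisfy the same metadata that was
# detected from, and validated against, the real data.
metadata.validate_data(data=synthetic_data)
```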
%% Cell type:code id:04e09b22-c681-48ea-b86c-44a643400ebc tags:

```python
# evaluating synthetic data with table_evaluator: cumulative sums per feature and distributions
table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols=categorical_columns)
table_evaluator.visual_evaluation()
```
%% Cell type:code id:ed1cd643-0342-4bfb-a217-443337684a38 tags:

```python
# saving and visualizing column pair trends and column shapes
metadata = metadata.to_dict()
my_report = QualityReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath='TabFairGAN_Results/Bot/quality.pkl')
my_report.get_visualization(property_name='Column Pair Trends')
```
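Beyond the 'Column Pair Trends' plot, the sdmetrics QualityReport also exposes an aggregate score and a per-property breakdown, which can be handy when comparing runs; a short optional addition (not in the original notebook):

```python
# Optional: overall 0-1 quality score and per-property averages.
print(my_report.get_score())
print(my_report.get_properties())
```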
%% Cell type:code id:3a1ed789-0cb8-43c3-b4d6-4b842093717c tags:

```python
# saving and visualizing data validity
my_report = DiagnosticReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath='TabFairGAN_Results/Bot/diagnostic.pkl')
my_report.get_visualization('Data Validity')
```
%% Cell type:code id:19b801e3-96cc-474f-971c-cc43990be18d tags:

```python
# range coverage metric
range_coverage = []
for i in real_data.columns:
    y = RangeCoverage.compute(
        real_data=real_data[i],
        synthetic_data=synthetic_data[i]
    )
    range_coverage.append(y)
df = pd.DataFrame(range_coverage, columns=['Range Coverage'])

print(df['Range Coverage'].mean())
```
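For reference, `RangeCoverage` measures how much of the real column's [min, max] interval the synthetic values span, clipped to [0, 1]. A manual cross-check of a single column, assuming the documented SDMetrics formula (not part of the original notebook):

```python
# Manual range-coverage sketch, assuming the documented SDMetrics formula:
# 1 - (uncovered range at both ends) / (real range), clipped at 0.
def range_coverage_manual(real_col, synth_col):
    r_min, r_max = real_col.min(), real_col.max()
    s_min, s_max = synth_col.min(), synth_col.max()
    if r_max == r_min:
        return float('nan')  # degenerate real range
    uncovered = max(s_min - r_min, 0) + max(r_max - s_max, 0)
    return max(1 - uncovered / (r_max - r_min), 0)

col = continuous_columns[0]
print(range_coverage_manual(real_data[col], synthetic_data[col]))
```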
%% Cell type:code id:5f5fd8e3-8d50-43dc-a1f8-783dae4ae7da tags:

```python
# checking the number of unique synthetic data instances
df = pd.concat([real_data, synthetic_data], axis=0)
print(df.shape)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
print(df.shape)
```
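The concat/drop_duplicates step above infers copying indirectly from the drop in row count; the same question can be asked directly by counting synthetic rows that exactly match a real row (exact equality only, so float columns rarely collide). A small sketch, not in the original notebook:

```python
# Direct count of synthetic rows that exactly replicate some real row;
# deduplicate the real side first so each synthetic occurrence counts once.
copies = synthetic_data.merge(real_data.drop_duplicates(), how='inner')
print(f"{len(copies)} synthetic rows also appear in the real data")
```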
%% Cell type:code id:08386892-ce3f-45f8-be6e-11f184572a80 tags:

```python
# Saving the distribution of each column
def sanitize_column_name(column_name):
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    return ''.join(c for c in column_name if c in valid_chars)

for i in real_data.columns:
    fig = get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        column_name=i,
        plot_type='bar'
    )

    sanitized_column_name = sanitize_column_name(i)

    # Save the figure in the 'Pics' directory; change the location accordingly
    py.write_image(fig, os.path.join('TabFairGAN_Results/Bot/Pics', f"{sanitized_column_name}.png"))
```
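One practical caveat: `py.write_image` fails if the target directory does not exist (and static export also requires the kaleido package to be installed), so it can help to create the folder first:

```python
# Ensure the output directory exists before writing the PNGs.
os.makedirs('TabFairGAN_Results/Bot/Pics', exist_ok=True)
```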