diff --git a/Models/TabFairGAN.ipynb b/Models/TabFairGAN.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..958b5a403112bc82df9328ce986e86ad9d01590a --- /dev/null +++ b/Models/TabFairGAN.ipynb @@ -0,0 +1,649 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0e235976-62c1-4d28-b7a8-bae9e4ab7c8d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sdv.metadata import SingleTableMetadata\n", + "from sdmetrics.reports.single_table import QualityReport\n", + "from sdmetrics.reports.single_table import DiagnosticReport\n", + "from table_evaluator import TableEvaluator\n", + "import matplotlib.pyplot as plt\n", + "from sdmetrics.single_column import StatisticSimilarity\n", + "import math\n", + "from sdmetrics.single_column import RangeCoverage\n", + "from sdmetrics.visualization import get_column_plot\n", + "import os\n", + "import plotly.io as py\n", + "import string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1822d860-da20-4bf3-942f-dab5b7ec6050", + "metadata": {}, + "outputs": [], + "source": [ + "#loading the preprocessed datasets \n", + "\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')\n", + "# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')\n", + "\n", + "print(real_data.shape)\n", + "print(real_data.Label.unique())\n", + "\n", + "# if bruteforce_attack or dos_attacks are used then uncomment the below line and change the name of the dataset accordingly\n", + "#real_data=real_data[real_data.Label=='SSH-Bruteforce'] # change according to the dataset\n", + "real_data = real_data.iloc[:300000, :]\n", + "print(real_data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e35c1b44-91c7-4b03-a381-b9092540a2bf", + "metadata": {}, + "outputs": [], + "source": [ + "# Manually set hyperparameters\n", + "class Args:\n", + " num_epochs = 700\n", + " batch_size = 100\n", + " fake_name = 'TabFairGAN_Results/test.csv' # location change accordingly\n", + " size_of_fake_data = 300000 # number instances to be generated\n", + "\n", + "args = Args()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1811d74a-b27d-4f1f-96a8-85fe54f2cc41", + "metadata": {}, + "outputs": [], + "source": [ + "#TabFairGAN code directly copied from GitHub repo\n", + "#https://github.com/amirarsalan90/TabFairGAN\n", + "\n", + "import torch\n", + "import torch.nn.functional as f\n", + "from torch import nn\n", + "import pandas as pd\n", + "import numpy as np\n", + "from collections import OrderedDict\n", + "\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.preprocessing import QuantileTransformer\n", + "from sklearn.model_selection import train_test_split\n", + "import argparse\n", + "\n", + "\n", + "'''parser = argparse.ArgumentParser()\n", + "subparser = parser.add_subparsers(dest='command')\n", + "with_fairness = subparser.add_parser('with_fairness')\n", + "no_fairness = subparser.add_parser('no_fairness')\n", + "\n", + "with_fairness.add_argument(\"df_name\", help=\"Reference dataframe\", type=str)\n", + "with_fairness.add_argument(\"S\", help=\"Protected attribute\", type=str)\n", + "with_fairness.add_argument(\"Y\", help=\"Label (decision)\", type=str)\n", + "with_fairness.add_argument(\"underprivileged_value\", help=\"Value for underpriviledged group\", type=str)\n", + "with_fairness.add_argument(\"desirable_value\", help=\"Desired label (decision)\", type=str)\n", + "with_fairness.add_argument(\"num_epochs\", help=\"Total number of epochs\", type=int)\n", + "with_fairness.add_argument(\"batch_size\", help=\"the batch size\", type=int)\n", + "with_fairness.add_argument(\"num_fair_epochs\", help=\"number of fair training epochs\", type=int)\n", + "with_fairness.add_argument(\"lambda_val\", help=\"lambda parameter\", type=float)\n", + "with_fairness.add_argument(\"fake_name\", help=\"name of the produced csv file\", type=str)\n", + "with_fairness.add_argument(\"size_of_fake_data\", help=\"how many data records to generate\", type=int)\n", + "\n", + "\n", + "no_fairness.add_argument(\"df_name\", help=\"Reference dataframe\", type=str)\n", + "no_fairness.add_argument(\"num_epochs\", help=\"Total number of epochs\", type=int)\n", + "no_fairness.add_argument(\"batch_size\", help=\"the batch size\", type=int)\n", + "no_fairness.add_argument(\"fake_name\", help=\"name of the produced csv file\", type=str)\n", + "no_fairness.add_argument(\"size_of_fake_data\", help=\"how many data records to generate\", type=int)\n", + "\n", + "args = parser.parse_args()'''\n", + "\n", + "if args.command == 'with_fairness':\n", + " S = args.S\n", + " Y = args.Y\n", + " S_under = args.underprivileged_value\n", + " Y_desire = args.desirable_value\n", + "\n", + " df = pd.read_csv(args.df_name)\n", + "\n", + " df[S] = df[S].astype(object)\n", + " df[Y] = df[Y].astype(object)\n", + "\n", + "elif args.command == 'no_fairness':\n", + " df = real_data\n", + "\n", + "\n", + "if args.command == \"with_fairness\":\n", + " def get_ohe_data(df):\n", + " df_int = df.select_dtypes(['float', 'integer']).values\n", + " continuous_columns_list = list(df.select_dtypes(['float', 'integer']).columns)\n", + " ##############################################################\n", + " scaler = QuantileTransformer(n_quantiles=2000, output_distribution='uniform')\n", + " df_int = scaler.fit_transform(df_int)\n", + "\n", + " df_cat = df.select_dtypes('object')\n", + " df_cat_names = list(df.select_dtypes('object').columns)\n", + " numerical_array = df_int\n", + " ohe = OneHotEncoder()\n", + " ohe_array = ohe.fit_transform(df_cat)\n", + "\n", + " cat_lens = [i.shape[0] for i in ohe.categories_]\n", + " discrete_columns_ordereddict = OrderedDict(zip(df_cat_names, cat_lens))\n", + "\n", + " S_start_index = len(continuous_columns_list) + sum(\n", + " list(discrete_columns_ordereddict.values())[:list(discrete_columns_ordereddict.keys()).index(S)])\n", + " Y_start_index = len(continuous_columns_list) + sum(\n", + " list(discrete_columns_ordereddict.values())[:list(discrete_columns_ordereddict.keys()).index(Y)])\n", + "\n", + " if ohe.categories_[list(discrete_columns_ordereddict.keys()).index(S)][0] == S_under:\n", + " underpriv_index = 0\n", + " priv_index = 1\n", + " else:\n", + " underpriv_index = 1\n", + " priv_index = 0\n", + " if ohe.categories_[list(discrete_columns_ordereddict.keys()).index(Y)][0] == Y_desire:\n", + " desire_index = 0\n", + " undesire_index = 1\n", + " else:\n", + " desire_index = 1\n", + " undesire_index = 0\n", + "\n", + " final_array = np.hstack((numerical_array, ohe_array.toarray()))\n", + " return ohe, scaler, discrete_columns_ordereddict, continuous_columns_list, final_array, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index\n", + "\n", + "elif args.command == \"no_fairness\":\n", + " def get_ohe_data(df):\n", + " df_int = df.select_dtypes(['float', 'integer']).values\n", + " continuous_columns_list = list(df.select_dtypes(['float', 'integer']).columns)\n", + " ##############################################################\n", + " scaler = QuantileTransformer(n_quantiles=2000, output_distribution='uniform')\n", + " df_int = scaler.fit_transform(df_int)\n", + "\n", + " df_cat = df.select_dtypes('object')\n", + " df_cat_names = list(df.select_dtypes('object').columns)\n", + " numerical_array = df_int\n", + " ohe = OneHotEncoder()\n", + " ohe_array = ohe.fit_transform(df_cat)\n", + "\n", + " cat_lens = [i.shape[0] for i in ohe.categories_]\n", + " discrete_columns_ordereddict = OrderedDict(zip(df_cat_names, cat_lens))\n", + "\n", + "\n", + " final_array = np.hstack((numerical_array, ohe_array.toarray()))\n", + " return ohe, scaler, discrete_columns_ordereddict, continuous_columns_list, final_array\n", + "\n", + "\n", + "def get_original_data(df_transformed, df_orig, ohe, scaler):\n", + " df_ohe_int = df_transformed[:, :df_orig.select_dtypes(['float', 'integer']).shape[1]]\n", + " df_ohe_int = scaler.inverse_transform(df_ohe_int)\n", + " df_ohe_cats = df_transformed[:, df_orig.select_dtypes(['float', 'integer']).shape[1]:]\n", + " df_ohe_cats = ohe.inverse_transform(df_ohe_cats)\n", + " df_int = pd.DataFrame(df_ohe_int, columns=df_orig.select_dtypes(['float', 'integer']).columns)\n", + " df_cat = pd.DataFrame(df_ohe_cats, columns=df_orig.select_dtypes('object').columns)\n", + " return pd.concat([df_int, df_cat], axis=1)\n", + "\n", + "\n", + "if args.command == \"with_fairness\":\n", + " def prepare_data(df, batch_size):\n", + " ohe, scaler, discrete_columns, continuous_columns, df_transformed, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index = get_ohe_data(df)\n", + " input_dim = 100\n", + " X_train, X_test = train_test_split(df_transformed,test_size=0.1, shuffle=True)\n", + " data_train = X_train.copy()\n", + " data_test = X_test.copy()\n", + "\n", + " from torch.utils.data import TensorDataset\n", + " from torch.utils.data import DataLoader\n", + " data = torch.from_numpy(data_train).float()\n", + "\n", + "\n", + " train_ds = TensorDataset(data)\n", + " train_dl = DataLoader(train_ds, batch_size = batch_size, drop_last=True)\n", + " return ohe, scaler, input_dim, discrete_columns, continuous_columns ,train_dl, data_train, data_test, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index\n", + "\n", + "elif args.command == \"no_fairness\":\n", + " def prepare_data(df, batch_size):\n", + " #df = pd.concat([df_train, df_test], axis=0)\n", + "\n", + " ohe, scaler, discrete_columns, continuous_columns, df_transformed = get_ohe_data(df)\n", + "\n", + "\n", + " input_dim = df_transformed.shape[1]\n", + "\n", + " #from sklearn.model_selection import train_test_split\n", + " #################\n", + " X_train, X_test = train_test_split(df_transformed,test_size=0.1, shuffle=True) #random_state=10)\n", + " #X_train = df_transformed[:df_train.shape[0],:]\n", + " #X_test = df_transformed[df_train.shape[0]:,:]\n", + "\n", + " data_train = X_train.copy()\n", + " data_test = X_test.copy()\n", + "\n", + " from torch.utils.data import TensorDataset\n", + " from torch.utils.data import DataLoader\n", + " data = torch.from_numpy(data_train).float()\n", + "\n", + "\n", + " train_ds = TensorDataset(data)\n", + " train_dl = DataLoader(train_ds, batch_size = batch_size, drop_last=True)\n", + " return ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test\n", + "\n", + "\n", + "\n", + "class Generator(nn.Module):\n", + " def __init__(self, input_dim, continuous_columns, discrete_columns):\n", + " super(Generator, self).__init__()\n", + " self._input_dim = input_dim\n", + " self._discrete_columns = discrete_columns\n", + " self._num_continuous_columns = len(continuous_columns)\n", + "\n", + " self.lin1 = nn.Linear(self._input_dim, self._input_dim)\n", + " self.lin_numerical = nn.Linear(self._input_dim, self._num_continuous_columns)\n", + "\n", + " self.lin_cat = nn.ModuleDict()\n", + " for key, value in self._discrete_columns.items():\n", + " self.lin_cat[key] = nn.Linear(self._input_dim, value)\n", + "\n", + " def forward(self, x):\n", + " x = torch.relu(self.lin1(x))\n", + " # x = f.leaky_relu(self.lin1(x))\n", + " # x_numerical = f.leaky_relu(self.lin_numerical(x))\n", + " x_numerical = f.relu(self.lin_numerical(x))\n", + " x_cat = []\n", + " for key in self.lin_cat:\n", + " x_cat.append(f.gumbel_softmax(self.lin_cat[key](x), tau=0.2))\n", + " x_final = torch.cat((x_numerical, *x_cat), 1)\n", + " return x_final\n", + "\n", + "\n", + "class Critic(nn.Module):\n", + " def __init__(self, input_dim):\n", + " super(Critic, self).__init__()\n", + " self._input_dim = input_dim\n", + " # self.dense1 = nn.Linear(109, 256)\n", + " self.dense1 = nn.Linear(self._input_dim, self._input_dim)\n", + " self.dense2 = nn.Linear(self._input_dim, self._input_dim)\n", + " # self.dense3 = nn.Linear(256, 1)\n", + " # self.drop = nn.Dropout(p=0.2)\n", + " # self.activation = nn.Sigmoid()\n", + "\n", + " def forward(self, x):\n", + " x = f.leaky_relu(self.dense1(x))\n", + " # x = self.drop(x)\n", + " # x = f.leaky_relu(self.dense2(x))\n", + " x = f.leaky_relu(self.dense2(x))\n", + " # x = self.drop(x)\n", + " return x\n", + "\n", + "\n", + "class FairLossFunc(nn.Module):\n", + " def __init__(self, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index):\n", + " super(FairLossFunc, self).__init__()\n", + " self._S_start_index = S_start_index\n", + " self._Y_start_index = Y_start_index\n", + " self._underpriv_index = underpriv_index\n", + " self._priv_index = priv_index\n", + " self._undesire_index = undesire_index\n", + " self._desire_index = desire_index\n", + "\n", + " def forward(self, x, crit_fake_pred, lamda):\n", + " G = x[:, self._S_start_index:self._S_start_index + 2]\n", + " # print(x[0,64])\n", + " I = x[:, self._Y_start_index:self._Y_start_index + 2]\n", + " # disp = (torch.mean(G[:,1]*I[:,1])/(x[:,65].sum())) - (torch.mean(G[:,0]*I[:,0])/(x[:,64].sum()))\n", + " # disp = -1.0 * torch.tanh(torch.mean(G[:,0]*I[:,1])/(x[:,64].sum()) - torch.mean(G[:,1]*I[:,1])/(x[:,65].sum()))\n", + " # gen_loss = -1.0 * torch.mean(crit_fake_pred)\n", + " disp = -1.0 * lamda * (torch.mean(G[:, self._underpriv_index] * I[:, self._desire_index]) / (\n", + " x[:, self._S_start_index + self._underpriv_index].sum()) - torch.mean(\n", + " G[:, self._priv_index] * I[:, self._desire_index]) / (\n", + " x[:, self._S_start_index + self._priv_index].sum())) - 1.0 * torch.mean(\n", + " crit_fake_pred)\n", + " # print(disp)\n", + " return disp\n", + "\n", + "device = torch.device(\"cuda:0\" if (torch.cuda.is_available() and 1 > 0) else \"cpu\")\n", + "\n", + "def get_gradient(crit, real, fake, epsilon):\n", + " mixed_data = real * epsilon + fake * (1 - epsilon)\n", + "\n", + " mixed_scores = crit(mixed_data)\n", + "\n", + " gradient = torch.autograd.grad(\n", + " inputs=mixed_data,\n", + " outputs=mixed_scores,\n", + " grad_outputs=torch.ones_like(mixed_scores),\n", + " create_graph=True,\n", + " retain_graph=True,\n", + " )[0]\n", + " return gradient\n", + "\n", + "def gradient_penalty(gradient):\n", + " gradient = gradient.view(len(gradient), -1)\n", + " gradient_norm = gradient.norm(2, dim=1)\n", + "\n", + " penalty = torch.mean((gradient_norm - 1) ** 2)\n", + " return penalty\n", + "\n", + "\n", + "def get_gen_loss(crit_fake_pred):\n", + " gen_loss = -1. * torch.mean(crit_fake_pred)\n", + "\n", + " return gen_loss\n", + "\n", + "\n", + "def get_crit_loss(crit_fake_pred, crit_real_pred, gp, c_lambda):\n", + " crit_loss = torch.mean(crit_fake_pred) - torch.mean(crit_real_pred) + c_lambda * gp\n", + "\n", + " return crit_loss\n", + "\n", + "\n", + "display_step = 50\n", + "\n", + "\n", + "def train(df, epochs=500, batch_size=64, fair_epochs=10, lamda=0.5):\n", + " if args.command == \"with_fairness\":\n", + " ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index = prepare_data(df, batch_size)\n", + " elif args.command == \"no_fairness\":\n", + " ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test = prepare_data(df, batch_size)\n", + "\n", + " generator = Generator(input_dim, continuous_columns, discrete_columns).to(device)\n", + " critic = Critic(input_dim).to(device)\n", + " if args.command == \"with_fairness\":\n", + " second_critic = FairLossFunc(S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index).to(device)\n", + "\n", + " gen_optimizer = torch.optim.Adam(generator.parameters(), lr=0.00001, betas=(0.5, 0.999)) # original lr=0.002, betas=(0.5, 0.999))\n", + " gen_optimizer_fair = torch.optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999))\n", + " crit_optimizer = torch.optim.Adam(critic.parameters(), lr=0.00001, betas=(0.5, 0.999)) # original lr=0.002, betas=(0.5, 0.999))\n", + "\n", + " # loss = nn.BCELoss()\n", + " critic_losses = []\n", + " cur_step = 0\n", + " for i in range(epochs):\n", + " # j = 0\n", + " print(\"epoch {}\".format(i + 1))\n", + " ############################\n", + " if i + 1 <= (epochs - fair_epochs):\n", + " print(\"training for accuracy\")\n", + " if i + 1 > (epochs - fair_epochs):\n", + " print(\"training for fairness\")\n", + " for data in train_dl:\n", + " data[0] = data[0].to(device)\n", + " crit_repeat = 4\n", + " mean_iteration_critic_loss = 0\n", + " for k in range(crit_repeat):\n", + " # training the critic\n", + " crit_optimizer.zero_grad()\n", + " fake_noise = torch.randn(size=(batch_size, input_dim), device=device).float()\n", + " fake = generator(fake_noise)\n", + "\n", + " crit_fake_pred = critic(fake.detach())\n", + " crit_real_pred = critic(data[0])\n", + "\n", + " epsilon = torch.rand(batch_size, input_dim, device=device, requires_grad=True)\n", + " gradient = get_gradient(critic, data[0], fake.detach(), epsilon)\n", + " gp = gradient_penalty(gradient)\n", + "\n", + " crit_loss = get_crit_loss(crit_fake_pred, crit_real_pred, gp, c_lambda=10) #original c_lambda= 10\n", + "\n", + " mean_iteration_critic_loss += crit_loss.item() / crit_repeat\n", + " crit_loss.backward(retain_graph=True)\n", + " crit_optimizer.step()\n", + " #############################\n", + " if cur_step > 50:\n", + " critic_losses += [mean_iteration_critic_loss]\n", + "\n", + " #############################\n", + " if i + 1 <= (epochs - fair_epochs):\n", + " # training the generator for accuracy\n", + " gen_optimizer.zero_grad()\n", + " fake_noise_2 = torch.randn(size=(batch_size, input_dim), device=device).float()\n", + " fake_2 = generator(fake_noise_2)\n", + " crit_fake_pred = critic(fake_2)\n", + "\n", + " gen_loss = get_gen_loss(crit_fake_pred)\n", + " gen_loss.backward()\n", + "\n", + " # Update the weights\n", + " gen_optimizer.step()\n", + "\n", + " ###############################\n", + " if i + 1 > (epochs - fair_epochs):\n", + " # training the generator for fairness\n", + " gen_optimizer_fair.zero_grad()\n", + " fake_noise_2 = torch.randn(size=(batch_size, input_dim), device=device).float()\n", + " fake_2 = generator(fake_noise_2)\n", + "\n", + " crit_fake_pred = critic(fake_2)\n", + "\n", + " gen_fair_loss = second_critic(fake_2, crit_fake_pred, lamda)\n", + " gen_fair_loss.backward()\n", + " gen_optimizer_fair.step()\n", + " cur_step += 1\n", + "\n", + " return generator, critic, ohe, scaler, data_train, data_test, input_dim\n", + "\n", + "\n", + "def train_plot(df, epochs, batchsize, fair_epochs, lamda):\n", + " generator, critic, ohe, scaler, data_train, data_test, input_dim = train(df, epochs, batchsize, fair_epochs, lamda)\n", + " return generator, critic, ohe, scaler, data_train, data_test, input_dim\n", + "\n", + "\n", + "if args.command == \"with_fairness\":\n", + " generator, critic, ohe, scaler, data_train, data_test, input_dim = train_plot(df, args.num_epochs, args.batch_size, args.num_fair_epochs, args.lambda_val)\n", + "elif args.command == \"no_fairness\":\n", + " generator, critic, ohe, scaler, data_train, data_test, input_dim = train_plot(df, args.num_epochs, args.batch_size, 0, 0)\n", + "fake_numpy_array = generator(torch.randn(size=(args.size_of_fake_data, input_dim), device=device)).cpu().detach().numpy()\n", + "fake_df = get_original_data(fake_numpy_array, df, ohe, scaler)\n", + "fake_df = fake_df[df.columns]\n", + "fake_df.to_csv(args.fake_name, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ae0ef89-6190-4f33-927f-3f1e79206786", + "metadata": {}, + "outputs": [], + "source": [ + "#save file\n", + "synthetic_data = pd.read_csv('TabFairGAN_Results/test.csv') # change loation accordingly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf298876-d496-45ed-b083-01bb162dec27", + "metadata": {}, + "outputs": [], + "source": [ + "def get_data_info(df):\n", + " \"\"\"Crates the categorical columns, continuous columns, and metadata of a dataframe.\n", + "\n", + " Args:\n", + " df (pandas.Dataframe): The input dataframe containing continuous and categorical values.\n", + "\n", + " Returns:\n", + " list: the list of categorical column names. Specifically, columns with only 4 uniques values\n", + " list: The list of continuous column names.\n", + " metadata: The metadata of the dataframe. for more informatin visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json\n", + " \"\"\"\n", + " #createing \n", + " categorical_columns = ['Label']\n", + " continuous_columns = []\n", + " for i in df.columns:\n", + " if i not in categorical_columns:\n", + " continuous_columns.append(i)\n", + " \n", + " #creating metadat\n", + " metadata = SingleTableMetadata()\n", + " metadata.detect_from_dataframe(df)\n", + " \n", + " for column in categorical_columns:\n", + " metadata.update_column(\n", + " column_name = column,\n", + " sdtype = 'categorical'\n", + " )\n", + " \n", + " for column in continuous_columns:\n", + " metadata.update_column(\n", + " column_name = column,\n", + " sdtype = 'numerical' \n", + " )\n", + " # validating metadata\n", + " metadata.validate()\n", + " metadata.validate_data(data=real_data)\n", + " \n", + " return categorical_columns, continuous_columns, metadata\n", + "\n", + "\n", + "categorical_columns, continuous_columns, metadata = get_data_info(real_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04e09b22-c681-48ea-b86c-44a643400ebc", + "metadata": {}, + "outputs": [], + "source": [ + "# evaluating synthetic data with table_evaluator cumulative sum per features and distribution\n", + "table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)\n", + "table_evaluator.visual_evaluation()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed1cd643-0342-4bfb-a217-443337684a38", + "metadata": {}, + "outputs": [], + "source": [ + "#saving and visualizing column pair trend and column shapes\n", + "metadata = metadata.to_dict()\n", + "my_report = QualityReport()\n", + "my_report.generate(real_data, synthetic_data, metadata)\n", + "my_report.save(filepath='TabFairGAN_Results/Bot/quality.pkl')\n", + "my_report.get_visualization(property_name='Column Pair Trends')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a1ed789-0cb8-43c3-b4d6-4b842093717c", + "metadata": {}, + "outputs": [], + "source": [ + "#saving and visualiztation data validity\n", + "my_report = DiagnosticReport()\n", + "my_report.generate(real_data, synthetic_data, metadata)\n", + "my_report.save(filepath='TabFairGAN_Results/Bot/diagnostic.pkl')\n", + "my_report.get_visualization('Data Validity')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19b801e3-96cc-474f-971c-cc43990be18d", + "metadata": {}, + "outputs": [], + "source": [ + "#range coverage metric\n", + "range_coverage=[]\n", + "for i in real_data.columns:\n", + " \n", + " y=RangeCoverage.compute(\n", + " real_data=real_data[i],\n", + " synthetic_data=synthetic_data[i]\n", + " )\n", + " range_coverage.append(y)\n", + "df = pd.DataFrame(range_coverage, columns=['Range Coverage'])\n", + "\n", + "print(df['Range Coverage'].mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f5fd8e3-8d50-43dc-a1f8-783dae4ae7da", + "metadata": {}, + "outputs": [], + "source": [ + "# checking the number of unique synthetic data instances\n", + "df = pd.concat([real_data, synthetic_data], axis=0)\n", + "print(df.shape)\n", + "df.dropna(inplace=True)\n", + "df.drop_duplicates(inplace=True)\n", + "print(df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08386892-ce3f-45f8-be6e-11f184572a80", + "metadata": {}, + "outputs": [], + "source": [ + "#Saving the distribution of each column\n", + "def sanitize_column_name(column_name):\n", + " valid_chars = \"-_.() %s%s\" % (string.ascii_letters, string.digits)\n", + " return ''.join(c for c in column_name if c in valid_chars)\n", + "\n", + "for i in real_data.columns:\n", + " fig = get_column_plot(\n", + " real_data=real_data,\n", + " synthetic_data=synthetic_data,\n", + " column_name=i,\n", + " plot_type='bar'\n", + " )\n", + "\n", + " sanitized_column_name = sanitize_column_name(i)\n", + "\n", + " # Save the figure in the 'Pics' directory, change the location accordingly\n", + " py.write_image(fig, os.path.join('TabFairGAN_Results/Bot/Pics', f\"{sanitized_column_name}.png\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff374e9c-1e80-4804-be56-c25e2ca2bc3a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}