Commit 9b2f5dff authored by Sayed Saeedi

Code of TabFairGAN

parent 1a322564
%% Cell type:code id:0e235976-62c1-4d28-b7a8-bae9e4ab7c8d tags:
``` python
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.reports.single_table import DiagnosticReport
from table_evaluator import TableEvaluator
import matplotlib.pyplot as plt
from sdmetrics.single_column import StatisticSimilarity
import math
from sdmetrics.single_column import RangeCoverage
from sdmetrics.visualization import get_column_plot
import os
import plotly.io as py
import string
```
%% Cell type:code id:1822d860-da20-4bf3-942f-dab5b7ec6050 tags:
``` python
# loading the preprocessed datasets: uncomment exactly one of the lines below
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/benign.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bot_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/bruteforce_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/doS_attacks.csv')
# real_data = pd.read_csv('Datasets/Preprocessed_Datasets/infilteration_attacks.csv')
print(real_data.shape)
print(real_data.Label.unique())
# if bruteforce_attacks or doS_attacks is used, uncomment the line below and set the label value according to the dataset
# real_data = real_data[real_data.Label == 'SSH-Bruteforce']
# keep only the first 300,000 rows
real_data = real_data.iloc[:300000, :]
print(real_data.shape)
```
%% Cell type:code id:e35c1b44-91c7-4b03-a381-b9092540a2bf tags:
``` python
# Manually set hyperparameters (the argparse interface in the next cell is bypassed)
class Args:
    command = 'no_fairness'  # this notebook runs the no-fairness branch; required by the code below
    num_epochs = 700
    batch_size = 100
    fake_name = 'TabFairGAN_Results/test.csv'  # output location, change accordingly
    size_of_fake_data = 300000  # number of instances to be generated
args = Args()
```
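%% Cell type:markdown id: tags:
The cell above configures only the `no_fairness` branch. If the fairness-constrained branch were wanted instead, `Args` would also need the fields consumed by the `with_fairness` code path in the next cell. The sketch below is illustrative only; the attribute values are placeholders, not taken from the original notebook.
%% Cell type:code id: tags:
``` python
# Hypothetical configuration for the with_fairness branch; all values below are placeholders
class ArgsWithFairness:
    command = 'with_fairness'
    df_name = 'Datasets/Preprocessed_Datasets/benign.csv'  # placeholder path to the reference dataframe
    S = 'protected_attribute'          # placeholder: protected attribute column
    Y = 'Label'                        # decision / label column
    underprivileged_value = 'group_a'  # placeholder value of the underprivileged group
    desirable_value = 'Benign'         # placeholder desired label value
    num_epochs = 700
    batch_size = 100
    num_fair_epochs = 50               # placeholder: number of fairness-training epochs
    lambda_val = 0.5                   # placeholder fairness weight
    fake_name = 'TabFairGAN_Results/test_fair.csv'
    size_of_fake_data = 300000
# args = ArgsWithFairness()  # uncomment to run the fairness-constrained branch instead
```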
%% Cell type:code id:1811d74a-b27d-4f1f-96a8-85fe54f2cc41 tags:
``` python
#TabFairGAN code directly copied from GitHub repo
#https://github.com/amirarsalan90/TabFairGAN
import torch
import torch.nn.functional as f
from torch import nn
import pandas as pd
import numpy as np
from collections import OrderedDict
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
import argparse
'''parser = argparse.ArgumentParser()
subparser = parser.add_subparsers(dest='command')
with_fairness = subparser.add_parser('with_fairness')
no_fairness = subparser.add_parser('no_fairness')
with_fairness.add_argument("df_name", help="Reference dataframe", type=str)
with_fairness.add_argument("S", help="Protected attribute", type=str)
with_fairness.add_argument("Y", help="Label (decision)", type=str)
with_fairness.add_argument("underprivileged_value", help="Value for underpriviledged group", type=str)
with_fairness.add_argument("desirable_value", help="Desired label (decision)", type=str)
with_fairness.add_argument("num_epochs", help="Total number of epochs", type=int)
with_fairness.add_argument("batch_size", help="the batch size", type=int)
with_fairness.add_argument("num_fair_epochs", help="number of fair training epochs", type=int)
with_fairness.add_argument("lambda_val", help="lambda parameter", type=float)
with_fairness.add_argument("fake_name", help="name of the produced csv file", type=str)
with_fairness.add_argument("size_of_fake_data", help="how many data records to generate", type=int)
no_fairness.add_argument("df_name", help="Reference dataframe", type=str)
no_fairness.add_argument("num_epochs", help="Total number of epochs", type=int)
no_fairness.add_argument("batch_size", help="the batch size", type=int)
no_fairness.add_argument("fake_name", help="name of the produced csv file", type=str)
no_fairness.add_argument("size_of_fake_data", help="how many data records to generate", type=int)
args = parser.parse_args()'''
if args.command == 'with_fairness':
S = args.S
Y = args.Y
S_under = args.underprivileged_value
Y_desire = args.desirable_value
df = pd.read_csv(args.df_name)
df[S] = df[S].astype(object)
df[Y] = df[Y].astype(object)
elif args.command == 'no_fairness':
df = real_data
if args.command == "with_fairness":
def get_ohe_data(df):
df_int = df.select_dtypes(['float', 'integer']).values
continuous_columns_list = list(df.select_dtypes(['float', 'integer']).columns)
##############################################################
scaler = QuantileTransformer(n_quantiles=2000, output_distribution='uniform')
df_int = scaler.fit_transform(df_int)
df_cat = df.select_dtypes('object')
df_cat_names = list(df.select_dtypes('object').columns)
numerical_array = df_int
ohe = OneHotEncoder()
ohe_array = ohe.fit_transform(df_cat)
cat_lens = [i.shape[0] for i in ohe.categories_]
discrete_columns_ordereddict = OrderedDict(zip(df_cat_names, cat_lens))
S_start_index = len(continuous_columns_list) + sum(
list(discrete_columns_ordereddict.values())[:list(discrete_columns_ordereddict.keys()).index(S)])
Y_start_index = len(continuous_columns_list) + sum(
list(discrete_columns_ordereddict.values())[:list(discrete_columns_ordereddict.keys()).index(Y)])
if ohe.categories_[list(discrete_columns_ordereddict.keys()).index(S)][0] == S_under:
underpriv_index = 0
priv_index = 1
else:
underpriv_index = 1
priv_index = 0
if ohe.categories_[list(discrete_columns_ordereddict.keys()).index(Y)][0] == Y_desire:
desire_index = 0
undesire_index = 1
else:
desire_index = 1
undesire_index = 0
final_array = np.hstack((numerical_array, ohe_array.toarray()))
return ohe, scaler, discrete_columns_ordereddict, continuous_columns_list, final_array, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index
elif args.command == "no_fairness":
def get_ohe_data(df):
df_int = df.select_dtypes(['float', 'integer']).values
continuous_columns_list = list(df.select_dtypes(['float', 'integer']).columns)
##############################################################
scaler = QuantileTransformer(n_quantiles=2000, output_distribution='uniform')
df_int = scaler.fit_transform(df_int)
df_cat = df.select_dtypes('object')
df_cat_names = list(df.select_dtypes('object').columns)
numerical_array = df_int
ohe = OneHotEncoder()
ohe_array = ohe.fit_transform(df_cat)
cat_lens = [i.shape[0] for i in ohe.categories_]
discrete_columns_ordereddict = OrderedDict(zip(df_cat_names, cat_lens))
final_array = np.hstack((numerical_array, ohe_array.toarray()))
return ohe, scaler, discrete_columns_ordereddict, continuous_columns_list, final_array
def get_original_data(df_transformed, df_orig, ohe, scaler):
df_ohe_int = df_transformed[:, :df_orig.select_dtypes(['float', 'integer']).shape[1]]
df_ohe_int = scaler.inverse_transform(df_ohe_int)
df_ohe_cats = df_transformed[:, df_orig.select_dtypes(['float', 'integer']).shape[1]:]
df_ohe_cats = ohe.inverse_transform(df_ohe_cats)
df_int = pd.DataFrame(df_ohe_int, columns=df_orig.select_dtypes(['float', 'integer']).columns)
df_cat = pd.DataFrame(df_ohe_cats, columns=df_orig.select_dtypes('object').columns)
return pd.concat([df_int, df_cat], axis=1)
if args.command == "with_fairness":
def prepare_data(df, batch_size):
ohe, scaler, discrete_columns, continuous_columns, df_transformed, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index = get_ohe_data(df)
input_dim = 100
X_train, X_test = train_test_split(df_transformed,test_size=0.1, shuffle=True)
data_train = X_train.copy()
data_test = X_test.copy()
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
data = torch.from_numpy(data_train).float()
train_ds = TensorDataset(data)
train_dl = DataLoader(train_ds, batch_size = batch_size, drop_last=True)
return ohe, scaler, input_dim, discrete_columns, continuous_columns ,train_dl, data_train, data_test, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index
elif args.command == "no_fairness":
def prepare_data(df, batch_size):
#df = pd.concat([df_train, df_test], axis=0)
ohe, scaler, discrete_columns, continuous_columns, df_transformed = get_ohe_data(df)
input_dim = df_transformed.shape[1]
#from sklearn.model_selection import train_test_split
#################
X_train, X_test = train_test_split(df_transformed,test_size=0.1, shuffle=True) #random_state=10)
#X_train = df_transformed[:df_train.shape[0],:]
#X_test = df_transformed[df_train.shape[0]:,:]
data_train = X_train.copy()
data_test = X_test.copy()
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
data = torch.from_numpy(data_train).float()
train_ds = TensorDataset(data)
train_dl = DataLoader(train_ds, batch_size = batch_size, drop_last=True)
return ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test
class Generator(nn.Module):
def __init__(self, input_dim, continuous_columns, discrete_columns):
super(Generator, self).__init__()
self._input_dim = input_dim
self._discrete_columns = discrete_columns
self._num_continuous_columns = len(continuous_columns)
self.lin1 = nn.Linear(self._input_dim, self._input_dim)
self.lin_numerical = nn.Linear(self._input_dim, self._num_continuous_columns)
self.lin_cat = nn.ModuleDict()
for key, value in self._discrete_columns.items():
self.lin_cat[key] = nn.Linear(self._input_dim, value)
def forward(self, x):
x = torch.relu(self.lin1(x))
# x = f.leaky_relu(self.lin1(x))
# x_numerical = f.leaky_relu(self.lin_numerical(x))
x_numerical = f.relu(self.lin_numerical(x))
x_cat = []
for key in self.lin_cat:
x_cat.append(f.gumbel_softmax(self.lin_cat[key](x), tau=0.2))
x_final = torch.cat((x_numerical, *x_cat), 1)
return x_final
class Critic(nn.Module):
def __init__(self, input_dim):
super(Critic, self).__init__()
self._input_dim = input_dim
# self.dense1 = nn.Linear(109, 256)
self.dense1 = nn.Linear(self._input_dim, self._input_dim)
self.dense2 = nn.Linear(self._input_dim, self._input_dim)
# self.dense3 = nn.Linear(256, 1)
# self.drop = nn.Dropout(p=0.2)
# self.activation = nn.Sigmoid()
def forward(self, x):
x = f.leaky_relu(self.dense1(x))
# x = self.drop(x)
# x = f.leaky_relu(self.dense2(x))
x = f.leaky_relu(self.dense2(x))
# x = self.drop(x)
return x
class FairLossFunc(nn.Module):
def __init__(self, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index):
super(FairLossFunc, self).__init__()
self._S_start_index = S_start_index
self._Y_start_index = Y_start_index
self._underpriv_index = underpriv_index
self._priv_index = priv_index
self._undesire_index = undesire_index
self._desire_index = desire_index
def forward(self, x, crit_fake_pred, lamda):
G = x[:, self._S_start_index:self._S_start_index + 2]
# print(x[0,64])
I = x[:, self._Y_start_index:self._Y_start_index + 2]
# disp = (torch.mean(G[:,1]*I[:,1])/(x[:,65].sum())) - (torch.mean(G[:,0]*I[:,0])/(x[:,64].sum()))
# disp = -1.0 * torch.tanh(torch.mean(G[:,0]*I[:,1])/(x[:,64].sum()) - torch.mean(G[:,1]*I[:,1])/(x[:,65].sum()))
# gen_loss = -1.0 * torch.mean(crit_fake_pred)
disp = -1.0 * lamda * (torch.mean(G[:, self._underpriv_index] * I[:, self._desire_index]) / (
x[:, self._S_start_index + self._underpriv_index].sum()) - torch.mean(
G[:, self._priv_index] * I[:, self._desire_index]) / (
x[:, self._S_start_index + self._priv_index].sum())) - 1.0 * torch.mean(
crit_fake_pred)
# print(disp)
return disp
device = torch.device("cuda:0" if (torch.cuda.is_available() and 1 > 0) else "cpu")
def get_gradient(crit, real, fake, epsilon):
mixed_data = real * epsilon + fake * (1 - epsilon)
mixed_scores = crit(mixed_data)
gradient = torch.autograd.grad(
inputs=mixed_data,
outputs=mixed_scores,
grad_outputs=torch.ones_like(mixed_scores),
create_graph=True,
retain_graph=True,
)[0]
return gradient
def gradient_penalty(gradient):
gradient = gradient.view(len(gradient), -1)
gradient_norm = gradient.norm(2, dim=1)
penalty = torch.mean((gradient_norm - 1) ** 2)
return penalty
def get_gen_loss(crit_fake_pred):
gen_loss = -1. * torch.mean(crit_fake_pred)
return gen_loss
def get_crit_loss(crit_fake_pred, crit_real_pred, gp, c_lambda):
crit_loss = torch.mean(crit_fake_pred) - torch.mean(crit_real_pred) + c_lambda * gp
return crit_loss
display_step = 50
def train(df, epochs=500, batch_size=64, fair_epochs=10, lamda=0.5):
if args.command == "with_fairness":
ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test, S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index = prepare_data(df, batch_size)
elif args.command == "no_fairness":
ohe, scaler, input_dim, discrete_columns, continuous_columns, train_dl, data_train, data_test = prepare_data(df, batch_size)
generator = Generator(input_dim, continuous_columns, discrete_columns).to(device)
critic = Critic(input_dim).to(device)
if args.command == "with_fairness":
second_critic = FairLossFunc(S_start_index, Y_start_index, underpriv_index, priv_index, undesire_index, desire_index).to(device)
gen_optimizer = torch.optim.Adam(generator.parameters(), lr=0.00001, betas=(0.5, 0.999)) # original lr=0.002, betas=(0.5, 0.999))
gen_optimizer_fair = torch.optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999))
crit_optimizer = torch.optim.Adam(critic.parameters(), lr=0.00001, betas=(0.5, 0.999)) # original lr=0.002, betas=(0.5, 0.999))
# loss = nn.BCELoss()
critic_losses = []
cur_step = 0
for i in range(epochs):
# j = 0
print("epoch {}".format(i + 1))
############################
if i + 1 <= (epochs - fair_epochs):
print("training for accuracy")
if i + 1 > (epochs - fair_epochs):
print("training for fairness")
for data in train_dl:
data[0] = data[0].to(device)
crit_repeat = 4
mean_iteration_critic_loss = 0
for k in range(crit_repeat):
# training the critic
crit_optimizer.zero_grad()
fake_noise = torch.randn(size=(batch_size, input_dim), device=device).float()
fake = generator(fake_noise)
crit_fake_pred = critic(fake.detach())
crit_real_pred = critic(data[0])
epsilon = torch.rand(batch_size, input_dim, device=device, requires_grad=True)
gradient = get_gradient(critic, data[0], fake.detach(), epsilon)
gp = gradient_penalty(gradient)
crit_loss = get_crit_loss(crit_fake_pred, crit_real_pred, gp, c_lambda=10) #original c_lambda= 10
mean_iteration_critic_loss += crit_loss.item() / crit_repeat
crit_loss.backward(retain_graph=True)
crit_optimizer.step()
#############################
if cur_step > 50:
critic_losses += [mean_iteration_critic_loss]
#############################
if i + 1 <= (epochs - fair_epochs):
# training the generator for accuracy
gen_optimizer.zero_grad()
fake_noise_2 = torch.randn(size=(batch_size, input_dim), device=device).float()
fake_2 = generator(fake_noise_2)
crit_fake_pred = critic(fake_2)
gen_loss = get_gen_loss(crit_fake_pred)
gen_loss.backward()
# Update the weights
gen_optimizer.step()
###############################
if i + 1 > (epochs - fair_epochs):
# training the generator for fairness
gen_optimizer_fair.zero_grad()
fake_noise_2 = torch.randn(size=(batch_size, input_dim), device=device).float()
fake_2 = generator(fake_noise_2)
crit_fake_pred = critic(fake_2)
gen_fair_loss = second_critic(fake_2, crit_fake_pred, lamda)
gen_fair_loss.backward()
gen_optimizer_fair.step()
cur_step += 1
return generator, critic, ohe, scaler, data_train, data_test, input_dim
def train_plot(df, epochs, batchsize, fair_epochs, lamda):
generator, critic, ohe, scaler, data_train, data_test, input_dim = train(df, epochs, batchsize, fair_epochs, lamda)
return generator, critic, ohe, scaler, data_train, data_test, input_dim
if args.command == "with_fairness":
generator, critic, ohe, scaler, data_train, data_test, input_dim = train_plot(df, args.num_epochs, args.batch_size, args.num_fair_epochs, args.lambda_val)
elif args.command == "no_fairness":
generator, critic, ohe, scaler, data_train, data_test, input_dim = train_plot(df, args.num_epochs, args.batch_size, 0, 0)
fake_numpy_array = generator(torch.randn(size=(args.size_of_fake_data, input_dim), device=device)).cpu().detach().numpy()
fake_df = get_original_data(fake_numpy_array, df, ohe, scaler)
fake_df = fake_df[df.columns]
fake_df.to_csv(args.fake_name, index=False)
```
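%% Cell type:markdown id: tags:
An optional sanity check after generation (a minimal sketch, assuming the cell above has just been run so that `df` and `fake_df` are still in memory): compare the label distribution of the real data with that of the freshly generated data before running the full evaluation below.
%% Cell type:code id: tags:
``` python
# Optional: compare label proportions of real vs. generated data (quick sanity check)
print(df['Label'].value_counts(normalize=True))
print(fake_df['Label'].value_counts(normalize=True))
```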
%% Cell type:code id:8ae0ef89-6190-4f33-927f-3f1e79206786 tags:
``` python
# load the generated synthetic data
synthetic_data = pd.read_csv('TabFairGAN_Results/test.csv') # change location accordingly
```
%% Cell type:code id:cf298876-d496-45ed-b083-01bb162dec27 tags:
``` python
def get_data_info(df):
    """Creates the categorical columns, continuous columns, and metadata of a dataframe.
    Args:
        df (pandas.DataFrame): The input dataframe containing continuous and categorical values.
    Returns:
        list: The list of categorical column names (here only 'Label').
        list: The list of continuous column names.
        metadata: The metadata of the dataframe. For more information visit https://docs.sdv.dev/sdv/reference/metadata-spec/single-table-metadata-json
    """
    # creating the column lists
    categorical_columns = ['Label']
    continuous_columns = []
    for i in df.columns:
        if i not in categorical_columns:
            continuous_columns.append(i)
    # creating metadata
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)
    for column in categorical_columns:
        metadata.update_column(
            column_name = column,
            sdtype = 'categorical'
        )
    for column in continuous_columns:
        metadata.update_column(
            column_name = column,
            sdtype = 'numerical'
        )
    # validating metadata
    metadata.validate()
    metadata.validate_data(data=df)
    return categorical_columns, continuous_columns, metadata
categorical_columns, continuous_columns, metadata = get_data_info(real_data)
```
%% Cell type:code id:04e09b22-c681-48ea-b86c-44a643400ebc tags:
``` python
# evaluating synthetic data with TableEvaluator: cumulative sums per feature and distributions
table_evaluator = TableEvaluator(real_data, synthetic_data, cat_cols = categorical_columns)
table_evaluator.visual_evaluation()
```
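%% Cell type:markdown id: tags:
Besides the visual evaluation, the same object can produce a numerical summary. This is a minimal sketch, assuming the installed table_evaluator version exposes an `evaluate(target_col=...)` method that fits simple estimators on real vs. synthetic data:
%% Cell type:code id: tags:
``` python
# Optional numerical evaluation; 'Label' is the target column used elsewhere in this notebook
table_evaluator.evaluate(target_col='Label')
```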
%% Cell type:code id:ed1cd643-0342-4bfb-a217-443337684a38 tags:
``` python
# saving and visualizing column pair trends and column shapes
metadata = metadata.to_dict()  # the sdmetrics reports expect the metadata as a dict
my_report = QualityReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath='TabFairGAN_Results/Bot/quality.pkl')
my_report.get_visualization(property_name='Column Pair Trends')
```
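%% Cell type:markdown id: tags:
The quality report also exposes numerical scores that can be logged alongside the saved pickle (a small sketch using the standard sdmetrics report accessors):
%% Cell type:code id: tags:
``` python
# Overall quality score and per-property breakdown of the report generated above
print(my_report.get_score())
print(my_report.get_properties())
print(my_report.get_details(property_name='Column Shapes'))
```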
%% Cell type:code id:3a1ed789-0cb8-43c3-b4d6-4b842093717c tags:
``` python
# saving and visualizing data validity
my_report = DiagnosticReport()
my_report.generate(real_data, synthetic_data, metadata)
my_report.save(filepath='TabFairGAN_Results/Bot/diagnostic.pkl')
my_report.get_visualization('Data Validity')
```
%% Cell type:code id:19b801e3-96cc-474f-971c-cc43990be18d tags:
``` python
#range coverage metric
range_coverage=[]
for i in real_data.columns:
y=RangeCoverage.compute(
real_data=real_data[i],
synthetic_data=synthetic_data[i]
)
range_coverage.append(y)
df = pd.DataFrame(range_coverage, columns=['Range Coverage'])
print(df['Range Coverage'].mean())
```
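%% Cell type:markdown id: tags:
StatisticSimilarity is imported at the top of the notebook but not used; a minimal sketch of how it could complement the range-coverage check on the continuous columns (assuming `continuous_columns` from `get_data_info` is still in memory):
%% Cell type:code id: tags:
``` python
# statistic similarity (mean) per continuous column, averaged over all columns
statistic_similarity = []
for i in continuous_columns:
    s = StatisticSimilarity.compute(
        real_data=real_data[i],
        synthetic_data=synthetic_data[i],
        statistic='mean'
    )
    statistic_similarity.append(s)
df = pd.DataFrame(statistic_similarity, columns=['Statistic Similarity'])
print(df['Statistic Similarity'].mean())
```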
%% Cell type:code id:5f5fd8e3-8d50-43dc-a1f8-783dae4ae7da tags:
``` python
# checking the number of unique synthetic instances: rows remaining after dropping NaNs and duplicates from the combined real + synthetic data
df = pd.concat([real_data, synthetic_data], axis=0)
print(df.shape)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
print(df.shape)
```
%% Cell type:code id:08386892-ce3f-45f8-be6e-11f184572a80 tags:
``` python
#Saving the distribution of each column
def sanitize_column_name(column_name):
valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
return ''.join(c for c in column_name if c in valid_chars)
for i in real_data.columns:
fig = get_column_plot(
real_data=real_data,
synthetic_data=synthetic_data,
column_name=i,
plot_type='bar'
)
sanitized_column_name = sanitize_column_name(i)
# Save the figure in the 'Pics' directory, change the location accordingly
py.write_image(fig, os.path.join('TabFairGAN_Results/Bot/Pics', f"{sanitized_column_name}.png"))
```
%% Cell type:code id:ff374e9c-1e80-4804-be56-c25e2ca2bc3a tags:
``` python
```