Commit d737673d authored by Sayed Saeedi

Code for the RF and XGB classifiers

parent 89efa2a9
%% Cell type:code id:7d727b21-135b-40f7-89bd-559cd7b2d681 tags:
``` python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
```
%% Cell type:code id:40b9b07a-2b08-4022-9553-d99fd4140647 tags:
``` python
# Uncomment whichever dataset (real or synthetic) is to be used for training
#df = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data
#df = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')
#df = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')
#df = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')
#df = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')
#df = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')
#df = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')
df.drop_duplicates(inplace=True)
df.drop(columns='Timestamp', inplace=True)
df.shape
```
%% Cell type:code id:fc1ec957-4b17-4f7b-b2a4-04ab125d7f57 tags:
``` python
# Splitting the dataset into features and target
X = df.drop('Label', axis=1)
y = df['Label']
# Split the data into training and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
```
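%% Cell type:markdown tags:
A quick sanity check, added here as an illustration: with `stratify=y` the class proportions of `Label` should be (almost) identical in the training and test partitions.
%% Cell type:code tags:
``` python
# Illustrative check (not in the original notebook): compare class proportions
# in the stratified train and test splits.
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))
```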
%% Cell type:code id:b4b3769f-d75f-457e-b8c5-ea8cdc7d2190 tags:
``` python
# Recombining the training data for preprocessing
df = pd.concat([X_train, y_train], axis=1)
```
%% Cell type:code id:bdc2739f-2924-4df4-841c-589f9f79b918 tags:
``` python
%%time
# Dropping constant (zero-variance) columns
variances = df.var(numeric_only=True)
constant_columns = variances[variances == 0].index
df = df.drop(constant_columns, axis=1)
print(constant_columns)
print(df.shape)
```
%% Cell type:code id:3b772bc3-ed7a-4497-bfb5-03561653afc3 tags:
``` python
%%time
# Dropping duplicate columns
duplicates = set()
for i in range(0, len(df.columns)):
    col1 = df.columns[i]
    for j in range(i + 1, len(df.columns)):
        col2 = df.columns[j]
        if df[col1].equals(df[col2]):
            duplicates.add(col2)
print(duplicates)
df.drop(duplicates, axis=1, inplace=True)
print(df.shape)
```
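%% Cell type:markdown tags:
As an aside, pandas can flag duplicate columns directly by hashing the transposed frame. The sketch below is an alternative to the pairwise loop above and is left commented out because the duplicates have already been dropped.
%% Cell type:code tags:
``` python
# Alternative sketch: DataFrame.duplicated() on the transpose marks columns
# whose values repeat an earlier column; same result as the loop above,
# typically faster on wide frames.
# duplicate_cols = df.columns[df.T.duplicated().values]
# df = df.drop(columns=list(duplicate_cols))
# print(list(duplicate_cols), df.shape)
```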
%% Cell type:code id:566564c1-4661-4f07-9a60-5ac0ab836bcf tags:
``` python
# Pearson correlation heatmap before feature drop
plt.figure(figsize=(70, 70))
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True)
plt.show()
```
%% Cell type:code id:8cd75b4a-0bde-455c-bca2-0230d0e582b4 tags:
``` python
%%time
# Dropping highly correlated columns (|r| >= 0.95), keeping one column per correlated pair
correlated_col = set()
is_correlated = [True] * len(corr.columns)
threshold = 0.95
for i in range(len(corr.columns)):
    if is_correlated[i]:
        for j in range(i):
            if (np.abs(corr.iloc[i, j]) >= threshold) and is_correlated[j]:
                colname = corr.columns[j]
                is_correlated[j] = False
                correlated_col.add(colname)
print(correlated_col)
print(len(correlated_col))
df.drop(correlated_col, axis=1, inplace=True)
print(df.shape)
```
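%% Cell type:markdown tags:
The same |r| >= 0.95 filter can be written without the nested loop by masking the upper triangle of the absolute correlation matrix. Note that it may keep a different member of each correlated pair than the loop above, so it is shown commented out, purely as a sketch.
%% Cell type:code tags:
``` python
# Alternative sketch (commented out): vectorised |r| >= threshold filter using
# an upper-triangle mask; may keep a different member of each correlated pair.
# upper = corr.abs().where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
# to_drop = [c for c in upper.columns if (upper[c] >= threshold).any()]
# print(to_drop, len(to_drop))
```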
%% Cell type:code id:e425643d-d68d-4a9c-b13f-7aaebed6342e tags:
``` python
# %%time
# # pearson correlation heatmap after feature drop
# plt.figure(figsize=(70, 70))
# corr = df.corr(numeric_only=True)
# sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True) # annot=True
# plt.show()
```
%% Cell type:code id:41000a4d-fb2a-4067-8402-20dd92a6cdd4 tags:
``` python
# Splitting the data again after feature engineering
X_train = df.drop('Label', axis=1)
y_train = df['Label']
# Ensure the test set has the same columns as the train set
X_test = X_test[X_train.columns]
```
%% Cell type:code id:8e1f3631-9db4-40b0-acb5-50a37e6302cb tags:
``` python
# Label encoding of the target, then training the Random Forest
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
# Dictionary mapping original labels to encoded values
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print(label_mapping)
# Reuse the encoder fitted on the training labels so the test labels get the same encoding
y_test = label_encoder.transform(y_test)
# Initialize the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit the model
clf.fit(X_train, y_train)
```
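%% Cell type:markdown tags:
As an optional diagnostic (not part of the original notebook), the fitted forest's impurity-based importances show which of the remaining features drive its predictions.
%% Cell type:code tags:
``` python
# Optional diagnostic: top 20 features by impurity-based importance of the
# fitted Random Forest.
importances = pd.Series(clf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=True).tail(20).plot(kind='barh', figsize=(8, 6))
plt.xlabel('Importance')
plt.title('Random Forest feature importances (top 20)')
plt.show()
```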
%% Cell type:code id:4c2daab3-f7b1-41a3-8d55-3b340ffa0340 tags:
``` python
#predict with RF
y_pred = clf.predict(X_test)
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
```
%% Cell type:code id:22a5b3e0-713c-4fd4-b94a-7dc6848a6e61 tags:
``` python
# Training XGBoost
model = XGBClassifier(max_depth=5, objective='multi:softmax', n_estimators=30, num_class=11,
                      subsample=0.5, max_delta_step=1, eval_metric=["merror", "mlogloss"])
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)
```
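%% Cell type:markdown tags:
Optionally, the trained models can be persisted so the evaluation cells below can be rerun without retraining. The file names are arbitrary placeholders; `joblib` ships with scikit-learn.
%% Cell type:code tags:
``` python
# Optional: persist both trained classifiers (file names are placeholders).
import joblib
joblib.dump(clf, 'rf_classifier.joblib')   # scikit-learn Random Forest
model.save_model('xgb_classifier.json')    # native XGBoost format
```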
%% Cell type:code id:a8c102b2-9785-4edb-bc98-cb8b550778f4 tags:
``` python
# Plot XGBoost training curves
results = model.evals_result()
epochs = len(results['validation_0']['merror'])
x_axis = range(0, epochs)
# Plot log loss
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')
ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')
ax.legend()
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss')
plt.show()
# Plot classification error
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['merror'], label='Train')
ax.plot(x_axis, results['validation_1']['merror'], label='Test')
ax.legend()
plt.ylabel('Classification Error')
plt.title('XGBoost Classification Error')
plt.show()
```
%% Cell type:code id:58d428ea-e03a-4d48-a2ae-c698b6c1a5b6 tags:
``` python
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
```
%% Cell type:code id:ce8b94bd-9229-44c6-9779-53862fb23949 tags:
``` python
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
```
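%% Cell type:markdown tags:
For readability, the raw confusion matrix above can also be drawn as a heatmap with the original class names on the axes; this cell is an added illustration, not part of the original notebook.
%% Cell type:code tags:
``` python
# Illustrative addition: confusion matrix of the XGBoost test predictions as a
# labelled heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('XGBoost confusion matrix (test set)')
plt.show()
```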
%% Cell type:code id:d64d7000-f95a-4b28-b140-b026adfbba15 tags:
``` python
# Evaluating the trained classifiers on the real dataset and on each generated dataset:
# uncomment one line to load it as `synth`, then run the remaining cells
#synth = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data
#synth = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')
#synth = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')
#synth = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')
#synth = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')
#synth = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')
#synth = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')
X_synth = synth[X_train.columns]
y_synth = synth['Label']
X_synth.shape, X_train.shape, y_synth.shape, y_train.shape
```
%% Cell type:code id:9e867cb3-d2b5-4d8d-aa47-722e7993100a tags:
``` python
# Label encoding with the mapping already learned from the training labels
print(y_synth.unique())
y_synth = label_encoder.transform(y_synth)
print(label_mapping)
```
%% Cell type:code id:1f0bd457-845b-4426-9499-c5f18e192952 tags:
``` python
# Random Forest evaluated on the loaded dataset
y_pred = clf.predict(X_synth)
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_synth, y_pred))
print("\nClassification Report:\n", classification_report(y_synth, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_synth, y_pred))
```
%% Cell type:code id:67e5681c-f6ad-41fa-89e5-3a25dcdb09b7 tags:
``` python
#XGBoost
y_pred = model.predict(X_synth)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_synth, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_synth, y_pred))
print("\nClassification Report:\n", classification_report(y_synth, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_synth, y_pred))
```