Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MasterThesis
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Sayed Saeedi
MasterThesis
Commits
d737673d
Commit
d737673d
authored
1 year ago
by
Sayed Saeedi
Browse files
Options
Downloads
Patches
Plain Diff
Code of classifiers RF and XGB
parent
89efa2a9
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Classifiers/Classifiers.ipynb
+410
-0
410 additions, 0 deletions
Classifiers/Classifiers.ipynb
with
410 additions
and
0 deletions
Classifiers/Classifiers.ipynb
0 → 100644
+
410
−
0
View file @
d737673d
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "7d727b21-135b-40f7-89bd-559cd7b2d681",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"%matplotlib inline\n",
"from datetime import datetime\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"from xgboost import XGBClassifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40b9b07a-2b08-4022-9553-d99fd4140647",
"metadata": {},
"outputs": [],
"source": [
"# Uncomment exactly ONE of the loaders below so that `df` is defined\n",
"# before the clean-up steps run.\n",
"\n",
"#df = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data\n",
"#df = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')\n",
"#df = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')\n",
"#df = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')\n",
"#df = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')\n",
"#df = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')\n",
"#df = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')\n",
"\n",
"# Remove exact duplicate rows, then discard the 'Timestamp' column.\n",
"df.drop_duplicates(inplace=True)\n",
"df.drop(columns='Timestamp', inplace=True)\n",
"\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc1ec957-4b17-4f7b-b2a4-04ab125d7f57",
"metadata": {},
"outputs": [],
"source": [
"#splitting dataset\n",
"\n",
"X= df.drop('Label', axis=1)\n",
"y = df['Label']\n",
"\n",
"# Split the data into training and test sets with stratification\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
"X_train.shape, X_test.shape, y_train.shape, y_test.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4b3769f-d75f-457e-b8c5-ea8cdc7d2190",
"metadata": {},
"outputs": [],
"source": [
"# combining training data for preprocessing\n",
"df= pd.concat([X_train, y_train], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdc2739f-2924-4df4-841c-589f9f79b918",
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"#dropping constant columns\n",
"\n",
"variances = df.var(numeric_only=True)\n",
"constant_columns = variances[variances == 0].index\n",
"df = df.drop(constant_columns, axis=1)\n",
"\n",
"print(constant_columns)\n",
"print (df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b772bc3-ed7a-4497-bfb5-03561653afc3",
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"#dropping duplicate columns\n",
"duplicates = set()\n",
"for i in range(0, len(df.columns)):\n",
" col1 = df.columns[i]\n",
" for j in range(i+1, len(df.columns)):\n",
" col2 = df.columns[j]\n",
" if(df[col1].equals(df[col2])):\n",
" duplicates.add(col2)\n",
"\n",
"\n",
"print (duplicates)\n",
"df.drop(duplicates, axis=1, inplace=True)\n",
"print (df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "566564c1-4661-4f07-9a60-5ac0ab836bcf",
"metadata": {},
"outputs": [],
"source": [
"# # pearson correlation heatmap before feature drop\n",
"\n",
"plt.figure(figsize=(70, 70))\n",
"corr = df.corr(numeric_only=True)\n",
"sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True) # annot=True\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8cd75b4a-0bde-455c-bca2-0230d0e582b4",
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"#dropping highly correlated columns\n",
"correlated_col = set()\n",
"is_correlated = [True] * len(corr.columns)\n",
"threshold = 0.95\n",
"for i in range (len(corr.columns)):\n",
" if(is_correlated[i]):\n",
" for j in range(i):\n",
" if (np.abs(corr.iloc[i, j]) >= threshold) and (is_correlated[j]):\n",
" colname = corr.columns[j]\n",
" is_correlated[j]=False\n",
" correlated_col.add(colname)\n",
"\n",
"print(correlated_col)\n",
"print(len(correlated_col))\n",
"\n",
"df.drop(correlated_col, axis=1, inplace=True)\n",
"print (df.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e425643d-d68d-4a9c-b13f-7aaebed6342e",
"metadata": {},
"outputs": [],
"source": [
"# %%time\n",
"# # pearson correlation heatmap after feature drop\n",
"\n",
"\n",
"# plt.figure(figsize=(70, 70))\n",
"# corr = df.corr(numeric_only=True)\n",
"# sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True) # annot=True\n",
"# plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41000a4d-fb2a-4067-8402-20dd92a6cdd4",
"metadata": {},
"outputs": [],
"source": [
"#splitting data after feature engineering \n",
"X_train= df.drop('Label', axis=1)\n",
"y_train = df['Label']\n",
"\n",
"\n",
"#ensure test set also has similar columns as train set\n",
"X_test = X_test[X_train.columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e1f3631-9db4-40b0-acb5-50a37e6302cb",
"metadata": {},
"outputs": [],
"source": [
"# Label encoding and Random Forest training.\n",
"\n",
"# Fit the encoder on the training labels only; the same fitted mapping is\n",
"# then applied to the test labels so both partitions share integer codes.\n",
"label_encoder = LabelEncoder()\n",
"y_train = label_encoder.fit_transform(y_train)\n",
"# Create a dictionary mapping original labels to encoded values\n",
"label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))\n",
"print(label_mapping)\n",
"\n",
"# transform() (not fit_transform()) re-uses the training mapping; it raises\n",
"# ValueError if the test set contains a label unseen during training.\n",
"y_test = label_encoder.transform(y_test)\n",
"\n",
"# Kept so that the second printed mapping of the original cell still exists;\n",
"# it is now always identical to `label_mapping`.\n",
"label_mapping1 = dict(label_mapping)\n",
"print(label_mapping1)\n",
"\n",
"# Initialize the RandomForestClassifier\n",
"clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"\n",
"# Fit the model\n",
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c2daab3-f7b1-41a3-8d55-3b340ffa0340",
"metadata": {},
"outputs": [],
"source": [
"#predict with RF\n",
"\n",
"y_pred = clf.predict(X_test)\n",
"\n",
"# Evaluate the classifier\n",
"print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
"print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n",
"print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22a5b3e0-713c-4fd4-b94a-7dc6848a6e61",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Training XGBoost.\n",
"# `num_classes=11` was removed: it is not an XGBoost parameter (the native\n",
"# name is `num_class`), and the scikit-learn wrapper derives the number of\n",
"# classes from the labels passed to fit().\n",
"model = XGBClassifier(\n",
"    max_depth=5,\n",
"    objective='multi:softmax',\n",
"    n_estimators=30,\n",
"    subsample=0.5,\n",
"    max_delta_step=1,\n",
"    eval_metric=[\"merror\", \"mlogloss\"],\n",
")\n",
"\n",
"# Both metrics are recorded on the train and test partitions each round.\n",
"eval_set = [(X_train, y_train), (X_test, y_test)]\n",
"model.fit(X_train, y_train, eval_set=eval_set, verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8c102b2-9785-4edb-bc98-cb8b550778f4",
"metadata": {},
"outputs": [],
"source": [
"# Plot the XGBoost evaluation curves recorded during fit().\n",
"# The notebook imports matplotlib.pyplot as `plt`; the original cell used the\n",
"# undefined name `pyplot` and failed with NameError.\n",
"\n",
"results = model.evals_result()\n",
"epochs = len(results['validation_0']['merror'])\n",
"x_axis = range(0, epochs)\n",
"\n",
"# validation_0 / validation_1 follow the order of `eval_set` given to fit():\n",
"# first the training partition, then the test partition.\n",
"for metric, ylabel, title in (\n",
"    ('mlogloss', 'Log Loss', 'XGBoost Log Loss'),\n",
"    ('merror', 'Classification Error', 'XGBoost Classification Error'),\n",
"):\n",
"    fig, ax = plt.subplots()\n",
"    ax.plot(x_axis, results['validation_0'][metric], label='Train')\n",
"    ax.plot(x_axis, results['validation_1'][metric], label='Test')\n",
"    ax.legend()\n",
"    plt.ylabel(ylabel)\n",
"    plt.title(title)\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58d428ea-e03a-4d48-a2ae-c698b6c1a5b6",
"metadata": {},
"outputs": [],
"source": [
"# make predictions for test data\n",
"y_pred = model.predict(X_test)\n",
"predictions = [round(value) for value in y_pred]\n",
"# evaluate predictions\n",
"accuracy = accuracy_score(y_test, predictions)\n",
"print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce8b94bd-9229-44c6-9779-53862fb23949",
"metadata": {},
"outputs": [],
"source": [
"# Evaluate the classifier\n",
"print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
"print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n",
"print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d64d7000-f95a-4b28-b140-b026adfbba15",
"metadata": {},
"outputs": [],
"source": [
"# Uncomment exactly ONE loader below so that `synth` is defined, then run the\n",
"# remaining cells to evaluate the trained classifiers on that dataset\n",
"# (the real data or one of the generated datasets).\n",
"\n",
"#synth = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data\n",
"#synth = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')\n",
"#synth = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')\n",
"#synth = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')\n",
"#synth = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')\n",
"#synth = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')\n",
"#synth = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')\n",
"\n",
"# Keep only the feature columns the classifiers were trained on.\n",
"X_synth = synth[X_train.columns]\n",
"y_synth = synth['Label']\n",
"X_synth.shape, X_train.shape, y_synth.shape, y_train.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e867cb3-d2b5-4d8d-aa47-722e7993100a",
"metadata": {},
"outputs": [],
"source": [
"# Label encoding of the evaluation labels.\n",
"# Show the raw label values first (the original bare `y_synth.unique()`\n",
"# produced no output because it was not the last expression of the cell).\n",
"print(y_synth.unique())\n",
"\n",
"# transform() re-uses the already fitted mapping so the integer codes match\n",
"# the ones the classifiers were trained on; re-fitting here could renumber\n",
"# the classes whenever the label set differs. Raises ValueError on a label\n",
"# the encoder has not seen.\n",
"y_synth = label_encoder.transform(y_synth)\n",
"# Create a dictionary mapping original labels to encoded values\n",
"label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))\n",
"print(label_mapping)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f0bd457-845b-4426-9499-c5f18e192952",
"metadata": {},
"outputs": [],
"source": [
"#Random Forest\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"y_pred = clf.predict(X_synth)\n",
"\n",
"# Evaluate the classifier\n",
"print(\"Accuracy:\", accuracy_score(y_synth, y_pred))\n",
"print(\"\\nClassification Report:\\n\", classification_report(y_synth, y_pred))\n",
"print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_synth, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67e5681c-f6ad-41fa-89e5-3a25dcdb09b7",
"metadata": {},
"outputs": [],
"source": [
"#XGBoost\n",
"y_pred = model.predict(X_synth)\n",
"predictions = [round(value) for value in y_pred]\n",
"# evaluate predictions\n",
"accuracy = accuracy_score(y_synth, predictions)\n",
"print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))\n",
"# Evaluate the classifier\n",
"print(\"Accuracy:\", accuracy_score(y_synth, y_pred))\n",
"print(\"\\nClassification Report:\\n\", classification_report(y_synth, y_pred))\n",
"print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_synth, y_pred))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:7d727b21-135b-40f7-89bd-559cd7b2d681 tags:
```
python
import
numpy
as
np
import
pandas
as
pd
import
matplotlib.pyplot
as
plt
import
seaborn
as
sns
%
matplotlib
inline
from
datetime
import
datetime
from
sklearn.model_selection
import
train_test_split
from
sklearn.ensemble
import
RandomForestClassifier
from
sklearn.preprocessing
import
LabelEncoder
from
sklearn.metrics
import
accuracy_score
,
classification_report
,
confusion_matrix
from
xgboost
import
XGBClassifier
```
%% Cell type:code id:40b9b07a-2b08-4022-9553-d99fd4140647 tags:
```
python
# Uncomment exactly ONE of the loaders below so that `df` is defined
# before the clean-up steps run.

#df = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data
#df = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')
#df = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')
#df = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')
#df = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')
#df = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')
#df = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')

# Remove exact duplicate rows, then discard the 'Timestamp' column.
df.drop_duplicates(inplace=True)
df.drop(columns='Timestamp', inplace=True)

df.shape
```
%% Cell type:code id:fc1ec957-4b17-4f7b-b2a4-04ab125d7f57 tags:
```
python
# Separate the target column from the feature columns.
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows as a test set, stratified on the labels, with a
# fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
```
%% Cell type:code id:b4b3769f-d75f-457e-b8c5-ea8cdc7d2190 tags:
```
python
# Re-attach the labels to the training features; from here on `df` holds the
# training partition only, which is what the preprocessing steps work on.
df = pd.concat(objs=[X_train, y_train], axis=1)
```
%% Cell type:code id:bdc2739f-2924-4df4-841c-589f9f79b918 tags:
```
python
%%
time
#droping constant columns
variances
=
df
.
var
(
numeric_only
=
True
)
constant_columns
=
variances
[
variances
==
0
].
index
df
=
df
.
drop
(
constant_columns
,
axis
=
1
)
print
(
constant_columns
)
print
(
df
.
shape
)
```
%% Cell type:code id:3b772bc3-ed7a-4497-bfb5-03561653afc3 tags:
```
python
%%
time
#droping duplicate columns
duplicates
=
set
()
for
i
in
range
(
0
,
len
(
df
.
columns
)):
col1
=
df
.
columns
[
i
]
for
j
in
range
(
i
+
1
,
len
(
df
.
columns
)):
col2
=
df
.
columns
[
j
]
if
(
df
[
col1
].
equals
(
df
[
col2
])):
duplicates
.
add
(
col2
)
print
(
duplicates
)
df
.
drop
(
duplicates
,
axis
=
1
,
inplace
=
True
)
print
(
df
.
shape
)
```
%% Cell type:code id:566564c1-4661-4f07-9a60-5ac0ab836bcf tags:
```
python
# Pearson correlation heatmap before the correlated-feature drop.
# `corr` is read again by the cell that drops highly correlated columns.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(70, 70))
sns.heatmap(
    corr,
    annot=True,  # write each coefficient into its cell
    cmap='RdBu',
    vmin=-1,
    vmax=1,
    square=True,
)
plt.show()
```
%% Cell type:code id:8cd75b4a-0bde-455c-bca2-0230d0e582b4 tags:
```
python
%%
time
#droping highly correlated columns
correlated_col
=
set
()
is_correlated
=
[
True
]
*
len
(
corr
.
columns
)
threshold
=
0.95
for
i
in
range
(
len
(
corr
.
columns
)):
if
(
is_correlated
[
i
]):
for
j
in
range
(
i
):
if
(
np
.
abs
(
corr
.
iloc
[
i
,
j
])
>=
threshold
)
and
(
is_correlated
[
j
]):
colname
=
corr
.
columns
[
j
]
is_correlated
[
j
]
=
False
correlated_col
.
add
(
colname
)
print
(
correlated_col
)
print
(
len
(
correlated_col
))
df
.
drop
(
correlated_col
,
axis
=
1
,
inplace
=
True
)
print
(
df
.
shape
)
```
%% Cell type:code id:e425643d-d68d-4a9c-b13f-7aaebed6342e tags:
```
python
# %%time
# # pearson correlation heatmap after feature drop
# plt.figure(figsize=(70, 70))
# corr = df.corr(numeric_only=True)
# sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True) # annot=True
# plt.show()
```
%% Cell type:code id:41000a4d-fb2a-4067-8402-20dd92a6cdd4 tags:
```
python
# Split the reduced training frame back into features and labels.
y_train = df['Label']
X_train = df.drop(columns='Label')

# Restrict the test set to the columns that remain in the training set.
X_test = X_test.loc[:, X_train.columns]
```
%% Cell type:code id:8e1f3631-9db4-40b0-acb5-50a37e6302cb tags:
```
python
# Label encoding and Random Forest training.

# Fit the encoder on the training labels only; the same fitted mapping is
# then applied to the test labels so both partitions share integer codes.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# Dictionary mapping original labels to encoded values.
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print(label_mapping)

# transform() (not fit_transform()) re-uses the training mapping; it raises
# ValueError if the test set contains a label unseen during training.
y_test = label_encoder.transform(y_test)

# Kept so that the second printed mapping of the original cell still exists;
# it is now always identical to `label_mapping`.
label_mapping1 = dict(label_mapping)
print(label_mapping1)

# Initialize the RandomForestClassifier with a fixed seed.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training partition.
clf.fit(X_train, y_train)
```
%% Cell type:code id:4c2daab3-f7b1-41a3-8d55-3b340ffa0340 tags:
```
python
# Predict with the Random Forest on the held-out test features.
y_pred = clf.predict(X_test)

# Evaluate the classifier: print each metric under its heading, in order.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))
```
%% Cell type:code id:22a5b3e0-713c-4fd4-b94a-7dc6848a6e61 tags:
```
python
# Training XGBoost.
# `num_classes=11` was removed: it is not an XGBoost parameter (the native
# name is `num_class`), and the scikit-learn wrapper derives the number of
# classes from the labels passed to fit().
model = XGBClassifier(
    max_depth=5,
    objective='multi:softmax',
    n_estimators=30,
    subsample=0.5,
    max_delta_step=1,
    eval_metric=["merror", "mlogloss"],
)

# Both metrics are recorded on the train and test partitions each round.
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)
```
%% Cell type:code id:a8c102b2-9785-4edb-bc98-cb8b550778f4 tags:
```
python
# Plot the XGBoost evaluation curves recorded during fit().
# The notebook imports matplotlib.pyplot as `plt`; the original cell used the
# undefined name `pyplot` and failed with NameError.
results = model.evals_result()
epochs = len(results['validation_0']['merror'])
x_axis = range(0, epochs)

# validation_0 / validation_1 follow the order of `eval_set` given to fit():
# first the training partition, then the test partition.
for metric, ylabel, title in (
    ('mlogloss', 'Log Loss', 'XGBoost Log Loss'),
    ('merror', 'Classification Error', 'XGBoost Classification Error'),
):
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0'][metric], label='Train')
    ax.plot(x_axis, results['validation_1'][metric], label='Test')
    ax.legend()
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
```
%% Cell type:code id:58d428ea-e03a-4d48-a2ae-c698b6c1a5b6 tags:
```
python
# Make predictions for the test data and round each one to an integer.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Evaluate the rounded predictions and report accuracy as a percentage.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
```
%% Cell type:code id:ce8b94bd-9229-44c6-9779-53862fb23949 tags:
```
python
# Evaluate the classifier on the test partition using the most recent
# `y_pred`; each metric is computed and printed in turn.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)

test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", test_confusion)
```
%% Cell type:code id:d64d7000-f95a-4b28-b140-b026adfbba15 tags:
```
python
# Uncomment exactly ONE loader below so that `synth` is defined, then run the
# remaining cells to evaluate the trained classifiers on that dataset
# (the real data or one of the generated datasets).

#synth = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data
#synth = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')
#synth = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')
#synth = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')
#synth = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')
#synth = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')
#synth = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')

# Keep only the feature columns the classifiers were trained on.
X_synth = synth[X_train.columns]
y_synth = synth['Label']
X_synth.shape, X_train.shape, y_synth.shape, y_train.shape
```
%% Cell type:code id:9e867cb3-d2b5-4d8d-aa47-722e7993100a tags:
```
python
# Label encoding of the evaluation labels.
# Show the raw label values first (the original bare `y_synth.unique()`
# produced no output because it was not the last expression of the cell).
print(y_synth.unique())

# transform() re-uses the already fitted mapping so the integer codes match
# the ones the classifiers were trained on; re-fitting here could renumber
# the classes whenever the label set differs. Raises ValueError on a label
# the encoder has not seen.
y_synth = label_encoder.transform(y_synth)

# Dictionary mapping original labels to encoded values.
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print(label_mapping)
```
%% Cell type:code id:1f0bd457-845b-4426-9499-c5f18e192952 tags:
```
python
# Random Forest evaluated on the dataset loaded into `synth`.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

y_pred = clf.predict(X_synth)

# Evaluate the classifier: print each metric under its heading, in order.
metric_by_heading = {
    "Accuracy:": accuracy_score,
    "\nClassification Report:\n": classification_report,
    "\nConfusion Matrix:\n": confusion_matrix,
}
for heading, metric in metric_by_heading.items():
    print(heading, metric(y_synth, y_pred))
```
%% Cell type:code id:67e5681c-f6ad-41fa-89e5-3a25dcdb09b7 tags:
```
python
# XGBoost evaluated on the dataset loaded into `synth`.
y_pred = model.predict(X_synth)
predictions = list(map(round, y_pred))

# Accuracy of the rounded predictions, reported as a percentage.
accuracy = accuracy_score(y_synth, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Evaluate the classifier on the raw predictions, one metric after another.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_synth, y_pred))
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment