Commit d737673d authored by Sayed Saeedi

Code for the RF and XGB classifiers

parent 89efa2a9
%% Cell type:code id:7d727b21-135b-40f7-89bd-559cd7b2d681 tags:
``` python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
```
%% Cell type:code id:40b9b07a-2b08-4022-9553-d99fd4140647 tags:
``` python
# Uncomment whichever dataset (real or synthetic) is to be used for training
#df = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data
#df = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')
#df = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')
#df = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')
#df = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')
#df = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')
#df = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')
df.drop_duplicates(inplace=True)
df.drop(columns='Timestamp', inplace=True)
df.shape
```
%% Cell type:code id:fc1ec957-4b17-4f7b-b2a4-04ab125d7f57 tags:
``` python
# Splitting the dataset into features and target
X = df.drop('Label', axis=1)
y = df['Label']
# Split the data into training and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
```
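%% Cell type:markdown tags:
A quick sanity check, added here as an illustration: with `stratify=y` the class proportions of `Label` should be (almost) identical in the training and test partitions.
%% Cell type:code tags:
``` python
# Illustrative check (not in the original notebook): compare class proportions
# in the stratified train and test splits.
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))
```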
%% Cell type:code id:b4b3769f-d75f-457e-b8c5-ea8cdc7d2190 tags:
``` python
# Recombining the training data for preprocessing
df = pd.concat([X_train, y_train], axis=1)
```
%% Cell type:code id:bdc2739f-2924-4df4-841c-589f9f79b918 tags:
``` python
%%time
# Dropping constant (zero-variance) columns
variances = df.var(numeric_only=True)
constant_columns = variances[variances == 0].index
df = df.drop(constant_columns, axis=1)
print(constant_columns)
print(df.shape)
```
%% Cell type:code id:3b772bc3-ed7a-4497-bfb5-03561653afc3 tags:
``` python
%%time
# Dropping duplicate columns
duplicates = set()
for i in range(0, len(df.columns)):
    col1 = df.columns[i]
    for j in range(i + 1, len(df.columns)):
        col2 = df.columns[j]
        if df[col1].equals(df[col2]):
            duplicates.add(col2)
print(duplicates)
df.drop(duplicates, axis=1, inplace=True)
print(df.shape)
```
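%% Cell type:markdown tags:
As an aside, pandas can flag duplicate columns directly by hashing the transposed frame. The sketch below is an alternative to the pairwise loop above and is left commented out because the duplicates have already been dropped.
%% Cell type:code tags:
``` python
# Alternative sketch: DataFrame.duplicated() on the transpose marks columns
# whose values repeat an earlier column; same result as the loop above,
# typically faster on wide frames.
# duplicate_cols = df.columns[df.T.duplicated().values]
# df = df.drop(columns=list(duplicate_cols))
# print(list(duplicate_cols), df.shape)
```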
%% Cell type:code id:566564c1-4661-4f07-9a60-5ac0ab836bcf tags:
``` python
# Pearson correlation heatmap before feature drop
plt.figure(figsize=(70, 70))
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True)
plt.show()
```
%% Cell type:code id:8cd75b4a-0bde-455c-bca2-0230d0e582b4 tags:
``` python
%%time
# Dropping highly correlated columns (|r| >= 0.95), keeping one column per correlated pair
correlated_col = set()
is_correlated = [True] * len(corr.columns)
threshold = 0.95
for i in range(len(corr.columns)):
    if is_correlated[i]:
        for j in range(i):
            if (np.abs(corr.iloc[i, j]) >= threshold) and is_correlated[j]:
                colname = corr.columns[j]
                is_correlated[j] = False
                correlated_col.add(colname)
print(correlated_col)
print(len(correlated_col))
df.drop(correlated_col, axis=1, inplace=True)
print(df.shape)
```
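%% Cell type:markdown tags:
The same |r| >= 0.95 filter can be written without the nested loop by masking the upper triangle of the absolute correlation matrix. Note that it may keep a different member of each correlated pair than the loop above, so it is shown commented out, purely as a sketch.
%% Cell type:code tags:
``` python
# Alternative sketch (commented out): vectorised |r| >= threshold filter using
# an upper-triangle mask; may keep a different member of each correlated pair.
# upper = corr.abs().where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
# to_drop = [c for c in upper.columns if (upper[c] >= threshold).any()]
# print(to_drop, len(to_drop))
```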
%% Cell type:code id:e425643d-d68d-4a9c-b13f-7aaebed6342e tags:
``` python
# %%time
# # pearson correlation heatmap after feature drop
# plt.figure(figsize=(70, 70))
# corr = df.corr(numeric_only=True)
# sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True) # annot=True
# plt.show()
```
%% Cell type:code id:41000a4d-fb2a-4067-8402-20dd92a6cdd4 tags:
``` python
# Splitting the data again after feature engineering
X_train = df.drop('Label', axis=1)
y_train = df['Label']
# Ensure the test set has the same columns as the train set
X_test = X_test[X_train.columns]
```
%% Cell type:code id:8e1f3631-9db4-40b0-acb5-50a37e6302cb tags:
``` python
# Label encoding of the target, then training the Random Forest
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
# Dictionary mapping original labels to encoded values
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print(label_mapping)
# Reuse the encoder fitted on the training labels so the test labels get the same encoding
y_test = label_encoder.transform(y_test)
# Initialize the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit the model
clf.fit(X_train, y_train)
```
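%% Cell type:markdown tags:
As an optional diagnostic (not part of the original notebook), the fitted forest's impurity-based importances show which of the remaining features drive its predictions.
%% Cell type:code tags:
``` python
# Optional diagnostic: top 20 features by impurity-based importance of the
# fitted Random Forest.
importances = pd.Series(clf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=True).tail(20).plot(kind='barh', figsize=(8, 6))
plt.xlabel('Importance')
plt.title('Random Forest feature importances (top 20)')
plt.show()
```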
%% Cell type:code id:4c2daab3-f7b1-41a3-8d55-3b340ffa0340 tags:
``` python
#predict with RF
y_pred = clf.predict(X_test)
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
```
%% Cell type:code id:22a5b3e0-713c-4fd4-b94a-7dc6848a6e61 tags:
``` python
# Training XGBoost
model = XGBClassifier(max_depth=5, objective='multi:softmax', n_estimators=30, num_class=11,
                      subsample=0.5, max_delta_step=1, eval_metric=["merror", "mlogloss"])
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)
```
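%% Cell type:markdown tags:
Optionally, the trained models can be persisted so the evaluation cells below can be rerun without retraining. The file names are arbitrary placeholders; `joblib` ships with scikit-learn.
%% Cell type:code tags:
``` python
# Optional: persist both trained classifiers (file names are placeholders).
import joblib
joblib.dump(clf, 'rf_classifier.joblib')   # scikit-learn Random Forest
model.save_model('xgb_classifier.json')    # native XGBoost format
```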
%% Cell type:code id:a8c102b2-9785-4edb-bc98-cb8b550778f4 tags:
``` python
# Plot XGBoost training curves
results = model.evals_result()
epochs = len(results['validation_0']['merror'])
x_axis = range(0, epochs)
# Plot log loss
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')
ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')
ax.legend()
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss')
plt.show()
# Plot classification error
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['merror'], label='Train')
ax.plot(x_axis, results['validation_1']['merror'], label='Test')
ax.legend()
plt.ylabel('Classification Error')
plt.title('XGBoost Classification Error')
plt.show()
```
%% Cell type:code id:58d428ea-e03a-4d48-a2ae-c698b6c1a5b6 tags:
``` python
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
```
%% Cell type:code id:ce8b94bd-9229-44c6-9779-53862fb23949 tags:
``` python
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
```
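%% Cell type:markdown tags:
For readability, the raw confusion matrix above can also be drawn as a heatmap with the original class names on the axes; this cell is an added illustration, not part of the original notebook.
%% Cell type:code tags:
``` python
# Illustrative addition: confusion matrix of the XGBoost test predictions as a
# labelled heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('XGBoost confusion matrix (test set)')
plt.show()
```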
%% Cell type:code id:d64d7000-f95a-4b28-b140-b026adfbba15 tags:
``` python
# Evaluating the trained classifiers on the real dataset and on each generated dataset:
# uncomment one line to load it as `synth`, then run the remaining cells
#synth = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data
#synth = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')
#synth = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')
#synth = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')
#synth = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')
#synth = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')
#synth = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')
X_synth = synth[X_train.columns]
y_synth = synth['Label']
X_synth.shape, X_train.shape, y_synth.shape, y_train.shape
```
%% Cell type:code id:9e867cb3-d2b5-4d8d-aa47-722e7993100a tags:
``` python
# Label encoding with the mapping already learned from the training labels
print(y_synth.unique())
y_synth = label_encoder.transform(y_synth)
print(label_mapping)
```
%% Cell type:code id:1f0bd457-845b-4426-9499-c5f18e192952 tags:
``` python
# Random Forest evaluated on the loaded dataset
y_pred = clf.predict(X_synth)
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_synth, y_pred))
print("\nClassification Report:\n", classification_report(y_synth, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_synth, y_pred))
```
%% Cell type:code id:67e5681c-f6ad-41fa-89e5-3a25dcdb09b7 tags:
``` python
#XGBoost
y_pred = model.predict(X_synth)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_synth, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Evaluate the classifier
print("Accuracy:", accuracy_score(y_synth, y_pred))
print("\nClassification Report:\n", classification_report(y_synth, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_synth, y_pred))
```