From d737673dcfd5a2b878189244c42652a593dc56e4 Mon Sep 17 00:00:00 2001
From: Sayed Saeedi <sayed.saeedi@stud.th-deg.de>
Date: Sat, 2 Mar 2024 02:09:35 +0100
Subject: [PATCH] Code of classifiers RF and XGB

---
 Classifiers/Classifiers.ipynb | 410 ++++++++++++++++++++++++++++++++++
 1 file changed, 410 insertions(+)
 create mode 100644 Classifiers/Classifiers.ipynb

diff --git a/Classifiers/Classifiers.ipynb b/Classifiers/Classifiers.ipynb
new file mode 100644
index 0000000..6a230fa
--- /dev/null
+++ b/Classifiers/Classifiers.ipynb
@@ -0,0 +1,410 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d727b21-135b-40f7-89bd-559cd7b2d681",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "%matplotlib inline\n",
+    "from datetime import datetime\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
+    "from xgboost import XGBClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40b9b07a-2b08-4022-9553-d99fd4140647",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Uncomment exactly one loader below to choose the dataset to train on.\n",
+    "# Every option must bind `df`, the name the rest of this notebook reads.\n",
+    "#df = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data\n",
+    "#df = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')\n",
+    "#df = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')\n",
+    "#df = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')\n",
+    "#df = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')\n",
+    "#df = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')\n",
+    "#df = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')\n",
+    "# Remove exact duplicate rows and the Timestamp column before splitting.\n",
+    "df.drop_duplicates(inplace=True)\n",
+    "df.drop(columns='Timestamp', inplace=True)\n",
+    "\n",
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc1ec957-4b17-4f7b-b2a4-04ab125d7f57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Separate the target column from the feature columns.\n",
+    "y = df['Label']\n",
+    "X = df.drop(columns='Label')\n",
+    "\n",
+    "# Stratified hold-out split (20% test) with a fixed seed.\n",
+    "split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
+    "X_train, X_test, y_train, y_test = split\n",
+    "tuple(part.shape for part in split)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4b3769f-d75f-457e-b8c5-ea8cdc7d2190",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rejoin the training features and labels into one frame for preprocessing.\n",
+    "df = pd.concat((X_train, y_train), axis='columns')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bdc2739f-2924-4df4-841c-589f9f79b918",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Drop numeric columns whose variance in the training frame is exactly zero.\n",
+    "\n",
+    "col_variance = df.var(numeric_only=True)\n",
+    "constant_columns = col_variance.loc[col_variance.eq(0)].index\n",
+    "df = df.drop(columns=constant_columns)\n",
+    "\n",
+    "print(constant_columns)\n",
+    "print(df.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b772bc3-ed7a-4497-bfb5-03561653afc3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Drop every column that Series.equals() reports as equal to an earlier column.\n",
+    "duplicates = set()\n",
+    "column_names = list(df.columns)\n",
+    "for pos, keeper in enumerate(column_names):\n",
+    "    for candidate in column_names[pos + 1:]:\n",
+    "        if df[keeper].equals(df[candidate]):\n",
+    "            duplicates.add(candidate)\n",
+    "\n",
+    "print(duplicates)\n",
+    "\n",
+    "df.drop(duplicates, axis=1, inplace=True)\n",
+    "\n",
+    "print(df.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "566564c1-4661-4f07-9a60-5ac0ab836bcf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Correlation heatmap of the numeric columns before the correlated-feature drop.\n",
+    "# `corr` is read again by the correlation-filter cell that follows.\n",
+    "corr = df.corr(numeric_only=True)\n",
+    "plt.figure(figsize=(70, 70))\n",
+    "sns.heatmap(data=corr, vmin=-1, vmax=1, cmap='RdBu', annot=True, square=True)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8cd75b4a-0bde-455c-bca2-0230d0e582b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# For each column pair with |corr| >= threshold, mark the earlier column for removal.\n",
+    "correlated_col = set()\n",
+    "n_cols = len(corr.columns)\n",
+    "kept = [True] * n_cols  # kept[k] turns False once column k is marked for removal\n",
+    "threshold = 0.95\n",
+    "for row in range(n_cols):\n",
+    "    if not kept[row]:\n",
+    "        continue\n",
+    "    for col in range(row):\n",
+    "        if kept[col] and np.abs(corr.iloc[row, col]) >= threshold:\n",
+    "            kept[col] = False\n",
+    "            correlated_col.add(corr.columns[col])\n",
+    "\n",
+    "print(correlated_col)\n",
+    "print(len(correlated_col))\n",
+    "df.drop(correlated_col, axis=1, inplace=True)\n",
+    "print(df.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e425643d-d68d-4a9c-b13f-7aaebed6342e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# # pearson correlation heatmap after feature drop\n",
+    "\n",
+    "\n",
+    "# plt.figure(figsize=(70, 70))\n",
+    "# corr = df.corr(numeric_only=True)\n",
+    "# sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True) # annot=True\n",
+    "# plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41000a4d-fb2a-4067-8402-20dd92a6cdd4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the filtered training frame back into features and target.\n",
+    "y_train = df['Label']\n",
+    "X_train = df.drop(columns='Label')\n",
+    "\n",
+    "# Restrict the test features to the columns kept for training,\n",
+    "# in the same order as the training features.\n",
+    "X_test = X_test.loc[:, X_train.columns]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e1f3631-9db4-40b0-acb5-50a37e6302cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Label encoding and Random Forest training.\n",
+    "# The encoder is fitted once, on the training labels, and then only applied.\n",
+    "\n",
+    "label_encoder = LabelEncoder()\n",
+    "y_train = label_encoder.fit_transform(y_train)\n",
+    "# Map each original label to its encoded integer.\n",
+    "label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))\n",
+    "print(label_mapping)\n",
+    "\n",
+    "# transform (not fit_transform): test labels must reuse the training mapping.\n",
+    "y_test = label_encoder.transform(y_test)\n",
+    "\n",
+    "# transform() leaves classes_ untouched, so this prints the same mapping as above.\n",
+    "label_mapping1 = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))\n",
+    "print(label_mapping1)\n",
+    "\n",
+    "# Random Forest with a fixed seed, fitted on the encoded training labels.\n",
+    "clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
+    "\n",
+    "clf.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c2daab3-f7b1-41a3-8d55-3b340ffa0340",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Score the Random Forest on the held-out test split.\n",
+    "y_pred = clf.predict(X_test)\n",
+    "rf_accuracy = accuracy_score(y_test, y_pred)\n",
+    "rf_report = classification_report(y_test, y_pred)\n",
+    "rf_confusion = confusion_matrix(y_test, y_pred)\n",
+    "print(\"Accuracy:\", rf_accuracy)\n",
+    "print(\"\\nClassification Report:\\n\", rf_report)\n",
+    "print(\"\\nConfusion Matrix:\\n\", rf_confusion)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22a5b3e0-713c-4fd4-b94a-7dc6848a6e61",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Train XGBoost. No class count is passed: XGBClassifier derives it from y_train.\n",
+    "model = XGBClassifier(max_depth=5, objective='multi:softmax', n_estimators=30, subsample=0.5, max_delta_step=1,\n",
+    "                     eval_metric=[\"merror\",\"mlogloss\"])\n",
+    "# Both splits are tracked per boosting round so the next cell can plot them.\n",
+    "eval_set = [(X_train, y_train), (X_test, y_test)]\n",
+    "model.fit(X_train, y_train,  eval_set=eval_set, verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8c102b2-9785-4edb-bc98-cb8b550778f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot the per-round XGBoost metrics recorded during fit (train vs. test).\n",
+    "# matplotlib.pyplot is imported as `plt` in the first cell of this notebook.\n",
+    "results = model.evals_result()\n",
+    "epochs = len(results['validation_0']['merror'])\n",
+    "x_axis = range(0, epochs)\n",
+    "# plot log loss\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')\n",
+    "ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')\n",
+    "ax.legend()\n",
+    "plt.ylabel('Log Loss')\n",
+    "plt.title('XGBoost Log Loss')\n",
+    "plt.show()\n",
+    "# plot classification error\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.plot(x_axis, results['validation_0']['merror'], label='Train')\n",
+    "ax.plot(x_axis, results['validation_1']['merror'], label='Test')\n",
+    "ax.legend()\n",
+    "plt.ylabel('Classification Error')\n",
+    "plt.title('XGBoost Classification Error')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58d428ea-e03a-4d48-a2ae-c698b6c1a5b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Predict the test split with the trained XGBoost model.\n",
+    "y_pred = model.predict(X_test)\n",
+    "predictions = list(map(round, y_pred))\n",
+    "# Report accuracy as a percentage.\n",
+    "accuracy = accuracy_score(y_test, predictions)\n",
+    "print(\"Accuracy: %.2f%%\" % (100.0 * accuracy))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce8b94bd-9229-44c6-9779-53862fb23949",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Accuracy, classification report and confusion matrix for the current y_pred on the test split.\n",
+    "print(\"Accuracy:\", accuracy_score(y_true=y_test, y_pred=y_pred))\n",
+    "print(\"\\nClassification Report:\\n\", classification_report(y_true=y_test, y_pred=y_pred))\n",
+    "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_true=y_test, y_pred=y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d64d7000-f95a-4b28-b140-b026adfbba15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Uncomment exactly one loader below, then run the remaining cells to score\n",
+    "# the trained classifiers on that dataset (real or generated).\n",
+    "# Every option binds `synth`, the name the code at the bottom of this cell reads;\n",
+    "# it deliberately does not rebind the training frame `df`.\n",
+    "#synth = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data\n",
+    "#synth = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')\n",
+    "#synth = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')\n",
+    "#synth = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')\n",
+    "#synth = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')\n",
+    "#synth = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')\n",
+    "#synth = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')\n",
+    "\n",
+    "X_synth = synth[X_train.columns]\n",
+    "y_synth = synth['Label']\n",
+    "X_synth.shape, X_train.shape, y_synth.shape, y_train.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e867cb3-d2b5-4d8d-aa47-722e7993100a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Encode the evaluation labels with the already-fitted encoder (raises ValueError on unseen labels).\n",
+    "print(y_synth.unique())\n",
+    "y_synth = label_encoder.transform(y_synth)  # refitting here could renumber the classes\n",
+    "# Map each original label to its encoded integer.\n",
+    "label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))\n",
+    "print(label_mapping)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f0bd457-845b-4426-9499-c5f18e192952",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Random Forest scored on the dataset held in X_synth / y_synth.\n",
+    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "y_pred = clf.predict(X_synth)\n",
+    "\n",
+    "print(\"Accuracy:\", accuracy_score(y_true=y_synth, y_pred=y_pred))\n",
+    "print(\"\\nClassification Report:\\n\", classification_report(y_true=y_synth, y_pred=y_pred))\n",
+    "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_true=y_synth, y_pred=y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67e5681c-f6ad-41fa-89e5-3a25dcdb09b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# XGBoost scored on the dataset held in X_synth / y_synth.\n",
+    "y_pred = model.predict(X_synth)\n",
+    "predictions = list(map(round, y_pred))\n",
+    "# Accuracy as a percentage, computed from the rounded predictions.\n",
+    "accuracy = accuracy_score(y_synth, predictions)\n",
+    "print(\"Accuracy: %.2f%%\" % (100.0 * accuracy))\n",
+    "# Accuracy, classification report and confusion matrix from the raw predictions.\n",
+    "print(\"Accuracy:\", accuracy_score(y_true=y_synth, y_pred=y_pred))\n",
+    "print(\"\\nClassification Report:\\n\", classification_report(y_true=y_synth, y_pred=y_pred))\n",
+    "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_true=y_synth, y_pred=y_pred))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab