From d737673dcfd5a2b878189244c42652a593dc56e4 Mon Sep 17 00:00:00 2001 From: Sayed Saeedi <sayed.saeedi@stud.th-deg.de> Date: Sat, 2 Mar 2024 02:09:35 +0100 Subject: [PATCH] Code of classifiers RF and XGB --- Classifiers/Classifiers.ipynb | 410 ++++++++++++++++++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 Classifiers/Classifiers.ipynb diff --git a/Classifiers/Classifiers.ipynb b/Classifiers/Classifiers.ipynb new file mode 100644 index 0000000..6a230fa --- /dev/null +++ b/Classifiers/Classifiers.ipynb @@ -0,0 +1,410 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "7d727b21-135b-40f7-89bd-559cd7b2d681", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "%matplotlib inline\n", + "from datetime import datetime\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", + "from xgboost import XGBClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40b9b07a-2b08-4022-9553-d99fd4140647", + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment which ever is to be trained\n", + "\n", + "#df = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data\n", + "#pf = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')\n", + "#df = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')\n", + "#df = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')\n", + "#df = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')\n", + "#df = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')\n", + "#df = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')\n", + "\n", + "df.drop_duplicates(inplace=True)\n", + "df.drop(columns='Timestamp', inplace=True)\n", + "\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc1ec957-4b17-4f7b-b2a4-04ab125d7f57", + "metadata": {}, + "outputs": [], + "source": [ + "#splitting dataset\n", + "\n", + "X= df.drop('Label', axis=1)\n", + "y = df['Label']\n", + "\n", + "# Split the data into training and test sets with stratification\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4b3769f-d75f-457e-b8c5-ea8cdc7d2190", + "metadata": {}, + "outputs": [], + "source": [ + "# combining training data for preprocessing\n", + "df= pd.concat([X_train, y_train], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdc2739f-2924-4df4-841c-589f9f79b918", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "#droping constant columns\n", + "\n", + "variances = df.var(numeric_only=True)\n", + "constant_columns = variances[variances == 0].index\n", + "df = df.drop(constant_columns, axis=1)\n", + "\n", + "print(constant_columns)\n", + "print (df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b772bc3-ed7a-4497-bfb5-03561653afc3", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "#droping duplicate columns\n", + "duplicates = set()\n", + "for i in range(0, len(df.columns)):\n", + " col1 = df.columns[i]\n", + " for j in range(i+1, len(df.columns)):\n", + " col2 = df.columns[j]\n", + " if(df[col1].equals(df[col2])):\n", + " duplicates.add(col2)\n", + "\n", + "\n", + "print (duplicates)\n", + "df.drop(duplicates, axis=1, inplace=True)\n", + "print (df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "566564c1-4661-4f07-9a60-5ac0ab836bcf", + "metadata": {}, + "outputs": [], + "source": [ + "# # pearson correlation heatmap before feature drop\n", + "\n", + "plt.figure(figsize=(70, 70))\n", + "corr = df.corr(numeric_only=True)\n", + "sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True) # annot=True\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cd75b4a-0bde-455c-bca2-0230d0e582b4", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "#droping highly correlated columns\n", + "correlated_col = set()\n", + "is_correlated = [True] * len(corr.columns)\n", + "threshold = 0.95\n", + "for i in range (len(corr.columns)):\n", + " if(is_correlated[i]):\n", + " for j in range(i):\n", + " if (np.abs(corr.iloc[i, j]) >= threshold) and (is_correlated[j]):\n", + " colname = corr.columns[j]\n", + " is_correlated[j]=False\n", + " correlated_col.add(colname)\n", + "\n", + "print(correlated_col)\n", + "print(len(correlated_col))\n", + "\n", + "df.drop(correlated_col, axis=1, inplace=True)\n", + "print (df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e425643d-d68d-4a9c-b13f-7aaebed6342e", + "metadata": {}, + "outputs": [], + "source": [ + "# %%time\n", + "# # pearson correlation heatmap after feature drop\n", + "\n", + "\n", + "# plt.figure(figsize=(70, 70))\n", + "# corr = df.corr(numeric_only=True)\n", + "# sns.heatmap(corr, annot=True, cmap='RdBu', vmin=-1, vmax=1, square=True) # annot=True\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41000a4d-fb2a-4067-8402-20dd92a6cdd4", + "metadata": {}, + "outputs": [], + "source": [ + "#splitting data after feature engineering \n", + "X_train= df.drop('Label', axis=1)\n", + "y_train = df['Label']\n", + "\n", + "\n", + "#ensure test set also has similar columns as train set\n", + "X_test = X_test[X_train.columns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e1f3631-9db4-40b0-acb5-50a37e6302cb", + "metadata": {}, + "outputs": [], + "source": [ + "#label encoding\n", + "#training RF\n", + "\n", + "label_encoder = LabelEncoder()\n", + "y_train = label_encoder.fit_transform(y_train)\n", + "# Create a dictionary mapping original labels to encoded values\n", + "label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))\n", + "print(label_mapping)\n", + "\n", + "y_test = label_encoder.fit_transform(y_test)\n", + "\n", + "# Create a dictionary mapping original labels to encoded values\n", + "label_mapping1 = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))\n", + "print(label_mapping1)\n", + "\n", + "# # Initialize the RandomForestClassifier\n", + "clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "\n", + "# # Fit the model\n", + "clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c2daab3-f7b1-41a3-8d55-3b340ffa0340", + "metadata": {}, + "outputs": [], + "source": [ + "#predict with RF\n", + "\n", + "y_pred = clf.predict(X_test)\n", + "\n", + "# Evaluate the classifier\n", + "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n", + "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22a5b3e0-713c-4fd4-b94a-7dc6848a6e61", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#training xgboost\n", + "model = XGBClassifier(max_depth=5, objective='multi:softmax', n_estimators=30, num_classes=11, subsample=0.5, max_delta_step=1,\n", + " eval_metric=[\"merror\",\"mlogloss\"])\n", + "\n", + "eval_set = [(X_train, y_train), (X_test, y_test)]\n", + "model.fit(X_train, y_train, eval_set=eval_set, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8c102b2-9785-4edb-bc98-cb8b550778f4", + "metadata": {}, + "outputs": [], + "source": [ + "#plot XGB losses\n", + "\n", + "results = model.evals_result()\n", + "epochs = len(results['validation_0']['merror'])\n", + "x_axis = range(0, epochs)\n", + "# plot log loss\n", + "fig, ax = pyplot.subplots()\n", + "ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')\n", + "ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')\n", + "ax.legend()\n", + "pyplot.ylabel('Log Loss')\n", + "pyplot.title('XGBoost Log Loss')\n", + "pyplot.show()\n", + "# plot classification error\n", + "fig, ax = pyplot.subplots()\n", + "ax.plot(x_axis, results['validation_0']['merror'], label='Train')\n", + "ax.plot(x_axis, results['validation_1']['merror'], label='Test')\n", + "ax.legend()\n", + "pyplot.ylabel('Classification Error')\n", + "pyplot.title('XGBoost Classification Error')\n", + "pyplot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58d428ea-e03a-4d48-a2ae-c698b6c1a5b6", + "metadata": {}, + "outputs": [], + "source": [ + "# make predictions for test data\n", + "y_pred = model.predict(X_test)\n", + "predictions = [round(value) for value in y_pred]\n", + "# evaluate predictions\n", + "accuracy = accuracy_score(y_test, predictions)\n", + "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce8b94bd-9229-44c6-9779-53862fb23949", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate the classifier\n", + "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n", + "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d64d7000-f95a-4b28-b140-b026adfbba15", + "metadata": {}, + "outputs": [], + "source": [ + "#uncomment one and run the rest of the code to see\n", + "#real and each generated dataset\n", + "\n", + "\n", + "#df = pd.read_csv('CTGAN_Results/real_dataset_all_in_one.csv') # real data\n", + "#pf = pd.read_csv('ADSGAN_Results/ADSGAN_synthetic_all.csv')\n", + "#df = pd.read_csv('CopulaGAN_Results/Copula_synthetic_all.csv')\n", + "#df = pd.read_csv('TVAE_Results/TVAE_synthetic_all.csv')\n", + "#df = pd.read_csv('TabFairGAN_Results/TabFairGAN_synthetic_all.csv')\n", + "#df = pd.read_csv('CTGAN_Results/CTGAN_synthetic_all.csv')\n", + "#df = pd.read_csv('RTVAE_Results/RTVAE_synthetic_all.csv')\n", + "\n", + "X_synth = synth[X_train.columns]\n", + "y_synth = synth['Label']\n", + "X_synth.shape, X_train.shape, y_synth.shape, y_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e867cb3-d2b5-4d8d-aa47-722e7993100a", + "metadata": {}, + "outputs": [], + "source": [ + "#labeling encoding \n", + "y_synth.unique()\n", + "y_synth = label_encoder.fit_transform(y_synth)\n", + "# Create a dictionary mapping original labels to encoded values\n", + "label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))\n", + "print(label_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f0bd457-845b-4426-9499-c5f18e192952", + "metadata": {}, + "outputs": [], + "source": [ + "#Random Forest\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", + "y_pred = clf.predict(X_synth)\n", + "\n", + "# Evaluate the classifier\n", + "print(\"Accuracy:\", accuracy_score(y_synth, y_pred))\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_synth, y_pred))\n", + "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_synth, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67e5681c-f6ad-41fa-89e5-3a25dcdb09b7", + "metadata": {}, + "outputs": [], + "source": [ + "#XGBoost\n", + "y_pred = model.predict(X_synth)\n", + "predictions = [round(value) for value in y_pred]\n", + "# evaluate predictions\n", + "accuracy = accuracy_score(y_synth, predictions)\n", + "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))\n", + "# Evaluate the classifier\n", + "print(\"Accuracy:\", accuracy_score(y_synth, y_pred))\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_synth, y_pred))\n", + "print(\"\\nConfusion Matrix:\\n\", confusion_matrix(y_synth, y_pred))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab