Add data for 2019

8863e134 · Fadi Gattoussi · Michael Mutote · f2732caa · 8863e134 · 8863e134
Commit 8863e134 authored 1 year ago by Fadi Gattoussi Committed by Michael Mutote 1 year ago
--- a/angie.pkl
+++ b/angie.pkl
--- a/fadi.joblib
+++ b/fadi.joblib
--- a/training.ipynb
+++ b/training.ipynb
--- a/training2.ipynb
+++ b/training2.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:07:42.460196100Z",
+     "start_time": "2023-12-19T19:07:37.812448500Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import sklearn as sk\n",
+    "import matplotlib.pyplot as plt\n",
+    "import json\n",
+    "import math"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:12:53.871500800Z",
+     "start_time": "2023-12-19T19:12:31.371636500Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cf_2018 = pd.read_csv('flight_data/Combined_Flights_2018.csv')\n",
+    "# cf_2019 = pd.read_csv('flight_data/Combined_Flights_2019.csv')\n",
+    "# combined_data = pd.concat([cf_2018, cf_2019])\n",
+    "# cf_2018 = combined_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:12:58.721917100Z",
+     "start_time": "2023-12-19T19:12:58.100801800Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Keep only the flights whose origin airport is JFK\n",
+    "import RegressionModel\n",
+    "\n",
+    "filtered_df = cf_2018[(cf_2018['Origin'] == 'JFK')].copy()\n",
+    "\n",
+    "RegressionModel.destinations = list(cf_2018['DestCityName'].unique())\n",
+    "\n",
+    "# filtered_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:41.065489500Z",
+     "start_time": "2023-12-19T19:28:41.051898700Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "relevant_columns = ['FlightDate',  'AirTime',  'DOT_ID_Operating_Airline', 'DestAirportID','DepDelayMinutes', 'ArrDelayMinutes']\n",
+    "jfk_flights_2018 = filtered_df[relevant_columns].copy()\n",
+    "jfk_flights_2018.dropna(inplace=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:41.793062300Z",
+     "start_time": "2023-12-19T19:28:41.766357100Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# jfk_flights_2018"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:42.540461600Z",
+     "start_time": "2023-12-19T19:28:42.523178400Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Convert 'FlightDate' into numerical components\n",
+    "jfk_flights_2018['FlightDate'] = pd.to_datetime(jfk_flights_2018['FlightDate'])\n",
+    "# jfk_flights_2018['Year'] = jfk_flights_2018['FlightDate'].dt.year\n",
+    "jfk_flights_2018['Month'] = jfk_flights_2018['FlightDate'].dt.month\n",
+    "jfk_flights_2018['Day'] = jfk_flights_2018['FlightDate'].dt.day\n",
+    "\n",
+    "jfk_flights_2018 = jfk_flights_2018.dropna()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:43.294133400Z",
+     "start_time": "2023-12-19T19:28:43.277070800Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "      AirTime  DOT_ID_Operating_Airline  DestAirportID  Month\n5544    336.0                     20409          14831      1\n5547    182.0                     20409          13495      1\n5548    124.0                     20409          12451      1\n5554     50.0                     20409          14576      1\n5565     54.0                     20409          10792      1",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>AirTime</th>\n      <th>DOT_ID_Operating_Airline</th>\n      <th>DestAirportID</th>\n      <th>Month</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>5544</th>\n      <td>336.0</td>\n      <td>20409</td>\n      <td>14831</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>5547</th>\n      <td>182.0</td>\n      <td>20409</td>\n      <td>13495</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>5548</th>\n      <td>124.0</td>\n      <td>20409</td>\n      <td>12451</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>5554</th>\n      <td>50.0</td>\n      <td>20409</td>\n      <td>14576</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>5565</th>\n      <td>54.0</td>\n      <td>20409</td>\n      <td>10792</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "execution_count": 84,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Target variable is 'ArrDelayMinutes'; both delay columns and 'FlightDate' are removed from the features\n",
+    "X = jfk_flights_2018.drop('DepDelayMinutes', axis=1)\n",
+    "X = X.drop(['ArrDelayMinutes'], axis=1)\n",
+    "X = X.drop([\"FlightDate\"], axis=1)\n",
+    "# y = jfk_flights_2018[['DepDelayMinutes', 'ArrDelayMinutes']]\n",
+    "y = jfk_flights_2018['ArrDelayMinutes']\n",
+    "\n",
+    "X.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:44.615901600Z",
+     "start_time": "2023-12-19T19:28:44.499070300Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# Splitting the dataset into training and testing sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+    "\n",
+    "X_train = X_train.apply(pd.to_numeric, errors='coerce')\n",
+    "y_train = y_train.apply(pd.to_numeric, errors='coerce')\n",
+    "X_test = X_test.apply(pd.to_numeric, errors='coerce')\n",
+    "y_test = y_test.apply(pd.to_numeric, errors='coerce')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:55.012761300Z",
+     "start_time": "2023-12-19T19:28:45.594784400Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "RandomForestRegressor(random_state=42)",
+      "text/html": "<style>#sk-container-id-10 {color: black;}#sk-container-id-10 pre{padding: 0;}#sk-container-id-10 div.sk-toggleable {background-color: white;}#sk-container-id-10 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-10 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-10 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-10 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-10 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-10 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-10 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-10 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-10 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-10 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-10 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-10 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-10 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-10 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-10 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-10 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-10 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-10 div.sk-item {position: relative;z-index: 1;}#sk-container-id-10 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-10 div.sk-item::before, #sk-container-id-10 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-10 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-10 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-10 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-10 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-10 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-10 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-10 div.sk-label-container {text-align: center;}#sk-container-id-10 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-10 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-10\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestRegressor(random_state=42)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-10\" type=\"checkbox\" checked><label for=\"sk-estimator-id-10\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestRegressor</label><div class=\"sk-toggleable__content\"><pre>RandomForestRegressor(random_state=42)</pre></div></div></div></div></div>"
+     },
+     "execution_count": 86,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.ensemble import RandomForestRegressor\n",
+    "# Initialize the model\n",
+    "model = LinearRegression()\n",
+    "model2 = RandomForestRegressor(n_estimators=100,random_state=42)\n",
+    "\n",
+    "# Train the model\n",
+    "model.fit(X_train, y_train)\n",
+    "model2.fit(X_train, y_train)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:56.633803400Z",
+     "start_time": "2023-12-19T19:28:56.233809100Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mean Squared Error: 2100.674655688025\n",
+      "MSE 2  2540.4206764113023\n",
+      "R^2 Score: 0.0025435901788870563\n",
+      "R^2 Score: -0.20625956069270712\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import mean_squared_error, r2_score\n",
+    "\n",
+    "# Predict on the test set\n",
+    "y_pred = model.predict(X_test)\n",
+    "y_pred2 = model2.predict(X_test)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "mse = mean_squared_error(y_test, y_pred)\n",
+    "mse2 = mean_squared_error(y_test, y_pred2)\n",
+    "r2 = r2_score(y_test, y_pred)\n",
+    "r22 = r2_score(y_test, y_pred2)\n",
+    "print(\"Mean Squared Error:\", mse)\n",
+    "print(\"MSE 2 \", mse2)\n",
+    "print(\"R^2 Score:\", r2)\n",
+    "print(\"R^2 Score:\", r22)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "['C:/Users/s2080/PycharmProjects/ws-23-sas-02/fadi.joblib']"
+     },
+     "execution_count": 88,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from joblib import dump\n",
+    "dump(model, 'C:/Users/s2080/PycharmProjects/ws-23-sas-02/fadi.joblib')"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:59.088599400Z",
+     "start_time": "2023-12-19T19:28:59.083322200Z"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "outputs": [],
+   "source": [
+    "from joblib import load\n",
+    "model = load('fadi.joblib')"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:28:59.878613100Z",
+     "start_time": "2023-12-19T19:28:59.869948100Z"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "outputs": [],
+   "source": [
+    "jfk_flights_2018.to_pickle('C:/Users/s2080/PycharmProjects/ws-23-sas-02/angie.pkl')"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:29:00.609582Z",
+     "start_time": "2023-12-19T19:29:00.601911100Z"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "outputs": [],
+   "source": [
+    "jfk_flights_2018 = pd.read_pickle(\"angie.pkl\")\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-12-19T19:23:51.598644400Z",
+     "start_time": "2023-12-19T19:23:51.582771600Z"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# jfk_unpickle.head()"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "sas2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:code id: tags:
+
+``` python
+import pandas as pd
+import sklearn as sk
+import matplotlib.pyplot as plt
+import json
+import math
+```
+
+%% Cell type:code id: tags:
+
+``` python
+cf_2018 = pd.read_csv('flight_data/Combined_Flights_2018.csv')  # only the 2018 CSV is read; the 2019 merge below is commented out
+# cf_2019 = pd.read_csv('flight_data/Combined_Flights_2019.csv')
+# combined_data = pd.concat([cf_2018, cf_2019])
+# cf_2018 = combined_data
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Filter the dataframe to include only the delays from JFK
+import RegressionModel
+
+filtered_df = cf_2018[(cf_2018['Origin'] == 'JFK')].copy()
+
+RegressionModel.destinations = list(cf_2018['DestCityName'].unique())
+
+# filtered_df
+```
+
+%% Cell type:code id: tags:
+
+``` python
+relevant_columns = ['FlightDate',  'AirTime',  'DOT_ID_Operating_Airline', 'DestAirportID','DepDelayMinutes', 'ArrDelayMinutes']
+jfk_flights_2018 = filtered_df[relevant_columns].copy()
+jfk_flights_2018.dropna(inplace=True)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# jfk_flights_2018
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Convert 'FlightDate' into numerical components
+jfk_flights_2018['FlightDate'] = pd.to_datetime(jfk_flights_2018['FlightDate'])
+# jfk_flights_2018['Year'] = jfk_flights_2018['FlightDate'].dt.year
+jfk_flights_2018['Month'] = jfk_flights_2018['FlightDate'].dt.month
+jfk_flights_2018['Day'] = jfk_flights_2018['FlightDate'].dt.day
+
+jfk_flights_2018 = jfk_flights_2018.dropna()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+from sklearn.model_selection import train_test_split
+
+# Choose your target variable, e.g., 'DepDelayMinutes'
+X = jfk_flights_2018.drop('DepDelayMinutes', axis=1)
+X = X.drop(['ArrDelayMinutes'], axis=1)
+X = X.drop(["FlightDate"], axis=1)
+# y = jfk_flights_2018[['DepDelayMinutes', 'ArrDelayMinutes']]
+y = jfk_flights_2018['ArrDelayMinutes']
+
+X.head()
+```
+
+%% Output
+
+      AirTime  DOT_ID_Operating_Airline  DestAirportID  Month
+5544    336.0                     20409          14831      1
+5547    182.0                     20409          13495      1
+5548    124.0                     20409          12451      1
+5554     50.0                     20409          14576      1
+5565     54.0                     20409          10792      1
+
+%% Cell type:code id: tags:
+
+``` python
+
+
+# Splitting the dataset into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+X_train = X_train.apply(pd.to_numeric, errors='coerce')
+y_train = y_train.apply(pd.to_numeric, errors='coerce')
+X_test = X_test.apply(pd.to_numeric, errors='coerce')
+y_test = y_test.apply(pd.to_numeric, errors='coerce')
+```
+
+%% Cell type:code id: tags:
+
+``` python
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+# Initialize the model
+model = LinearRegression()
+model2 = RandomForestRegressor(n_estimators=100,random_state=42)
+
+# Train the model
+model.fit(X_train, y_train)
+model2.fit(X_train, y_train)
+```
+
+%% Output
+
+RandomForestRegressor(random_state=42)
+
+%% Cell type:code id: tags:
+
+``` python
+from sklearn.metrics import mean_squared_error, r2_score
+
+# Predict on the test set
+y_pred = model.predict(X_test)
+y_pred2 = model2.predict(X_test)
+
+# Evaluate the model
+mse = mean_squared_error(y_test, y_pred)
+mse2 = mean_squared_error(y_test, y_pred2)
+r2 = r2_score(y_test, y_pred)
+r22 = r2_score(y_test, y_pred2)
+print("Mean Squared Error:", mse)
+print("MSE 2 ", mse2)
+print("R^2 Score:", r2)
+print("R^2 Score:", r22)
+```
+
+%% Output
+
+    Mean Squared Error: 2100.674655688025
+    MSE 2  2540.4206764113023
+    R^2 Score: 0.0025435901788870563
+    R^2 Score: -0.20625956069270712
+
+%% Cell type:code id: tags:
+
+``` python
+from joblib import dump
+dump(model, 'C:/Users/s2080/PycharmProjects/ws-23-sas-02/fadi.joblib')
+```
+
+%% Output
+
+['C:/Users/s2080/PycharmProjects/ws-23-sas-02/fadi.joblib']
+
+%% Cell type:code id: tags:
+
+``` python
+from joblib import load
+model = load('fadi.joblib')  # rebinds `model` to the object stored in fadi.joblib (path is relative to the working directory)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+jfk_flights_2018.to_pickle('C:/Users/s2080/PycharmProjects/ws-23-sas-02/angie.pkl')
+```
+
+%% Cell type:code id: tags:
+
+``` python
+jfk_flights_2018 = pd.read_pickle("angie.pkl")  # replaces the in-memory frame with the pickled copy; only unpickle files from a trusted source
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# jfk_unpickle.head()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```