Remove track-specific hyperparameter optimizations, add raw data output

2026-02-05 00:06:39 -05:00 · 2024-12-10 19:49:38 -05:00
parent b6029eb929
commit 890bbeb0f7
1 changed files with 314 additions and 159 deletions
--- a/project/Modeling.ipynb
+++ b/project/Modeling.ipynb
@@ -48,7 +48,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -63,28 +63,19 @@
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import xgboost as xgb\n",
    "from catboost import CatBoostRegressor\n",
    "from fastf1.ergast.structure import FastestLap\n",
    "from lightgbm import LGBMRegressor\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n",
    "from sklearn.impute import SimpleImputer\n",
-    "from sklearn.linear_model import Lasso, LinearRegression, Ridge\n",
+    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
-    "from sklearn.metrics import (make_scorer, mean_absolute_error,\n",
+    "from sklearn.model_selection import train_test_split\n",
-    "                             mean_squared_error, r2_score)\n",
+    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import (cross_val_score, cross_validate,\n",
    "                                     train_test_split)\n",
    "from sklearn.pipeline import Pipeline, make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from xgboost import XGBRegressor"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@@ -816,7 +807,7 @@
      "24  2024-12-08 17:00:00+04:00 2024-12-08 13:00:00         True  \n",
      "\n",
      "[25 rows x 23 columns]\n",
-      "{'Session2DateUtc', 'Session2Date', 'OfficialEventName', 'Session2', 'EventName', 'RoundNumber', 'Session5DateUtc', 'Session1DateUtc', 'Session4DateUtc', 'Session4Date', 'Session3Date', 'F1ApiSupport', 'Country', 'Session5', 'EventFormat', 'Session3DateUtc', 'Session4', 'Session1', 'Session1Date', 'Session3', 'EventDate', 'Location', 'Session5Date'}\n"
+      "{'RoundNumber', 'Location', 'Session5Date', 'Session3', 'EventName', 'OfficialEventName', 'Session5', 'Session2DateUtc', 'Session2Date', 'Session5DateUtc', 'Session1', 'Session2', 'Session1DateUtc', 'Session4Date', 'Session3DateUtc', 'EventDate', 'F1ApiSupport', 'Session1Date', 'Country', 'Session3Date', 'Session4', 'EventFormat', 'Session4DateUtc'}\n"
     ]
    }
   ],
@@ -848,7 +839,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -860,7 +851,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@@ -922,22 +913,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Processing 2022 Dutch Grand Prix - Race\n"
+      "Processing 2022 Dutch Grand Prix - Race\n",
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "_api        WARNING \tDriver 241: Position data is incomplete!\n",
      "_api        WARNING \tDriver 242: Position data is incomplete!\n",
      "_api        WARNING \tDriver 243: Position data is incomplete!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing 2022 Italian Grand Prix - Race\n",
      "Processing 2022 Austrian Grand Prix - Race\n",
      "Processing 2022 Hungarian Grand Prix - Race\n",
@@ -973,54 +949,11 @@
     "text": [
      "Processing 2022 Sao Paulo Grand Prix - Race\n",
      "Processing 2023 Bahrain Grand Prix - Race\n",
-      "Processing 2023 Saudi Arabian Grand Prix - Race\n"
+      "Processing 2023 Saudi Arabian Grand Prix - Race\n",
-     ]
+      "Processing 2023 Dutch Grand Prix - Race\n",
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "_api        WARNING \tDriver 241: Position data is incomplete!\n",
      "_api        WARNING \tDriver 242: Position data is incomplete!\n",
      "_api        WARNING \tDriver 243: Position data is incomplete!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing 2023 Dutch Grand Prix - Race\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "_api        WARNING \tDriver 241: Position data is incomplete!\n",
      "_api        WARNING \tDriver 242: Position data is incomplete!\n",
      "_api        WARNING \tDriver 243: Position data is incomplete!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing 2023 Italian Grand Prix - Race\n",
      "Processing 2023 Austrian Grand Prix - Race\n",
-      "Processing 2023 Hungarian Grand Prix - Race\n"
+      "Processing 2023 Hungarian Grand Prix - Race\n",
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "_api        WARNING \tSkipping lap alignment (no suitable lap)!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing 2023 British Grand Prix - Race\n",
      "Processing 2023 Belgian Grand Prix - Race\n"
     ]
@@ -1202,7 +1135,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1289,7 +1222,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1323,77 +1256,7 @@
    "            'feature_fraction': 0.8,\n",
    "            'bagging_fraction': 0.8,\n",
    "            'bagging_freq': 5\n",
-    "        },\n",
+    "        }\n",
    "        # 'British': {\n",
    "        #     'n_estimators': 500,\n",
    "        #     'max_depth': 8,\n",
    "        #     'learning_rate': 0.002,\n",
    "        #     'min_child_samples': 30,\n",
    "        #     'subsample': 0.75,\n",
    "        #     'colsample_bytree': 0.75,\n",
    "        #     'reg_alpha': 0.3,\n",
    "        #     'reg_lambda': 2.0,\n",
    "        #     'num_leaves': 30,\n",
    "        #     'feature_fraction': 0.7,\n",
    "        #     'bagging_fraction': 0.7,\n",
    "        #     'bagging_freq': 7\n",
    "        # },\n",
    "        # 'Bahrain': {\n",
    "        #     'n_estimators': 400,\n",
    "        #     'max_depth': 8,\n",
    "        #     'learning_rate': 0.003,\n",
    "        #     'min_child_samples': 25,\n",
    "        #     'subsample': 0.85,\n",
    "        #     'colsample_bytree': 0.85,\n",
    "        #     'reg_alpha': 0.2,\n",
    "        #     'reg_lambda': 1.5,\n",
    "        #     'num_leaves': 40,\n",
    "        #     'feature_fraction': 0.8,\n",
    "        #     'bagging_fraction': 0.8,\n",
    "        #     'bagging_freq': 5\n",
    "        # },\n",
    "        # 'Belgian': {\n",
    "        #     'n_estimators': 350,\n",
    "        #     'max_depth': 7,\n",
    "        #     'learning_rate': 0.004,\n",
    "        #     'min_child_samples': 20,\n",
    "        #     'subsample': 0.8,\n",
    "        #     'colsample_bytree': 0.8,\n",
    "        #     'reg_alpha': 0.15,\n",
    "        #     'reg_lambda': 1.2,\n",
    "        #     'num_leaves': 35,\n",
    "        #     'feature_fraction': 0.85,\n",
    "        #     'bagging_fraction': 0.85,\n",
    "        #     'bagging_freq': 4\n",
    "        # },\n",
    "        # 'Mexico': {\n",
    "        #     'n_estimators': 400,\n",
    "        #     'max_depth': 8,\n",
    "        #     'learning_rate': 0.003,\n",
    "        #     'min_child_samples': 25,\n",
    "        #     'subsample': 0.8,\n",
    "        #     'colsample_bytree': 0.8,\n",
    "        #     'reg_alpha': 0.25,\n",
    "        #     'reg_lambda': 1.8,\n",
    "        #     'num_leaves': 45,\n",
    "        #     'feature_fraction': 0.75,\n",
    "        #     'bagging_fraction': 0.75,\n",
    "        #     'bagging_freq': 6\n",
    "        # },\n",
    "        # 'United': {\n",
    "        #     'n_estimators': 350,\n",
    "        #     'max_depth': 7,\n",
    "        #     'learning_rate': 0.004,\n",
    "        #     'min_child_samples': 20,\n",
    "        #     'subsample': 0.8,\n",
    "        #     'colsample_bytree': 0.8,\n",
    "        #     'reg_alpha': 0.2,\n",
    "        #     'reg_lambda': 1.5,\n",
    "        #     'num_leaves': 38,\n",
    "        #     'feature_fraction': 0.8,\n",
    "        #     'bagging_fraction': 0.8,\n",
    "        #     'bagging_freq': 5\n",
    "        # }\n",
    "    }\n",
    "    \n",
    "    for event in df['Event'].unique():\n",
@@ -1486,7 +1349,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1537,7 +1400,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute modeling pipeline\n",
    "track_results = prepare_modeling_data(merged_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
@@ -1561,16 +1434,298 @@
      "Gradient Boosting  4.259  0.726  2.083\n",
      "LightGBM           3.663  0.806  1.839\n",
      "Random Forest      4.915  0.644  2.396\n",
-      "XGBoost            4.333  0.717  2.122\n"
+      "XGBoost            4.333  0.717  2.122\n",
      "Track: Bahrain Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 4.6234365858465685\n",
      "R²: 0.6564200639110124\n",
      "MAE: 2.290882533957278\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 4.482723061975936\n",
      "R²: 0.6770154094684407\n",
      "MAE: 2.29924114498376\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 3.8929079093506234\n",
      "R²: 0.7564174098342964\n",
      "MAE: 2.0801687790577947\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 4.430663019623761\n",
      "R²: 0.6844738017082579\n",
      "MAE: 2.2434135261926085\n",
      "\n",
      "\n",
      "Track: Austrian Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 4.231983001996377\n",
      "R²: 0.6315230167652944\n",
      "MAE: 1.8037092244342445\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 3.638262625437007\n",
      "R²: 0.7276605062041149\n",
      "MAE: 1.5160557512941613\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 3.2085157738706402\n",
      "R²: 0.7881975920851034\n",
      "MAE: 1.3781442667850283\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 3.6075605644058957\n",
      "R²: 0.7322374733886421\n",
      "MAE: 1.502905939005281\n",
      "\n",
      "\n",
      "Track: Hungarian Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 2.5335596356569674\n",
      "R²: 0.5171089086867844\n",
      "MAE: 1.160672366663474\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 2.455101907621795\n",
      "R²: 0.5465535791811402\n",
      "MAE: 1.103825270056087\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 1.6021632941275028\n",
      "R²: 0.8068919020455854\n",
      "MAE: 0.9263329831566685\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 2.4229222556465633\n",
      "R²: 0.558362554661797\n",
      "MAE: 1.0875929258632762\n",
      "\n",
      "\n",
      "Track: Italian Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 5.998999472404887\n",
      "R²: 0.5962048423983586\n",
      "MAE: 2.7633890861056942\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 5.593110923296684\n",
      "R²: 0.6489974078020129\n",
      "MAE: 2.53460153865636\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 4.74839367309371\n",
      "R²: 0.7470137635605553\n",
      "MAE: 1.989492196404826\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 5.494694856961057\n",
      "R²: 0.6612411752270557\n",
      "MAE: 2.487119952559893\n",
      "\n",
      "\n",
      "Track: Belgian Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 2.82430664259527\n",
      "R²: 0.691813998116644\n",
      "MAE: 1.5528536196862557\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 2.625038105313402\n",
      "R²: 0.7337678876875194\n",
      "MAE: 1.3691763226722693\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 2.0031574571367945\n",
      "R²: 0.8449686995436466\n",
      "MAE: 1.2390753487134425\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 2.5552156220375553\n",
      "R²: 0.7477423638235207\n",
      "MAE: 1.3303552562227956\n",
      "\n",
      "\n",
      "Track: Mexico City Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 4.581602699770516\n",
      "R²: 0.7989636810843874\n",
      "MAE: 2.447646886107032\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 4.235484309405982\n",
      "R²: 0.8281910333067097\n",
      "MAE: 2.27794335137729\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 3.2405925353794878\n",
      "R²: 0.8994253509007487\n",
      "MAE: 1.809661752663035\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 4.061311644595468\n",
      "R²: 0.8420308416971385\n",
      "MAE: 2.2083353448847975\n",
      "\n",
      "\n",
      "Track: Dutch Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 5.242245165574648\n",
      "R²: 0.6213123596709339\n",
      "MAE: 2.4352761691493408\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 4.339238736578863\n",
      "R²: 0.7405381175106279\n",
      "MAE: 1.974121811562873\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 3.592136758167162\n",
      "R²: 0.8221916302173067\n",
      "MAE: 1.6727451259978803\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 4.277023140315852\n",
      "R²: 0.7479250585546416\n",
      "MAE: 1.937106220056557\n",
      "\n",
      "\n",
      "Track: United States Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 3.914879341498217\n",
      "R²: 0.6624378004281326\n",
      "MAE: 1.9716752813532386\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 3.316411051013265\n",
      "R²: 0.7577555964658077\n",
      "MAE: 1.7003761926987593\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 3.0797295392489423\n",
      "R²: 0.7910981902207407\n",
      "MAE: 1.560454661178369\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 3.248990517657715\n",
      "R²: 0.7675048311230572\n",
      "MAE: 1.6766273725189562\n",
      "\n",
      "\n",
      "Track: British Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 6.224674007048496\n",
      "R²: 0.7040484018401661\n",
      "MAE: 2.9628109220454757\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 5.62291328677216\n",
      "R²: 0.7585038352530639\n",
      "MAE: 2.6943669301228352\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 4.893601769901292\n",
      "R²: 0.8170869401437746\n",
      "MAE: 2.4294869493658147\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 5.492286957459624\n",
      "R²: 0.7695939357041066\n",
      "MAE: 2.613212445632392\n",
      "\n",
      "\n",
      "Track: Saudi Arabian Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 8.120691777901165\n",
      "R²: 0.4578286872779349\n",
      "MAE: 4.039953681061913\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 6.65320459822338\n",
      "R²: 0.6360747086514538\n",
      "MAE: 3.274169052844664\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 5.855792569073277\n",
      "R²: 0.7180826211908908\n",
      "MAE: 2.835742073310954\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 6.672457444168485\n",
      "R²: 0.6339654287673655\n",
      "MAE: 3.2976268715753827\n",
      "\n",
      "\n",
      "Track: Sao Paulo Grand Prix\n",
      "Model: Random Forest\n",
      "RMSE: 5.7693406331780945\n",
      "R²: 0.7506457092594622\n",
      "MAE: 2.925633957041751\n",
      "\n",
      "\n",
      "Model: XGBoost\n",
      "RMSE: 4.702681090626618\n",
      "R²: 0.8343255491465327\n",
      "MAE: 2.598996936017178\n",
      "\n",
      "\n",
      "Model: LightGBM\n",
      "RMSE: 4.175640904129102\n",
      "R²: 0.8693796721301632\n",
      "MAE: 2.305109736740256\n",
      "\n",
      "\n",
      "Model: Gradient Boosting\n",
      "RMSE: 4.585681379219731\n",
      "R²: 0.8424667492339786\n",
      "MAE: 2.5341300309961023\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Execute modeling pipeline\n",
    "track_results = prepare_modeling_data(merged_data)\n",
    "\n",
    "# Visualize results\n",
-    "plot_model_performance(track_results)"
+    "plot_model_performance(track_results)\n",
    "\n",
    "# Print out each model's performance for each track\n",
    "for track, models in track_results.items():\n",
    "    print(f\"Track: {track}\")\n",
    "    for model_name, metrics in models.items():\n",
    "        print(f\"Model: {model_name}\")\n",
    "        print(f\"RMSE: {metrics['rmse']}\")\n",
    "        print(f\"R²: {metrics['r2']}\")\n",
    "        print(f\"MAE: {metrics['mae']}\")\n",
    "        print(\"\\n\")"
   ]
  },
  {