diff --git a/project/Modeling.ipynb b/project/Modeling.ipynb index e858982..21cda73 100644 --- a/project/Modeling.ipynb +++ b/project/Modeling.ipynb @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -63,28 +63,19 @@ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import xgboost as xgb\n", - "from catboost import CatBoostRegressor\n", - "from fastf1.ergast.structure import FastestLap\n", "from lightgbm import LGBMRegressor\n", - "from sklearn.compose import ColumnTransformer\n", "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n", "from sklearn.impute import SimpleImputer\n", - "from sklearn.linear_model import Lasso, LinearRegression, Ridge\n", - "from sklearn.metrics import (make_scorer, mean_absolute_error,\n", - " mean_squared_error, r2_score)\n", - "from sklearn.model_selection import (cross_val_score, cross_validate,\n", - " train_test_split)\n", - "from sklearn.pipeline import Pipeline, make_pipeline\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.svm import SVR\n", - "from sklearn.tree import DecisionTreeRegressor\n", "from xgboost import XGBRegressor" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -816,7 +807,7 @@ "24 2024-12-08 17:00:00+04:00 2024-12-08 13:00:00 True \n", "\n", "[25 rows x 23 columns]\n", - "{'Session2DateUtc', 'Session2Date', 'OfficialEventName', 'Session2', 'EventName', 'RoundNumber', 'Session5DateUtc', 'Session1DateUtc', 'Session4DateUtc', 'Session4Date', 'Session3Date', 'F1ApiSupport', 'Country', 'Session5', 'EventFormat', 'Session3DateUtc', 'Session4', 'Session1', 'Session1Date', 'Session3', 'EventDate', 'Location', 'Session5Date'}\n" + "{'RoundNumber', 'Location', 'Session5Date', 'Session3', 'EventName', 'OfficialEventName', 'Session5', 'Session2DateUtc', 'Session2Date', 'Session5DateUtc', 'Session1', 'Session2', 'Session1DateUtc', 'Session4Date', 'Session3DateUtc', 'EventDate', 'F1ApiSupport', 'Session1Date', 'Country', 'Session3Date', 'Session4', 'EventFormat', 'Session4DateUtc'}\n" ] } ], @@ -848,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -860,7 +851,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -922,22 +913,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing 2022 Dutch Grand Prix - Race\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "_api WARNING \tDriver 241: Position data is incomplete!\n", - "_api WARNING \tDriver 242: Position data is incomplete!\n", - "_api WARNING \tDriver 243: Position data is incomplete!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "Processing 2022 Dutch Grand Prix - Race\n", "Processing 2022 Italian Grand Prix - Race\n", "Processing 2022 Austrian Grand Prix - Race\n", "Processing 2022 Hungarian Grand Prix - Race\n", @@ -973,54 +949,11 @@ "text": [ "Processing 2022 Sao Paulo Grand Prix - Race\n", "Processing 2023 Bahrain Grand Prix - Race\n", - "Processing 2023 Saudi Arabian Grand Prix - Race\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "_api WARNING \tDriver 241: Position data is incomplete!\n", - "_api WARNING \tDriver 242: Position data is incomplete!\n", - "_api WARNING \tDriver 243: Position data is incomplete!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing 2023 Dutch Grand Prix - Race\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "_api WARNING \tDriver 241: Position data is incomplete!\n", - "_api WARNING \tDriver 242: Position data is incomplete!\n", - "_api WARNING \tDriver 243: Position data is incomplete!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "Processing 2023 Saudi Arabian Grand Prix - Race\n", + "Processing 2023 Dutch Grand Prix - Race\n", "Processing 2023 Italian Grand Prix - Race\n", "Processing 2023 Austrian Grand Prix - Race\n", - "Processing 2023 Hungarian Grand Prix - Race\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "_api WARNING \tSkipping lap alignment (no suitable lap)!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "Processing 2023 Hungarian Grand Prix - Race\n", "Processing 2023 British Grand Prix - Race\n", "Processing 2023 Belgian Grand Prix - Race\n" ] @@ -1202,7 +1135,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -1289,7 +1222,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -1323,77 +1256,7 @@ " 'feature_fraction': 0.8,\n", " 'bagging_fraction': 0.8,\n", " 'bagging_freq': 5\n", - " },\n", - " # 'British': {\n", - " # 'n_estimators': 500,\n", - " # 'max_depth': 8,\n", - " # 'learning_rate': 0.002,\n", - " # 'min_child_samples': 30,\n", - " # 'subsample': 0.75,\n", - " # 'colsample_bytree': 0.75,\n", - " # 'reg_alpha': 0.3,\n", - " # 'reg_lambda': 2.0,\n", - " # 'num_leaves': 30,\n", - " # 'feature_fraction': 0.7,\n", - " # 'bagging_fraction': 0.7,\n", - " # 'bagging_freq': 7\n", - " # },\n", - " # 'Bahrain': {\n", - " # 'n_estimators': 400,\n", - " # 'max_depth': 8,\n", - " # 'learning_rate': 0.003,\n", - " # 'min_child_samples': 25,\n", - " # 'subsample': 0.85,\n", - " # 'colsample_bytree': 0.85,\n", - " # 'reg_alpha': 0.2,\n", - " # 'reg_lambda': 1.5,\n", - " # 'num_leaves': 40,\n", - " # 'feature_fraction': 0.8,\n", - " # 'bagging_fraction': 0.8,\n", - " # 'bagging_freq': 5\n", - " # },\n", - " # 'Belgian': {\n", - " # 'n_estimators': 350,\n", - " # 'max_depth': 7,\n", - " # 'learning_rate': 0.004,\n", - " # 'min_child_samples': 20,\n", - " # 'subsample': 0.8,\n", - " # 'colsample_bytree': 0.8,\n", - " # 'reg_alpha': 0.15,\n", - " # 'reg_lambda': 1.2,\n", - " # 'num_leaves': 35,\n", - " # 'feature_fraction': 0.85,\n", - " # 'bagging_fraction': 0.85,\n", - " # 'bagging_freq': 4\n", - " # },\n", - " # 'Mexico': {\n", - " # 'n_estimators': 400,\n", - " # 'max_depth': 8,\n", - " # 'learning_rate': 0.003,\n", - " # 'min_child_samples': 25,\n", - " # 'subsample': 0.8,\n", - " # 'colsample_bytree': 0.8,\n", - " # 'reg_alpha': 0.25,\n", - " # 'reg_lambda': 1.8,\n", - " # 'num_leaves': 45,\n", - " # 'feature_fraction': 0.75,\n", - " # 'bagging_fraction': 0.75,\n", - " # 'bagging_freq': 6\n", - " # },\n", - " # 'United': {\n", - " # 'n_estimators': 350,\n", - " # 'max_depth': 7,\n", - " # 'learning_rate': 0.004,\n", - " # 'min_child_samples': 20,\n", - " # 'subsample': 0.8,\n", - " # 'colsample_bytree': 0.8,\n", - " # 'reg_alpha': 0.2,\n", - " # 'reg_lambda': 1.5,\n", - " # 'num_leaves': 38,\n", - " # 'feature_fraction': 0.8,\n", - " # 'bagging_fraction': 0.8,\n", - " # 'bagging_freq': 5\n", - " # }\n", + " }\n", " }\n", " \n", " for event in df['Event'].unique():\n", @@ -1486,7 +1349,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1537,7 +1400,17 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Execute modeling pipeline\n", + "track_results = prepare_modeling_data(merged_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1561,16 +1434,298 @@ "Gradient Boosting 4.259 0.726 2.083\n", "LightGBM 3.663 0.806 1.839\n", "Random Forest 4.915 0.644 2.396\n", - "XGBoost 4.333 0.717 2.122\n" + "XGBoost 4.333 0.717 2.122\n", + "Track: Bahrain Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 4.6234365858465685\n", + "R²: 0.6564200639110124\n", + "MAE: 2.290882533957278\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 4.482723061975936\n", + "R²: 0.6770154094684407\n", + "MAE: 2.29924114498376\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 3.8929079093506234\n", + "R²: 0.7564174098342964\n", + "MAE: 2.0801687790577947\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 4.430663019623761\n", + "R²: 0.6844738017082579\n", + "MAE: 2.2434135261926085\n", + "\n", + "\n", + "Track: Austrian Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 4.231983001996377\n", + "R²: 0.6315230167652944\n", + "MAE: 1.8037092244342445\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 3.638262625437007\n", + "R²: 0.7276605062041149\n", + "MAE: 1.5160557512941613\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 3.2085157738706402\n", + "R²: 0.7881975920851034\n", + "MAE: 1.3781442667850283\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 3.6075605644058957\n", + "R²: 0.7322374733886421\n", + "MAE: 1.502905939005281\n", + "\n", + "\n", + "Track: Hungarian Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 2.5335596356569674\n", + "R²: 0.5171089086867844\n", + "MAE: 1.160672366663474\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 2.455101907621795\n", + "R²: 0.5465535791811402\n", + "MAE: 1.103825270056087\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 1.6021632941275028\n", + "R²: 0.8068919020455854\n", + "MAE: 0.9263329831566685\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 2.4229222556465633\n", + "R²: 0.558362554661797\n", + "MAE: 1.0875929258632762\n", + "\n", + "\n", + "Track: Italian Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 5.998999472404887\n", + "R²: 0.5962048423983586\n", + "MAE: 2.7633890861056942\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 5.593110923296684\n", + "R²: 0.6489974078020129\n", + "MAE: 2.53460153865636\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 4.74839367309371\n", + "R²: 0.7470137635605553\n", + "MAE: 1.989492196404826\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 5.494694856961057\n", + "R²: 0.6612411752270557\n", + "MAE: 2.487119952559893\n", + "\n", + "\n", + "Track: Belgian Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 2.82430664259527\n", + "R²: 0.691813998116644\n", + "MAE: 1.5528536196862557\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 2.625038105313402\n", + "R²: 0.7337678876875194\n", + "MAE: 1.3691763226722693\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 2.0031574571367945\n", + "R²: 0.8449686995436466\n", + "MAE: 1.2390753487134425\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 2.5552156220375553\n", + "R²: 0.7477423638235207\n", + "MAE: 1.3303552562227956\n", + "\n", + "\n", + "Track: Mexico City Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 4.581602699770516\n", + "R²: 0.7989636810843874\n", + "MAE: 2.447646886107032\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 4.235484309405982\n", + "R²: 0.8281910333067097\n", + "MAE: 2.27794335137729\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 3.2405925353794878\n", + "R²: 0.8994253509007487\n", + "MAE: 1.809661752663035\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 4.061311644595468\n", + "R²: 0.8420308416971385\n", + "MAE: 2.2083353448847975\n", + "\n", + "\n", + "Track: Dutch Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 5.242245165574648\n", + "R²: 0.6213123596709339\n", + "MAE: 2.4352761691493408\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 4.339238736578863\n", + "R²: 0.7405381175106279\n", + "MAE: 1.974121811562873\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 3.592136758167162\n", + "R²: 0.8221916302173067\n", + "MAE: 1.6727451259978803\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 4.277023140315852\n", + "R²: 0.7479250585546416\n", + "MAE: 1.937106220056557\n", + "\n", + "\n", + "Track: United States Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 3.914879341498217\n", + "R²: 0.6624378004281326\n", + "MAE: 1.9716752813532386\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 3.316411051013265\n", + "R²: 0.7577555964658077\n", + "MAE: 1.7003761926987593\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 3.0797295392489423\n", + "R²: 0.7910981902207407\n", + "MAE: 1.560454661178369\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 3.248990517657715\n", + "R²: 0.7675048311230572\n", + "MAE: 1.6766273725189562\n", + "\n", + "\n", + "Track: British Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 6.224674007048496\n", + "R²: 0.7040484018401661\n", + "MAE: 2.9628109220454757\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 5.62291328677216\n", + "R²: 0.7585038352530639\n", + "MAE: 2.6943669301228352\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 4.893601769901292\n", + "R²: 0.8170869401437746\n", + "MAE: 2.4294869493658147\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 5.492286957459624\n", + "R²: 0.7695939357041066\n", + "MAE: 2.613212445632392\n", + "\n", + "\n", + "Track: Saudi Arabian Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 8.120691777901165\n", + "R²: 0.4578286872779349\n", + "MAE: 4.039953681061913\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 6.65320459822338\n", + "R²: 0.6360747086514538\n", + "MAE: 3.274169052844664\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 5.855792569073277\n", + "R²: 0.7180826211908908\n", + "MAE: 2.835742073310954\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 6.672457444168485\n", + "R²: 0.6339654287673655\n", + "MAE: 3.2976268715753827\n", + "\n", + "\n", + "Track: Sao Paulo Grand Prix\n", + "Model: Random Forest\n", + "RMSE: 5.7693406331780945\n", + "R²: 0.7506457092594622\n", + "MAE: 2.925633957041751\n", + "\n", + "\n", + "Model: XGBoost\n", + "RMSE: 4.702681090626618\n", + "R²: 0.8343255491465327\n", + "MAE: 2.598996936017178\n", + "\n", + "\n", + "Model: LightGBM\n", + "RMSE: 4.175640904129102\n", + "R²: 0.8693796721301632\n", + "MAE: 2.305109736740256\n", + "\n", + "\n", + "Model: Gradient Boosting\n", + "RMSE: 4.585681379219731\n", + "R²: 0.8424667492339786\n", + "MAE: 2.5341300309961023\n", + "\n", + "\n" ] } ], "source": [ - "# Execute modeling pipeline\n", - "track_results = prepare_modeling_data(merged_data)\n", - "\n", "# Visualize results\n", - "plot_model_performance(track_results)" + "plot_model_performance(track_results)\n", + "\n", + "# Print out each model's performance for each track\n", + "for track, models in track_results.items():\n", + " print(f\"Track: {track}\")\n", + " for model_name, metrics in models.items():\n", + " print(f\"Model: {model_name}\")\n", + " print(f\"RMSE: {metrics['rmse']}\")\n", + " print(f\"R²: {metrics['r2']}\")\n", + " print(f\"MAE: {metrics['mae']}\")\n", + " print(\"\\n\")" ] }, {