{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Formula One Project: Modeling\n", "\n", "DUE: December 4th, 2024 (Wed) \n", "Name(s): Sean O'Connor, Connor Coles \n", "Class: CSCI 349 - Intro to Data Mining \n", "Semester: Fall 2024 \n", "Instructor: Brian King " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Assignment Description\n", "\n", "Copy over the important cells from the previous step that read in and cleaned your data to this new notebook file. You do not need to copy over all your EDA and plots describing your data, only the code that prepares your data for modeling. This notebook is about exploring the development of predictive models. Some initial preliminary work on applying some modeling techniques should be completed.\n", "Be sure to commit and push all supporting code that you've completed in this file. Include in this notebook a summary cell at the top that details your accomplishments, challenges, and what you expect to accomplish for your final steps. Be sure to update your readme.md in your repository." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Importing Libraries\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import os\n", "\n", "import fastf1\n", "import fastf1.plotting\n", "from fastf1.ergast.structure import FastestLap\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.metrics import mean_squared_error, r2_score\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "from sklearn.svm import SVR\n", "import xgboost as xgb\n", "from sklearn.model_selection import cross_val_score" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']\n" ] } ], "source": [ "# FastF1 general setup\n", "cache_dir = '../data/cache'\n", "if not os.path.exists(cache_dir):\n", " os.makedirs(cache_dir)\n", "\n", "fastf1.Cache.enable_cache(cache_dir)\n", "fastf1.plotting.setup_mpl(misc_mpl_mods=False, color_scheme=None)\n", "\n", "# Set up plot style\n", "# print style.available to check available styles\n", "print(plt.style.available)\n", "plt.style.use('seaborn-v0_8-whitegrid')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Define years, sessions, and events of interest\n", "years = [2021, 2022, 2023, 2024]\n", "sessions = ['Race']\n", "events = ['Bahrain Grand Prix', 'British Grand Prix', 'United States Grand Prix', 'Mexico City Grand Prix', 'São Paulo Grand Prix'] " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing 2021 Bahrain Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "core INFO \tLoading data for Bahrain Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n", "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['44', '33', '77', '4', '11', '16', '3', '55', '22', '18', '7', '99', '31', '63', '5', '47', '10', '6', '14', '9']\n", "core INFO \tLoading data for British Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2021 British Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['44', '16', '77', '4', '3', '55', '14', '18', '31', '22', '10', '63', '99', '6', '7', '11', '9', '47', '5', '33']\n", "core INFO \tLoading data for United States Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2021 United States Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "core WARNING \tDriver 7: Lap timing integrity check failed for 1 lap(s)\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['33', '44', '11', '16', '3', '77', '55', '4', '22', '5', '99', '18', '7', '63', '6', '47', '9', '14', '31', '10']\n", "core INFO \tLoading data for Mexico City Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2021 Mexico City Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['33', '44', '11', '10', '16', '55', '5', '7', '14', '4', '99', '3', '31', '18', '77', '63', '6', '9', '47', '22']\n", "core INFO \tLoading data for São Paulo Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2021 São Paulo Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['44', '33', '77', '11', '16', '55', '10', '31', '14', '4', '5', '7', '63', '99', '22', '6', '9', '47', '3', '18']\n", "core INFO \tLoading data for Bahrain Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2022 Bahrain Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['16', '55', '44', '63', '20', '77', '31', '22', '14', '24', '47', '18', '23', '3', '4', '6', '27', '11', '1', '10']\n", "core INFO \tLoading data for British Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2022 British Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['55', '11', '44', '16', '14', '4', '1', '47', '5', '20', '18', '6', '3', '22', '31', '10', '77', '63', '24', '23']\n", "events WARNING \tCorrecting user input 'United States Grand Prix' to 'United States Grand Prix'\n", "core INFO \tLoading data for United States Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2022 United States Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '44', '16', '11', '63', '4', '14', '5', '20', '22', '31', '24', '23', '10', '47', '3', '6', '18', '77', '55']\n", "core INFO \tLoading data for Mexico City Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2022 Mexico City Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '44', '11', '63', '55', '16', '3', '31', '4', '77', '10', '23', '24', '5', '18', '47', '20', '6', '14', '22']\n", "core INFO \tLoading data for São Paulo Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2022 São Paulo Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['63', '44', '55', '16', '14', '1', '11', '31', '77', '18', '5', '24', '47', '10', '23', '6', '22', '4', '20', '3']\n", "core INFO \tLoading data for Bahrain Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2023 Bahrain Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']\n", "core INFO \tLoading data for British Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2023 British Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '4', '44', '81', '63', '11', '14', '23', '16', '55', '2', '77', '27', '18', '24', '22', '21', '10', '20', '31']\n", "events WARNING \tCorrecting user input 'United States Grand Prix' to 'United States Grand Prix'\n", "core INFO \tLoading data for United States Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2023 United States Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '4', '55', '11', '63', '10', '18', '22', '23', '2', '27', '77', '24', '20', '3', '14', '81', '31', '44', '16']\n", "core INFO \tLoading data for Mexico City Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2023 Mexico City Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '44', '16', '55', '4', '63', '3', '81', '23', '31', '10', '22', '27', '24', '77', '2', '18', '14', '20', '11']\n", "core INFO \tLoading data for São Paulo Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2023 São Paulo Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '4', '14', '11', '18', '55', '10', '44', '22', '31', '2', '27', '3', '81', '63', '77', '24', '20', '23', '16']\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2024 Bahrain Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "core INFO \tLoading data for Bahrain Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n", "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']\n", "core INFO \tLoading data for British Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2024 British Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['44', '1', '4', '81', '55', '27', '18', '14', '23', '22', '2', '20', '3', '16', '77', '31', '11', '24', '63', '10']\n", "events WARNING \tCorrecting user input 'United States Grand Prix' to 'United States Grand Prix'\n", "core INFO \tLoading data for United States Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2024 United States Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['16', '55', '1', '4', '81', '63', '11', '27', '30', '43', '20', '10', '14', '22', '18', '23', '77', '31', '24', '44']\n", "core INFO \tLoading data for Mexico City Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2024 Mexico City Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['55', '4', '16', '44', '63', '1', '20', '81', '27', '10', '18', '43', '31', '77', '24', '30', '11', '14', '23', '22']\n", "core INFO \tLoading data for São Paulo Grand Prix - Race [v3.4.4]\n", "req INFO \tUsing cached data for session_info\n", "req INFO \tUsing cached data for driver_info\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing 2024 São Paulo Grand Prix - Race\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "req INFO \tUsing cached data for session_status_data\n", "req INFO \tUsing cached data for lap_count\n", "req INFO \tUsing cached data for track_status_data\n", "req INFO \tUsing cached data for _extended_timing_data\n", "req INFO \tUsing cached data for timing_app_data\n", "core INFO \tProcessing timing data...\n", "core WARNING \tNo lap data for driver 23\n", "core WARNING \tFailed to perform lap accuracy check - all laps marked as inaccurate (driver 23)\n", "req INFO \tUsing cached data for car_data\n", "req INFO \tUsing cached data for position_data\n", "req INFO \tUsing cached data for weather_data\n", "req INFO \tUsing cached data for race_control_messages\n", "core INFO \tFinished loading data for 20 drivers: ['1', '31', '10', '63', '16', '4', '22', '81', '30', '44', '11', '50', '77', '14', '24', '55', '43', '23', '18', '27']\n" ] } ], "source": [ "# Get data from FastF1 API\n", "\n", "# Data containers\n", "weather_data_list = []\n", "lap_data_list = []\n", "\n", "# Loop through years and sessions\n", "for year in years:\n", " for event_name in events: \n", " for session_name in sessions:\n", " try:\n", " print(f\"Processing {year} {event_name} - {session_name}\")\n", " \n", " # Load the session\n", " session = fastf1.get_session(year, event_name, session_name, backend='fastf1')\n", " session.load()\n", " \n", " # Process weather data\n", " weather_data = session.weather_data\n", " if weather_data is not None:\n", " weather_df = pd.DataFrame(weather_data)\n", " # Add context columns\n", " weather_df['Year'] = year\n", " weather_df['Event'] = event_name\n", " weather_df['Session'] = session_name\n", " weather_data_list.append(weather_df)\n", "\n", " # Process lap data\n", " lap_data = session.laps\n", " if lap_data is not None:\n", " lap_df = pd.DataFrame(lap_data)\n", " # Add context columns\n", " lap_df['Year'] = year\n", " lap_df['Event'] = event_name\n", " lap_df['Session'] = session_name\n", " # Ensure driver information is included\n", " if 'Driver' not in lap_df.columns:\n", " lap_df['Driver'] = lap_df['DriverNumber'].map(session.drivers)\n", " # Add team information if available\n", " if 'Team' not in lap_df.columns:\n", " lap_df['Team'] = lap_df['Driver'].map(session.drivers_info['TeamName'])\n", " lap_data_list.append(lap_df)\n", " \n", " except Exception as e:\n", " print(f\"Error with {event_name} {session_name} ({year}): {e}\")\n", "\n", "# Combine data into DataFrames\n", "if weather_data_list:\n", " weather_data_combined = pd.concat(weather_data_list, ignore_index=True)\n", " # Ensure consistent column ordering\n", " weather_cols = ['Time', 'Year', 'Event', 'Session', \n", " 'AirTemp', 'Humidity', 'Pressure', 'Rainfall', \n", " 'TrackTemp', 'WindDirection', 'WindSpeed']\n", " weather_data_combined = weather_data_combined[weather_cols]\n", " \n", "if lap_data_list:\n", " lap_data_combined = pd.concat(lap_data_list, ignore_index=True)\n", " # Ensure consistent column ordering\n", " lap_cols = ['Time', 'Year', 'Event', 'Session', \n", " 'Driver', 'Team', 'LapNumber', 'LapTime',\n", " 'Sector1Time', 'Sector2Time', 'Sector3Time',\n", " 'Compound', 'TyreLife', 'FreshTyre',\n", " 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']\n", " # Only include columns that exist\n", " existing_cols = [col for col in lap_cols if col in lap_data_combined.columns]\n", " lap_data_combined = lap_data_combined[existing_cols]\n", " \n", "# Time conversion\n", "# Function to convert timedelta to datetime\n", "def convert_timedelta_to_datetime(df, base_date='2021-01-01'):\n", " if 'Time' in df.columns:\n", " # Create a base datetime and add the timedelta\n", " base = pd.Timestamp(base_date)\n", " if df['Time'].dtype == 'timedelta64[ns]':\n", " df['Time'] = base + df['Time']\n", " return df\n", "\n", "# Apply conversion to both dataframes\n", "weather_data_combined = convert_timedelta_to_datetime(weather_data_combined)\n", "lap_data_combined = convert_timedelta_to_datetime(lap_data_combined)\n", "\n", "# Remove missing values\n", "weather_data_combined = weather_data_combined.dropna()\n", "lap_data_combined = lap_data_combined.dropna()\n", "\n", "# Create a new column for lap time in seconds\n", "lap_data_combined['LapTime_seconds'] = lap_data_combined['LapTime'].dt.total_seconds()\n", "\n", "# Merge the data\n", "merged_data = pd.merge_asof(\n", " lap_data_combined.sort_values('Time'),\n", " weather_data_combined.sort_values('Time'),\n", " on='Time',\n", " by=['Event', 'Year'], # Match within same event and year\n", " direction='nearest',\n", " tolerance=pd.Timedelta('1 min') # Allow matching within 1 minute\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Feature Engineering\n", "def engineer_features(df):\n", " # Normalize lap times per track\n", " df['NormalizedLapTime'] = df.groupby('Event')['LapTime_seconds'].transform(\n", " lambda x: (x - x.mean()) / x.mean()\n", " )\n", " \n", " # Driver performance metrics (now track-specific)\n", " df['DriverTrackAvg'] = df.groupby(['Driver', 'Event'])['LapTime_seconds'].transform('mean')\n", " df['DriverTrackStd'] = df.groupby(['Driver', 'Event'])['LapTime_seconds'].transform('std')\n", " \n", " # Calculate driver's performance relative to track average\n", " df['DriverTrackPerformance'] = df.groupby(['Event', 'Year'])['LapTime_seconds'].transform(\n", " lambda x: (x - x.mean()) / x.mean()\n", " )\n", " \n", " # Tire performance degradation (exponential, track-specific)\n", " # Different tracks have different tire wear characteristics\n", " df['TyreAgeFactor'] = df.groupby('Event')['TyreLife'].transform(\n", " lambda x: np.exp(-0.02 * x) # Could vary coefficient by track type\n", " )\n", " \n", " # Track evolution (grip improvement, track-specific)\n", " df['TrackEvolution'] = df.groupby(['Event', 'Year'])['LapNumber'].transform(\n", " lambda x: (x - x.min()) / (x.max() - x.min())\n", " )\n", " \n", " # Weather impact (track-specific)\n", " df['TempDelta'] = df['TrackTemp'] - df['AirTemp']\n", " \n", " # Fuel effect (track-specific due to different fuel consumption rates)\n", " df['FuelEffect'] = df.groupby('Event')['LapNumber'].transform(\n", " lambda x: 1 - (x / x.max())\n", " )\n", " \n", " return df" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Prepare data for modeling\n", "def prepare_modeling_data(df):\n", " # Engineer features\n", " data = engineer_features(df)\n", " \n", " # Train separate models for each track\n", " track_models = {}\n", " track_results = {}\n", " \n", " for event in data['Event'].unique():\n", " print(f\"\\nProcessing {event}\")\n", " track_data = data[data['Event'] == event].copy()\n", " \n", " # Select features for modeling\n", " feature_columns = [\n", " 'DriverTrackAvg', 'DriverTrackStd',\n", " 'DriverTrackPerformance',\n", " 'TrackTemp', 'AirTemp', 'Humidity', 'WindSpeed',\n", " 'TyreLife', 'TyreAgeFactor', 'TrackEvolution',\n", " 'TempDelta', 'FuelEffect', 'SpeedI1', 'SpeedI2'\n", " ]\n", " \n", " # Create dummy variables for categorical features\n", " track_data = pd.get_dummies(track_data, columns=['Compound'])\n", " feature_columns.extend([col for col in track_data.columns if col.startswith('Compound_')])\n", " \n", " # Split and scale data for this track\n", " X = track_data[feature_columns]\n", " y = track_data['LapTime_seconds']\n", " \n", " X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42\n", " )\n", " \n", " scaler = StandardScaler()\n", " X_train_scaled = scaler.fit_transform(X_train)\n", " X_test_scaled = scaler.transform(X_test)\n", " \n", " # Train models for this track\n", " models = {\n", " 'Linear Regression': LinearRegression(),\n", " 'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),\n", " 'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42),\n", " 'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)\n", " }\n", " \n", " track_results[event] = {}\n", " for name, model in models.items():\n", " # Train model\n", " model.fit(X_train_scaled, y_train)\n", " \n", " # Make predictions\n", " y_pred = model.predict(X_test_scaled)\n", " \n", " # Calculate metrics\n", " mse = mean_squared_error(y_test, y_pred)\n", " rmse = np.sqrt(mse)\n", " r2 = r2_score(y_test, y_pred)\n", " \n", " track_results[event][name] = {\n", " 'RMSE': rmse,\n", " 'R2': r2,\n", " 'model': model,\n", " 'scaler': scaler,\n", " 'features': feature_columns\n", " }\n", " \n", " # Print results for this track\n", " print(f\"\\nResults for {event}:\")\n", " for name, metrics in track_results[event].items():\n", " print(f\"{name}:\")\n", " print(f\"RMSE: {metrics['RMSE']:.2f} seconds\")\n", " print(f\"R2 Score: {metrics['R2']:.3f}\")\n", " \n", " return track_results" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Processing Bahrain Grand Prix\n", "\n", "Results for Bahrain Grand Prix:\n", "Linear Regression:\n", "RMSE: 1.10 seconds\n", "R2 Score: 0.981\n", "Random Forest:\n", "RMSE: 0.30 seconds\n", "R2 Score: 0.999\n", "XGBoost:\n", "RMSE: 0.32 seconds\n", "R2 Score: 0.998\n", "Gradient Boosting:\n", "RMSE: 0.42 seconds\n", "R2 Score: 0.997\n", "\n", "Processing Mexico City Grand Prix\n", "\n", "Results for Mexico City Grand Prix:\n", "Linear Regression:\n", "RMSE: 0.50 seconds\n", "R2 Score: 0.998\n", "Random Forest:\n", "RMSE: 0.28 seconds\n", "R2 Score: 0.999\n", "XGBoost:\n", "RMSE: 0.39 seconds\n", "R2 Score: 0.999\n", "Gradient Boosting:\n", "RMSE: 0.25 seconds\n", "R2 Score: 0.999\n", "\n", "Processing United States Grand Prix\n", "\n", "Results for United States Grand Prix:\n", "Linear Regression:\n", "RMSE: 0.40 seconds\n", "R2 Score: 0.996\n", "Random Forest:\n", "RMSE: 0.39 seconds\n", "R2 Score: 0.997\n", "XGBoost:\n", "RMSE: 0.47 seconds\n", "R2 Score: 0.995\n", "Gradient Boosting:\n", "RMSE: 0.30 seconds\n", "R2 Score: 0.998\n", "\n", "Processing British Grand Prix\n", "\n", "Results for British Grand Prix:\n", "Linear Regression:\n", "RMSE: 0.73 seconds\n", "R2 Score: 0.996\n", "Random Forest:\n", "RMSE: 0.56 seconds\n", "R2 Score: 0.998\n", "XGBoost:\n", "RMSE: 0.46 seconds\n", "R2 Score: 0.998\n", "Gradient Boosting:\n", "RMSE: 0.50 seconds\n", "R2 Score: 0.998\n", "\n", "Processing São Paulo Grand Prix\n", "\n", "Results for São Paulo Grand Prix:\n", "Linear Regression:\n", "RMSE: 0.81 seconds\n", "R2 Score: 0.995\n", "Random Forest:\n", "RMSE: 0.52 seconds\n", "R2 Score: 0.998\n", "XGBoost:\n", "RMSE: 0.56 seconds\n", "R2 Score: 0.998\n", "Gradient Boosting:\n", "RMSE: 0.59 seconds\n", "R2 Score: 0.997\n" ] } ], "source": [ "# Execute modeling pipeline\n", "track_results = prepare_modeling_data(merged_data)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Visualize results across tracks\n", "def plot_rmse(track_results):\n", " # Prepare data for plotting\n", " comparison_data = []\n", " for track, models in track_results.items():\n", " for model_name, metrics in models.items():\n", " comparison_data.append({\n", " 'Track': track,\n", " 'Model': model_name,\n", " 'RMSE': metrics['RMSE'],\n", " })\n", " \n", " comparison_df = pd.DataFrame(comparison_data)\n", " \n", " # Plot RMSE comparison\n", " plt.figure(figsize=(15, 6))\n", " sns.barplot(data=comparison_df, x='Track', y='RMSE', hue='Model')\n", " plt.title('Model Performance (RMSE) Across Different Tracks')\n", " plt.xticks(rotation=45)\n", " plt.tight_layout()\n", " plt.show()\n", "\n", "def plot_r2(track_results):\n", " # Prepare data for plotting\n", " comparison_data = []\n", " for track, models in track_results.items():\n", " for model_name, metrics in models.items():\n", " comparison_data.append({\n", " 'Track': track,\n", " 'Model': model_name,\n", " 'R2': metrics['R2'],\n", " })\n", " \n", " comparison_df = pd.DataFrame(comparison_data)\n", " \n", " # Plot R² comparison\n", " plt.figure(figsize=(15, 6))\n", " sns.barplot(data=comparison_df, x='Track', y='R2', hue='Model')\n", " plt.title('Model Performance (R²) Across Different Tracks')\n", " plt.xticks(rotation=45)\n", " plt.tight_layout()\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Plot RMSE comparison across tracks\n", "plot_rmse(track_results)\n", "\n", "# Plot R2 comparison across tracks\n", "plot_r2(track_results)" ] } ], "metadata": { "kernelspec": { "display_name": "csci349", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 2 }