-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbumps_import_db.py
More file actions
250 lines (206 loc) · 8.85 KB
/
bumps_import_db.py
File metadata and controls
250 lines (206 loc) · 8.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
from supabase import create_client, Client
import os
from dotenv import load_dotenv
# Load environment variables from the local env file (no-op if the file is missing).
dotenv_path = '.env.local'
load_dotenv(dotenv_path=dotenv_path)

# --- CONFIGURATION ---
TRIAL_MODE = False  # True = print what would be uploaded without writing to the DB
SUPABASE_URL = "https://odwyxdiizeyeznymzwds.supabase.co"
# NOTE(review): env var name looks misspelled ('SUPBASE_SECRETY_KEY') — kept
# as-is because .env.local presumably defines it under the same name; confirm.
SUPABASE_KEY = os.getenv('SUPBASE_SECRETY_KEY')
if SUPABASE_KEY is None:
    # Fail fast with a clear message instead of an opaque TypeError on the slice below.
    raise RuntimeError("SUPBASE_SECRETY_KEY is not set; check .env.local")
print(f"Using Supabase Key: {SUPABASE_KEY[:8]}...")
# --- EVENT NAME MAPPING ---
# Maps a keyword found in a scraped race title to the canonical event name
# stored in the database. Matched by case-insensitive substring search.
# (A duplicate "Newton" key was removed: dict literals silently keep only
# the last occurrence, so the duplicate had no effect but hid the mistake.)
EVENT_NAME_MAP = {
    "Washington": "Mt. Washington",
    "Ascutney": "Mt. Ascutney",
    "Greylock": "Mt. Greylock",
    "Whiteface": "Whiteface Mountain",
    "High Point": "High Point",
    "Kearsarge": "Mt. Kearsarge",
    "Prospect": "Prospect Mountain",
    "Allen": "Allen Clark",
    "Equinox": "Mt. Equinox",
    "Okemo": "Mt. Okemo",
    "Newton": "Newton's Revenge",
    "Kanc": "Crank the Kanc",
    "Wachusett": "Mt. Wachusett",
    "Bird": "Mt. Washington Early Bird"
}
def parse_to_interval(raw_time):
    """Convert a raw scraped clock string into a Postgres-style interval phrase.

    Examples: "1:04:52" -> "1 hours 04 minutes 52 seconds",
    "45:12" -> "45 minutes 12 seconds". Returns None for empty input or
    when a three-field value has a non-numeric leading field.
    """
    if not raw_time:
        return None
    # Strip AM/PM markers (the site sometimes appends them) and whitespace.
    normalized = raw_time.upper().replace('AM', '').replace('PM', '').strip()
    fields = normalized.split(':')
    try:
        if len(fields) == 3:
            first, second, third = fields
            # A leading field of 5 or more is really minutes: MM:SS:ms,
            # not H:MM:SS (no hillclimb here takes 5+ hours).
            if int(first) >= 5:
                return f"{first} minutes {second}.{third.replace('.', '')} seconds"
            return f"{first} hours {second} minutes {third} seconds"
        if len(fields) == 2:
            mins, secs = fields
            return f"{mins} minutes {secs} seconds"
        # No colons at all: pass the cleaned value through unchanged.
        return normalized
    except ValueError:
        return None
def get_canonical_name(raw_name):
    """Map a scraped race title to its canonical event name.

    Does a case-insensitive substring search of each EVENT_NAME_MAP keyword
    against raw_name; the first keyword in dict insertion order wins.
    Titles matching no keyword pass through unchanged.

    NOTE(review): match order matters — a title containing both "Washington"
    and "Bird" resolves to whichever keyword appears first in EVENT_NAME_MAP;
    verify the map's ordering gives the intended result for compound titles.
    """
    lowered = raw_name.lower()
    for keyword, canonical in EVENT_NAME_MAP.items():
        if keyword.lower() in lowered:
            return canonical
    return raw_name
def fetch_race_details(race_id):
    """
    Fetch the road-results.com page for one race and extract its date and
    per-rider finish times.

    Parameters:
        race_id: numeric road-results.com race identifier.

    Returns:
        (race_date, rider_times) where race_date is "YYYY-MM-DD" or None if
        the date could not be parsed, and rider_times maps rider id (int) to
        the raw time string shown on the page. Any fetch/parse failure is
        logged and yields (None, {}) — a bad page must not abort the import.
    """
    url = f"https://www.road-results.com/race/{race_id}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    rider_times = {}
    race_date = None
    # Hardcoded zero-based index for the 6th column (the time column).
    TIME_COL_IDX = 5
    try:
        # timeout= prevents one dead server from hanging the whole import run.
        resp = requests.get(url, headers=headers, timeout=30)
        # Route HTTP errors into the best-effort handler below instead of
        # silently parsing an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        # 1. Extract Date from the results title, e.g. "Race Name • Aug 17 2024".
        title_div = soup.find('div', id='resultstitle')
        if title_div:
            # Only direct text children; nested tags hold unrelated markup.
            direct_text = "".join([t for t in title_div.contents if isinstance(t, str)]).strip()
            parts = direct_text.split('•')
            if len(parts) >= 2:
                raw_date_str = re.sub(r'\s+', ' ', parts[1].strip())
                # The site uses both abbreviated and full month names.
                for fmt in ("%b %d %Y", "%B %d %Y"):
                    try:
                        race_date = datetime.strptime(raw_date_str, fmt).strftime("%Y-%m-%d")
                        break
                    except ValueError:
                        continue
        # 2. Extract Times
        # We look for rows with id starting with 'r' (e.g., id="r118892")
        rows = soup.find_all('tr', id=re.compile(r'^r\d+'))
        for row in rows:
            rider_id = int(row['id'][1:])  # Remove the 'r'
            cols = row.find_all('td')
            if len(cols) > TIME_COL_IDX:
                time_val = cols[TIME_COL_IDX].text.strip()
                if time_val:
                    rider_times[rider_id] = time_val
    except Exception as e:
        # Best-effort boundary: log and return empty so the caller continues.
        print(f" Error fetching details for race {race_id}: {e}")
    return race_date, rider_times
def scrape_and_upload(year: int, gender: str) -> None:
    """Scrape one BUMPS series standings table and upsert it into Supabase.

    Pulls the series results grid for the given year/gender from
    road-results.com, fetches each race's own page for its date and
    per-rider times, then upserts events, races, riders, and results.
    When TRIAL_MODE is True, nothing is written; samples are printed instead.

    Parameters:
        year: season year; used in the series URL and stored on races/results.
        gender: "M" or "W"; selects the series and is stored on riders.
    """
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
    # Series code looks like "B24_M": "B" + two-digit year + gender.
    url = f"https://www.road-results.com/?n=results&sn=bumps&iframe=0&y={year}&series=B{str(year)[-2:]}_{gender}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # The standings grid is the second 'datatable1' table on the page.
    table = soup.find_all('table', {'class': 'datatable1'})[1]
    rows = table.find_all('tr')
    header_cells = rows[0].find_all(['th', 'td'])
    events_to_upsert = []
    # Maps the grid column index of each race column -> race metadata dict.
    race_column_map = {}
    # This will store {race_id: {rider_id: time_string}}
    race_time_lookups = {}
    current_idx = 0
    # Process Headers: each race column header links to /race/<id>.
    for cell in header_cells:
        span = int(cell.get('colspan', 1))
        link = cell.find('a')
        # Skip non-race columns and the series "total" column.
        if link and '/race/' in link.get('href', '') and "total" not in cell.text.lower():
            race_id = int(link['href'].split('/')[-1].split('#')[0])
            raw_name = link.get('title', cell.text.strip())
            event_name = get_canonical_name(raw_name)
            # Fetch date AND times for this race
            print(f"Fetching details for: {event_name} ({race_id})...")
            event_date, times_map = fetch_race_details(race_id)
            race_time_lookups[race_id] = times_map
            events_to_upsert.append({"name": event_name})
            race_column_map[current_idx] = {
                "race_id": race_id,
                "event_name": event_name,
                "event_date": event_date
            }
        # Advance by colspan so data-row cell indices line up with headers.
        current_idx += span
    # Handle Events (Same as before)
    if not TRIAL_MODE:
        supabase.table("events").upsert(events_to_upsert, on_conflict="name").execute()
        # Re-read to learn the DB-assigned event ids needed by the races upsert.
        db_events = supabase.table("events").select("id, name").execute()
        event_name_to_id = {item['name']: item['id'] for item in db_events.data}
    else:
        # Dummy id so trial-mode sample output is still populated.
        event_name_to_id = {e['name']: 999 for e in events_to_upsert}
    # Process Rows
    races_to_upsert = []
    riders_to_upsert = {}
    results_to_upsert = []
    for row in rows[1:]:
        cols = row.find_all('td')
        if len(cols) < 3: continue
        # Rider name cell links to the rider's page; href ends in the rider id.
        name_link = cols[2].find('a')
        if not name_link: continue
        r_id = int(name_link['href'].split('/')[-1])
        # NOTE(review): the guard checks find('span') but then re-queries with
        # class 'category' — a span without that class would make this raise
        # AttributeError. Confirm the markup always pairs them.
        age_text = cols[1].find('span', {'class': 'category'}).text.strip() if cols[1].find('span') else "0"
        riders_to_upsert[r_id] = {
            "id": r_id,
            # Collapse non-breaking spaces the site uses in names.
            "name": name_link.text.strip().replace('\xa0', ' '),
            "gender": gender,
            # Approximate: derived from the rider's age during this season.
            "birth_year": year - int(age_text) if age_text.isdigit() and int(age_text) > 0 else None
        }
        for col_idx, info in race_column_map.items():
            if col_idx < len(cols):
                # Points cell; '*' and '-' are display markers, not data.
                p_text = cols[col_idx].text.strip().replace('*', '').replace('-', '')
                if p_text and p_text not in ['DNS', 'DNF', '']:
                    # Get time from our lookup map
                    race_id = info["race_id"]
                    raw_time = race_time_lookups.get(race_id, {}).get(r_id)
                    actual_time = parse_to_interval(raw_time)
                    results_to_upsert.append({
                        "rider_id": r_id,
                        "race_id": race_id,
                        "points": float(p_text),
                        "race_time": actual_time,
                        "year": year
                    })
    # Prepare Race instances
    for info in race_column_map.values():
        races_to_upsert.append({
            "id": info["race_id"],
            "event_id": event_name_to_id.get(info["event_name"]),
            "year": year,
            "event_date": info["event_date"]
        })
    # Final Execution (Same as before)
    if TRIAL_MODE:
        print(f"[TRIAL] Would upsert {len(races_to_upsert)} races.")
        print("[TRIAL] Sample Race:", races_to_upsert[0] if races_to_upsert else "None")
        print(f"[TRIAL] Would upsert {len(riders_to_upsert)} riders.")
        print("[TRIAL] Sample Rider:", list(riders_to_upsert.values())[0] if riders_to_upsert else "None")
        print(f"[TRIAL] Would upsert {len(results_to_upsert)} results.")
        print(f"[TRIAL] Sample Result with Time: {results_to_upsert[0] if results_to_upsert else 'None'}")
        print("Trial mode enabled; no data was uploaded.")
    else:
        supabase.table("races").upsert(races_to_upsert).execute()
        supabase.table("riders").upsert(list(riders_to_upsert.values())).execute()
        # Batch results to keep each request under payload-size limits.
        for i in range(0, len(results_to_upsert), 500):
            supabase.table("results").upsert(results_to_upsert[i:i+500], on_conflict="rider_id, race_id").execute()
        print("Successfully uploaded all data.")
        try:
            # Refresh reporting materialized views after a successful load.
            response = supabase.rpc('refresh_all_reporting_views_standard').execute()
            print("Successfully triggered refresh.")
        except Exception as e:
            print(f"Error refreshing view: {e}")
if __name__ == "__main__":
    # Seasons with published BUMPS series standings.
    seasons = [2013, 2014, 2015, 2018, 2019, 2021, 2022, 2023, 2024, 2025]
    for season in seasons:
        # Men's and women's standings are published as separate series.
        for field in ("M", "W"):
            scrape_and_upload(season, field)