-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbumps_import_db.py
More file actions
250 lines (206 loc) · 8.85 KB
/
bumps_import_db.py
File metadata and controls
250 lines (206 loc) · 8.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
from supabase import create_client, Client
import os
from dotenv import load_dotenv
# Load environment variables from the local env file (no-op if the file is missing).
dotenv_path = '.env.local'
load_dotenv(dotenv_path=dotenv_path)

# --- CONFIGURATION ---
TRIAL_MODE = False  # True = print what would be uploaded without writing to the DB
SUPABASE_URL = "https://odwyxdiizeyeznymzwds.supabase.co"
# NOTE(review): env var name looks misspelled ('SUPBASE_SECRETY_KEY') — kept
# as-is because .env.local presumably defines it under the same name; confirm.
SUPABASE_KEY = os.getenv('SUPBASE_SECRETY_KEY')
if SUPABASE_KEY is None:
    # Fail fast with a clear message instead of an opaque TypeError on the slice below.
    raise RuntimeError("SUPBASE_SECRETY_KEY is not set; check .env.local")
print(f"Using Supabase Key: {SUPABASE_KEY[:8]}...")
# --- EVENT NAME MAPPING ---
# Maps a keyword found in a scraped race title to the canonical event name
# stored in the database. Matched by case-insensitive substring search.
# (A duplicate "Newton" key was removed: dict literals silently keep only
# the last occurrence, so the duplicate had no effect but hid the mistake.)
EVENT_NAME_MAP = {
    "Washington": "Mt. Washington",
    "Ascutney": "Mt. Ascutney",
    "Greylock": "Mt. Greylock",
    "Whiteface": "Whiteface Mountain",
    "High Point": "High Point",
    "Kearsarge": "Mt. Kearsarge",
    "Prospect": "Prospect Mountain",
    "Allen": "Allen Clark",
    "Equinox": "Mt. Equinox",
    "Okemo": "Mt. Okemo",
    "Newton": "Newton's Revenge",
    "Kanc": "Crank the Kanc",
    "Wachusett": "Mt. Wachusett",
    "Bird": "Mt. Washington Early Bird"
}
def parse_to_interval(raw_time):
    """Convert a raw scraped clock string into a Postgres-style interval phrase.

    Examples: "1:04:52" -> "1 hours 04 minutes 52 seconds",
    "45:12" -> "45 minutes 12 seconds". Returns None for empty input or
    when a three-field value has a non-numeric leading field.
    """
    if not raw_time:
        return None
    # Strip AM/PM markers (the site sometimes appends them) and whitespace.
    normalized = raw_time.upper().replace('AM', '').replace('PM', '').strip()
    fields = normalized.split(':')
    try:
        if len(fields) == 3:
            first, second, third = fields
            # A leading field of 5 or more is really minutes: MM:SS:ms,
            # not H:MM:SS (no hillclimb here takes 5+ hours).
            if int(first) >= 5:
                return f"{first} minutes {second}.{third.replace('.', '')} seconds"
            return f"{first} hours {second} minutes {third} seconds"
        if len(fields) == 2:
            mins, secs = fields
            return f"{mins} minutes {secs} seconds"
        # No colons at all: pass the cleaned value through unchanged.
        return normalized
    except ValueError:
        return None
def get_canonical_name(raw_name):
    """Map a scraped race title to its canonical event name.

    Does a case-insensitive substring search of each EVENT_NAME_MAP keyword
    against raw_name; the first keyword in dict insertion order wins.
    Titles matching no keyword pass through unchanged.

    NOTE(review): match order matters — a title containing both "Washington"
    and "Bird" resolves to whichever keyword appears first in EVENT_NAME_MAP;
    verify the map's ordering gives the intended result for compound titles.
    """
    lowered = raw_name.lower()
    for keyword, canonical in EVENT_NAME_MAP.items():
        if keyword.lower() in lowered:
            return canonical
    return raw_name
def fetch_race_details(race_id):
    """
    Fetch the road-results.com page for one race and extract its date and
    per-rider finish times.

    Parameters:
        race_id: numeric road-results.com race identifier.

    Returns:
        (race_date, rider_times) where race_date is "YYYY-MM-DD" or None if
        the date could not be parsed, and rider_times maps rider id (int) to
        the raw time string shown on the page. Any fetch/parse failure is
        logged and yields (None, {}) — a bad page must not abort the import.
    """
    url = f"https://www.road-results.com/race/{race_id}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    rider_times = {}
    race_date = None
    # Hardcoded zero-based index for the 6th column (the time column).
    TIME_COL_IDX = 5
    try:
        # timeout= prevents one dead server from hanging the whole import run.
        resp = requests.get(url, headers=headers, timeout=30)
        # Route HTTP errors into the best-effort handler below instead of
        # silently parsing an error page.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        # 1. Extract Date from the results title, e.g. "Race Name • Aug 17 2024".
        title_div = soup.find('div', id='resultstitle')
        if title_div:
            # Only direct text children; nested tags hold unrelated markup.
            direct_text = "".join([t for t in title_div.contents if isinstance(t, str)]).strip()
            parts = direct_text.split('•')
            if len(parts) >= 2:
                raw_date_str = re.sub(r'\s+', ' ', parts[1].strip())
                # The site uses both abbreviated and full month names.
                for fmt in ("%b %d %Y", "%B %d %Y"):
                    try:
                        race_date = datetime.strptime(raw_date_str, fmt).strftime("%Y-%m-%d")
                        break
                    except ValueError:
                        continue
        # 2. Extract Times
        # We look for rows with id starting with 'r' (e.g., id="r118892")
        rows = soup.find_all('tr', id=re.compile(r'^r\d+'))
        for row in rows:
            rider_id = int(row['id'][1:])  # Remove the 'r'
            cols = row.find_all('td')
            if len(cols) > TIME_COL_IDX:
                time_val = cols[TIME_COL_IDX].text.strip()
                if time_val:
                    rider_times[rider_id] = time_val
    except Exception as e:
        # Best-effort boundary: log and return empty so the caller continues.
        print(f" Error fetching details for race {race_id}: {e}")
    return race_date, rider_times
def scrape_and_upload(year: int, gender: str) -> None:
    """Scrape one BUMPS series standings table and upsert it into Supabase.

    Pulls the series results grid for the given year/gender from
    road-results.com, fetches each race's own page for its date and
    per-rider times, then upserts events, races, riders, and results.
    When TRIAL_MODE is True, nothing is written; samples are printed instead.

    Parameters:
        year: season year; used in the series URL and stored on races/results.
        gender: "M" or "W"; selects the series and is stored on riders.
    """
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
    # Series code looks like "B24_M": "B" + two-digit year + gender.
    url = f"https://www.road-results.com/?n=results&sn=bumps&iframe=0&y={year}&series=B{str(year)[-2:]}_{gender}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # The standings grid is the second 'datatable1' table on the page.
    table = soup.find_all('table', {'class': 'datatable1'})[1]
    rows = table.find_all('tr')
    header_cells = rows[0].find_all(['th', 'td'])
    events_to_upsert = []
    # Maps the grid column index of each race column -> race metadata dict.
    race_column_map = {}
    # This will store {race_id: {rider_id: time_string}}
    race_time_lookups = {}
    current_idx = 0
    # Process Headers: each race column header links to /race/<id>.
    for cell in header_cells:
        span = int(cell.get('colspan', 1))
        link = cell.find('a')
        # Skip non-race columns and the series "total" column.
        if link and '/race/' in link.get('href', '') and "total" not in cell.text.lower():
            race_id = int(link['href'].split('/')[-1].split('#')[0])
            raw_name = link.get('title', cell.text.strip())
            event_name = get_canonical_name(raw_name)
            # Fetch date AND times for this race
            print(f"Fetching details for: {event_name} ({race_id})...")
            event_date, times_map = fetch_race_details(race_id)
            race_time_lookups[race_id] = times_map
            events_to_upsert.append({"name": event_name})
            race_column_map[current_idx] = {
                "race_id": race_id,
                "event_name": event_name,
                "event_date": event_date
            }
        # Advance by colspan so data-row cell indices line up with headers.
        current_idx += span
    # Handle Events (Same as before)
    if not TRIAL_MODE:
        supabase.table("events").upsert(events_to_upsert, on_conflict="name").execute()
        # Re-read to learn the DB-assigned event ids needed by the races upsert.
        db_events = supabase.table("events").select("id, name").execute()
        event_name_to_id = {item['name']: item['id'] for item in db_events.data}
    else:
        # Dummy id so trial-mode sample output is still populated.
        event_name_to_id = {e['name']: 999 for e in events_to_upsert}
    # Process Rows
    races_to_upsert = []
    riders_to_upsert = {}
    results_to_upsert = []
    for row in rows[1:]:
        cols = row.find_all('td')
        if len(cols) < 3: continue
        # Rider name cell links to the rider's page; href ends in the rider id.
        name_link = cols[2].find('a')
        if not name_link: continue
        r_id = int(name_link['href'].split('/')[-1])
        # NOTE(review): the guard checks find('span') but then re-queries with
        # class 'category' — a span without that class would make this raise
        # AttributeError. Confirm the markup always pairs them.
        age_text = cols[1].find('span', {'class': 'category'}).text.strip() if cols[1].find('span') else "0"
        riders_to_upsert[r_id] = {
            "id": r_id,
            # Collapse non-breaking spaces the site uses in names.
            "name": name_link.text.strip().replace('\xa0', ' '),
            "gender": gender,
            # Approximate: derived from the rider's age during this season.
            "birth_year": year - int(age_text) if age_text.isdigit() and int(age_text) > 0 else None
        }
        for col_idx, info in race_column_map.items():
            if col_idx < len(cols):
                # Points cell; '*' and '-' are display markers, not data.
                p_text = cols[col_idx].text.strip().replace('*', '').replace('-', '')
                if p_text and p_text not in ['DNS', 'DNF', '']:
                    # Get time from our lookup map
                    race_id = info["race_id"]
                    raw_time = race_time_lookups.get(race_id, {}).get(r_id)
                    actual_time = parse_to_interval(raw_time)
                    results_to_upsert.append({
                        "rider_id": r_id,
                        "race_id": race_id,
                        "points": float(p_text),
                        "race_time": actual_time,
                        "year": year
                    })
    # Prepare Race instances
    for info in race_column_map.values():
        races_to_upsert.append({
            "id": info["race_id"],
            "event_id": event_name_to_id.get(info["event_name"]),
            "year": year,
            "event_date": info["event_date"]
        })
    # Final Execution (Same as before)
    if TRIAL_MODE:
        print(f"[TRIAL] Would upsert {len(races_to_upsert)} races.")
        print("[TRIAL] Sample Race:", races_to_upsert[0] if races_to_upsert else "None")
        print(f"[TRIAL] Would upsert {len(riders_to_upsert)} riders.")
        print("[TRIAL] Sample Rider:", list(riders_to_upsert.values())[0] if riders_to_upsert else "None")
        print(f"[TRIAL] Would upsert {len(results_to_upsert)} results.")
        print(f"[TRIAL] Sample Result with Time: {results_to_upsert[0] if results_to_upsert else 'None'}")
        print("Trial mode enabled; no data was uploaded.")
    else:
        supabase.table("races").upsert(races_to_upsert).execute()
        supabase.table("riders").upsert(list(riders_to_upsert.values())).execute()
        # Batch results to keep each request under payload-size limits.
        for i in range(0, len(results_to_upsert), 500):
            supabase.table("results").upsert(results_to_upsert[i:i+500], on_conflict="rider_id, race_id").execute()
        print("Successfully uploaded all data.")
        try:
            # Refresh reporting materialized views after a successful load.
            response = supabase.rpc('refresh_all_reporting_views_standard').execute()
            print("Successfully triggered refresh.")
        except Exception as e:
            print(f"Error refreshing view: {e}")
if __name__ == "__main__":
    # Seasons with published BUMPS series standings.
    seasons = [2013, 2014, 2015, 2018, 2019, 2021, 2022, 2023, 2024, 2025]
    for season in seasons:
        # Men's and women's standings are published as separate series.
        for field in ("M", "W"):
            scrape_and_upload(season, field)