|
| 1 | +import dagster as dg |
| 2 | +import polars as pl |
| 3 | +import requests |
| 4 | +from bs4 import BeautifulSoup |
| 5 | + |
def seasons(year: int) -> list[dict]:
    """Scrape the Limitless Labs landing page for one season's tournaments.

    Fetches ``https://labs.limitlesstcg.com/``, locates the first ``<h2>``
    whose text contains ``year`` and reads every anchor with an ``href`` in
    the ``<ul>`` that follows that heading.

    Args:
        year: Season year to look for in the ``<h2>`` headings.

    Returns:
        One dict per tournament anchor with the keys ``name`` (first text
        fragment of the anchor), ``date`` (last text fragment) and ``link``
        (the anchor's href prefixed with the site root).

    Raises:
        requests.HTTPError: If the landing page answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        ValueError: If no heading for ``year``, or no list after it, is found.
    """
    base_url = "https://labs.limitlesstcg.com"

    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(f"{base_url}/", timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    season_header = soup.find("h2", string=lambda text: text and f"{year}" in text)
    if season_header is None:
        raise ValueError(f"No season heading containing {year} found at {base_url}/")

    tournament_list = season_header.find_next_sibling("ul")
    if tournament_list is None:
        raise ValueError(f"No tournament list follows the {year} season heading")

    tournaments = []
    for anchor in tournament_list.find_all("a", href=True):
        parts = list(anchor.stripped_strings)
        if not parts:
            # An anchor without any text has neither a name nor a date to record.
            continue
        tournaments.append(
            {
                "name": parts[0],
                "date": parts[-1],
                "link": f"{base_url}{anchor['href']}",
            }
        )

    return tournaments
| 24 | + |
| 25 | + |
# Final column names, in the order the <td> cells are read from each row.
# The last cell is captured under a placeholder name and dropped afterwards.
_STANDINGS_COLUMNS = (
    "rank",
    "name",
    "country",
    "points",
    "record",
    "opp_win_percent",
    "opp_opp_win_percent",
    "deck",
    "decklist",
    "unknown",
)


def _standings_cell_value(index: int, td) -> str:
    """Return the string stored for the ``index``-th ``<td>`` of a standings row."""
    if index == 2:  # Country column: value comes from the image's alt/title
        img = td.find("img")
        if img is None:
            return ""
        return img.get("alt") or img.get("title") or ""

    if index == 7:  # Deck column: alt texts of the "pokemon" images, joined by "/"
        images = td.find_all("img", class_="pokemon")
        return "/".join(alt for img in images if (alt := img.get("alt")))

    if index == 8:  # Decklist column: href of the first link, made absolute
        link = td.find("a")
        href = link.get("href", "") if link is not None else ""
        return f"https://labs.limitlesstcg.com{href}" if href else ""

    return td.get_text(strip=True)


def build_standings(tournament: dict) -> pl.DataFrame | None:
    """Scrape the standings table of one tournament into a DataFrame.

    Args:
        tournament: Dict with at least a ``link`` key holding the URL of the
            tournament page; the second-to-last path segment of that URL is
            used as the tournament id.

    Returns:
        A DataFrame with the columns ``rank``, ``name``, ``country``,
        ``points``, ``record``, ``opp_win_percent``, ``opp_opp_win_percent``,
        ``deck``, ``decklist`` and ``tournament_id`` (``rank`` and ``points``
        cast to ``Int16``), or ``None`` when the page is unavailable, has no
        ``data-table striped`` table, or the table has no data rows.

    Raises:
        requests.Timeout: If the server does not answer within the timeout.
    """
    # Strip a trailing slash first so it cannot shift the id segment.
    tournament_id = tournament["link"].rstrip("/").split("/")[-2]

    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(tournament["link"], timeout=30)
    if not response.ok:
        # Treat an error page like a page without standings.
        return None
    soup = BeautifulSoup(response.content, "html.parser")

    table = soup.find("table", class_="data-table striped")
    if table is None:
        return None

    # html.parser does not insert a missing <tbody>, so fall back to the table.
    body = table.find("tbody")
    if body is None:
        body = table

    rows = []
    for tr in body.find_all("tr"):
        cells = tr.find_all("td")
        # Rows with no <td> or a single <td> carry no standings data.
        if len(cells) <= 1:
            continue
        rows.append([_standings_cell_value(i, td) for i, td in enumerate(cells)])

    if not rows:
        # Nothing to build a typed frame from; report it as "no standings".
        return None

    df = pl.DataFrame(rows, schema=list(_STANDINGS_COLUMNS), orient="row")

    return (
        df.drop("unknown")
        .with_columns(pl.lit(tournament_id).alias("tournament_id"))
        .cast({"rank": pl.Int16, "points": pl.Int16})
    )
| 104 | + |
| 105 | + |
@dg.asset(kinds={"Polars"}, name="create_standings_dataframe")
def create_standings_dataframe() -> pl.DataFrame:
    """Combine the standings of every 2026-season tournament into one frame.

    Calls ``seasons(2026)`` for the tournament list and ``build_standings``
    for each entry, skips tournaments for which no frame is returned, prints
    a row count per loaded tournament and concatenates the collected frames.
    """
    frames = []

    for tournament in seasons(2026):
        standings = build_standings(tournament)

        # Tournaments without a standings frame contribute nothing.
        if standings is None:
            continue

        frames.append(standings)
        print(f"Loaded {standings.shape[0]} rows from {tournament['name']}")

    combined = pl.concat(frames)
    return combined
0 commit comments