Skip to content

Commit 36fa1e7

Browse files
committed
feat: implement backfill job
1 parent e01e006 commit 36fa1e7

4 files changed

Lines changed: 193 additions & 0 deletions

File tree

app/modules/competitive/controllers/pro_matches_controller.rb

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,53 @@ def recover_missing
306306
status: :service_unavailable
307307
end
308308

309+
# POST /api/v1/competitive/pro-matches/historical-backfill
# Enqueue the historical backfill job (scraper imports from Leaguepedia → ES,
# then the results are synced into the Rails DB) and respond with 202 Accepted.
# The job itself runs in the background.
#
# The scraper's backfill is resumable — calling this endpoint multiple times
# is safe and will only process new/failed tournaments.
#
# The job is enqueued without arguments, so request parameters do NOT change
# what gets backfilled. `league` only selects which scraper status snapshot is
# looked up and echoed back in the response.
#
# @param league [String] optional — league slug for the status lookup
#   (default from env BACKFILL_LEAGUE, then 'CBLOL'); a blank value falls
#   back to the default instead of being sent to the scraper as ""
def historical_backfill
  league = params[:league].presence || ENV.fetch('BACKFILL_LEAGUE', 'CBLOL')
  job = HistoricalBackfillJob.perform_later

  scraper = ProStaffScraperService.new

  # Best-effort snapshot: the job is already enqueued at this point, so a
  # scraper failure is reported inline rather than failing the request.
  current_status = begin
    scraper.historical_backfill_status(league: league)
  rescue ProStaffScraperService::ScraperError => e
    { error: e.message }
  end

  render json: {
    message: 'Historical backfill job enqueued',
    data: {
      job_id: job.job_id,
      league: league,
      current_status: current_status
    }
  }, status: :accepted
end
337+
338+
# GET /api/v1/competitive/pro-matches/historical-backfill/status
# Check the current progress of the historical backfill on the scraper.
#
# @param league [String] optional — league slug (default from env
#   BACKFILL_LEAGUE, then 'CBLOL'); a blank value falls back to the default
#   instead of being sent to the scraper as ""
#
# Renders the scraper's status payload under `data` on success, or a
# SCRAPER_ERROR body with 503 Service Unavailable when the scraper call
# raises ProStaffScraperService::ScraperError.
def historical_backfill_status
  league = params[:league].presence || ENV.fetch('BACKFILL_LEAGUE', 'CBLOL')
  status = ProStaffScraperService.new.historical_backfill_status(league: league)

  render json: {
    message: 'Backfill status retrieved',
    data: status
  }
rescue ProStaffScraperService::ScraperError => e
  render json: {
    error: { code: 'SCRAPER_ERROR', message: e.message }
  }, status: :service_unavailable
end
355+
309356
# POST /api/v1/competitive/pro-matches/import
310357
# Import a match from PandaScore to our database
311358
def import
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# frozen_string_literal: true
2+
3+
module Competitive
  # Background job that orchestrates the full historical backfill pipeline:
  #
  #   1. Triggers the historical backfill on the ProStaff Scraper (Leaguepedia → ES)
  #   2. Polls the scraper's backfill status until it finishes or times out
  #   3. Enqueues one SyncScraperMatchesJob per organization to sync the indexed
  #      matches from ES into the Rails DB
  #
  # The scraper's backfill is resumable — re-triggering it skips already-completed
  # tournaments. This means the job is safe to run on a schedule (e.g. daily):
  # first runs import the full history (~8-12h for CBLOL), subsequent runs only
  # process new or previously-failed tournaments (minutes).
  #
  # Configuration (environment variables):
  #   BACKFILL_LEAGUE     — league to backfill (default: 'CBLOL')
  #   BACKFILL_MIN_YEAR   — earliest year to import (default: 2013)
  #   BACKFILL_OUR_TEAM   — team name for the sync step (default: 'paiN Gaming')
  #   BACKFILL_SYNC_LIMIT — max matches to sync per run (default: 500)
  #
  # NOTE(review): this job sleeps inside the worker between polls, so it can
  # occupy one worker thread for up to MAX_WAIT_TIME. Re-enqueueing itself with
  # a delay between polls would free the thread — consider as a follow-up.
  #
  # @example Run manually from console
  #   Competitive::HistoricalBackfillJob.perform_later
  #
  # @example Check backfill progress
  #   ProStaffScraperService.new.historical_backfill_status(league: 'CBLOL')
  #
  class HistoricalBackfillJob < ApplicationJob
    queue_as :low_priority

    # The scraper may be temporarily unavailable — retry after 10 minutes.
    # NOTE(review): retry_on only sees errors that escape #perform. Both scraper
    # calls below rescue ScraperError, so if UnavailableError descends from it
    # (hierarchy not visible here — confirm) this retry never triggers.
    retry_on ProStaffScraperService::UnavailableError, wait: 10.minutes, attempts: 3
    discard_on ProStaffScraperService::UnauthorizedError

    # How long to sleep between two status polls.
    POLL_INTERVAL = 5.minutes

    # Maximum time to wait for the scraper backfill to finish before
    # proceeding to the sync step anyway. The scraper's backfill is
    # resumable, so the next scheduled run will pick up where it left off.
    MAX_WAIT_TIME = 6.hours

    # Runs the three pipeline steps in order, then records a heartbeat.
    # All settings come from the environment; the job takes no arguments.
    #
    # @raise [ProStaffScraperService::UnauthorizedError] re-raised from the
    #   scraper calls so that `discard_on` can drop the job
    def perform
      league = ENV.fetch('BACKFILL_LEAGUE', 'CBLOL')
      min_year = ENV.fetch('BACKFILL_MIN_YEAR', '2013').to_i
      our_team = ENV.fetch('BACKFILL_OUR_TEAM', 'paiN Gaming')
      sync_limit = ENV.fetch('BACKFILL_SYNC_LIMIT', '500').to_i

      scraper = ProStaffScraperService.new

      trigger_backfill(scraper, league: league, min_year: min_year)
      wait_for_backfill(scraper, league: league)
      enqueue_sync_jobs(league: league, our_team: our_team, limit: sync_limit)

      record_job_heartbeat

      Rails.logger.info('[HistoricalBackfillJob] Done.')
    end

    private

    # Step 1: ask the scraper to start (or resume) the backfill. Any other
    # scraper failure is logged and ignored on purpose: the scraper may already
    # be running, and the sync step is still useful.
    def trigger_backfill(scraper, league:, min_year:)
      Rails.logger.info(
        "[HistoricalBackfillJob] Triggering backfill on scraper: " \
        "league=#{league} min_year=#{min_year}"
      )

      trigger_result = scraper.trigger_historical_backfill(
        league: league,
        min_year: min_year
      )
      Rails.logger.info(
        "[HistoricalBackfillJob] Scraper responded: #{trigger_result.inspect}"
      )
    rescue ProStaffScraperService::UnauthorizedError
      # Must come before the generic rescue: bad credentials will not fix
      # themselves, so let `discard_on` handle it instead of carrying on.
      raise
    rescue ProStaffScraperService::ScraperError => e
      Rails.logger.warn(
        "[HistoricalBackfillJob] Scraper trigger failed: #{e.message}. " \
        "Proceeding to sync step (scraper may already be running)."
      )
    end

    # Step 2: poll the scraper every POLL_INTERVAL until it reports nothing
    # remaining, or until MAX_WAIT_TIME has elapsed. Failed polls are logged
    # and retried on the next tick.
    def wait_for_backfill(scraper, league:)
      Rails.logger.info(
        "[HistoricalBackfillJob] Polling backfill status (max #{MAX_WAIT_TIME / 60}min)..."
      )

      started_at = Time.current
      last_status = nil

      loop do
        if Time.current - started_at > MAX_WAIT_TIME
          Rails.logger.warn(
            "[HistoricalBackfillJob] Max wait time exceeded (#{MAX_WAIT_TIME / 3600}h). " \
            "Proceeding to sync step. Last status: #{last_status.inspect}"
          )
          break
        end

        begin
          last_status = scraper.historical_backfill_status(league: league)
          # NOTE(review): a payload without 'remaining' is treated as
          # "nothing left" and ends the wait — confirm against the scraper API.
          remaining = last_status['remaining'] || 0
          completed = last_status['completed'] || 0
          total = last_status['total_tournaments'] || 0

          Rails.logger.info(
            "[HistoricalBackfillJob] Progress: #{completed}/#{total} tournaments " \
            "(#{remaining} remaining)"
          )

          # If nothing is pending/in-progress, the backfill is done.
          break if remaining == 0
        rescue ProStaffScraperService::UnauthorizedError
          # Polling with rejected credentials for hours is pointless; surface
          # the error so `discard_on` applies.
          raise
        rescue ProStaffScraperService::ScraperError => e
          Rails.logger.warn(
            "[HistoricalBackfillJob] Status poll failed: #{e.message}"
          )
        end

        sleep POLL_INTERVAL
      end
    end

    # Step 3: enqueue one SyncScraperMatchesJob per organization. The sync
    # itself runs in those jobs, not here.
    def enqueue_sync_jobs(league:, our_team:, limit:)
      Rails.logger.info(
        "[HistoricalBackfillJob] Starting sync step: " \
        "league=#{league} our_team=#{our_team} limit=#{limit}"
      )

      Organization.find_each do |org|
        Rails.logger.info(
          "[HistoricalBackfillJob] Syncing for org=#{org.id} (#{org.name})"
        )
        SyncScraperMatchesJob.perform_later(
          org.id,
          league: league,
          our_team: our_team,
          limit: limit
        )
      end
    end
  end
end

config/routes.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,8 @@
281281
post 'sync-from-leaguepedia', action: :sync_from_leaguepedia
282282
get 'diagnose-missing', action: :diagnose_missing
283283
post 'recover-missing', action: :recover_missing
284+
post 'historical-backfill', action: :historical_backfill
285+
get 'historical-backfill/status', action: :historical_backfill_status
284286
end
285287
end
286288

config/sidekiq.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,16 @@
3535
class: Analytics::RefreshMetadataViewsJob
3636
description: 'Refresh materialized views for database metadata (table privileges, extensions, policies)'
3737

38+
# Historical backfill: triggers scraper backfill (Leaguepedia → ES) then
39+
# syncs new matches into Rails DB. Resumable — only processes pending
40+
# tournaments. The scraper's first full import takes ~8-12h; the job waits at
41+
# most 6h per run, so it can span runs. Later runs take minutes. Daily at 4 AM UTC.
42+
historical_backfill:
43+
cron: '0 4 * * *'
44+
class: Competitive::HistoricalBackfillJob
45+
queue: low_priority
46+
description: 'Trigger historical backfill on scraper and sync matches to Rails DB'
47+
3848
# Additional scheduled jobs can be added here
3949
# Example:
4050
# sync_riot_data:

0 commit comments

Comments
 (0)