Skip to content

Commit aa0a049

Browse files
committed
- more logging formatting revisions
- fixed http connection error handling in dotnews pipeline - updated daily script - added a11y functions to web app - added dark/light theme to web app
1 parent 449b747 commit aa0a049

29 files changed

Lines changed: 679 additions & 387 deletions

LICENSE.md

100644 → 100755
File mode changed.

README.md

100644 → 100755
File mode changed.

config.py

100644 → 100755
Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -539,13 +539,13 @@ def print_config_summary(test_connections: bool = False):
539539
log("AI/LLM:")
540540
log(f" Gemini Model: {GEMINI_MODEL}")
541541
log(f" Gemini Embed Model: {GEMINI_EMBED_MODEL}")
542-
log(f" Gemini API Key: {'*' * 10}...{GEMINI_API_KEY[-4:] if GEMINI_API_KEY else 'NOT SET'}\n")
543-
log(f" Processing: LLM Max Workers = {LLM_MAX_WORKERS}")
542+
log(f" Gemini API Key: {'*' * 10}...{GEMINI_API_KEY[-4:] if GEMINI_API_KEY else 'NOT SET'}")
543+
log(f" LLM Max Workers: {LLM_MAX_WORKERS}\n")
544544
log("Google Services:")
545545
log(f" Drive Folder ID: {GOOGLE_DRIVE_FOLDER_ID or 'NOT SET'}")
546-
log(f" Drive Credentials: {GOOGLE_CREDENTIALS_PATH}")
547-
log(f" Gmail Credentials: {GMAIL_CREDENTIALS_PATH}")
548-
log(f" Gmail Token: {GMAIL_TOKEN_PATH} {'✔' if GMAIL_TOKEN_PATH and GMAIL_TOKEN_PATH.exists() else '(not found)'}\n")
546+
log(f" Drive Credentials: {GOOGLE_CREDENTIALS_PATH} {'✔' if GOOGLE_CREDENTIALS_PATH and GOOGLE_CREDENTIALS_PATH.exists() else '✗ Not found'}")
547+
log(f" Gmail Credentials: {GMAIL_CREDENTIALS_PATH} {'✔' if GMAIL_CREDENTIALS_PATH and GMAIL_CREDENTIALS_PATH.exists() else '✗ Not found'}")
548+
log(f" Gmail Token: {GMAIL_TOKEN_PATH} {'✔' if GMAIL_TOKEN_PATH and GMAIL_TOKEN_PATH.exists() else '✗ Not found'}\n")
549549
log("Directories:")
550550
log(f" Vector DB: {VECTORDB_DIR}")
551551
log(f" Data Downloads: {DATA_DOWNLOAD_DIR}\n")

main_chat/data_ingestion/boston_data_sync/boston_data_sync.py

Lines changed: 76 additions & 78 deletions
Large diffs are not rendered by default.

main_chat/data_ingestion/dotnews_downloader/download_pdf.py

Lines changed: 22 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ def extract_date_from_pdf(pdf_content: bytes) -> Optional[date]:
4646
if month:
4747
return date(int(year_str), month, int(day_str))
4848
except Exception as e:
49-
log_warning(f"Could not extract date from PDF: {e}")
49+
log_warning(f"[DOTNEWS] Could not extract date from PDF: {e}")
5050
return None
5151

5252

@@ -86,7 +86,7 @@ def load(cls, path: Path) -> "SyncState":
8686
# Old format: entire file is {renamed_filename: {...}}
8787
# We can't recover original filenames, so start fresh
8888
# but keep track of renamed files to avoid reprocessing
89-
log_info("Migrating legacy sync state format...")
89+
log_info(f"[DOTNEWS] Migrating legacy sync state format...")
9090
return cls(downloaded_files={}, last_sync=data.get("last_sync"))
9191

9292
return cls(downloaded_files=data.get("downloaded_files", {}), last_sync=data.get("last_sync"))
@@ -147,14 +147,14 @@ def generate_renamed_filename(extracted_date: Optional[date], year: int, month:
147147
def list_pdfs_in_month(year: int, month: int) -> list[tuple[str, str]]:
148148
"""List PDF files in a given month's WP uploads directory."""
149149
url = f"{DOTNEWS_BASE_URL}{DOTNEWS_WP_UPLOADS_PATH}/{year}/{month:02d}/"
150-
log_success(f"Downloading from {url}...")
150+
log_success(f"[DOTNEWS] Downloading from {url}...")
151151

152152
try:
153153
response = requests.get(url, timeout=30)
154154
response.raise_for_status()
155155
except requests.RequestException as e:
156-
log_error(f"Error fetching directory listing: {e}")
157-
return []
156+
log_error(f"[DOTNEWS] {e}")
157+
raise
158158

159159
pdfs = []
160160
matches = PDF_HREF_PATTERN.findall(response.text)
@@ -200,11 +200,11 @@ def download_pdf_content(url: str) -> Optional[bytes]:
200200
content = response.content
201201
if not content[:4].startswith(b"%PDF"):
202202
content_type = response.headers.get("Content-Type", "").lower()
203-
log_error(f"Content is not a PDF (Content-Type: {content_type})")
203+
log_error(f"[DOTNEWS] Content is not a PDF (Content-Type: {content_type})")
204204
return None
205205
return content
206206
except requests.RequestException as e:
207-
log_error(f"Error downloading: {e}")
207+
log_error(f"[DOTNEWS] Error downloading: {e}")
208208
return None
209209

210210

@@ -231,50 +231,50 @@ def download_pdfs(output_dir: Optional[Path] = None, start_year: Optional[int] =
231231
sync_state_path = config.DOTNEWS_SYNC_STATE_FILENAME
232232
sync_state = SyncState.load(sync_state_path)
233233

234-
log_debug(f"Output directory: {output_dir}")
235-
log_debug(f"Date range: {start_year}-{start_month:02d} to {end_year}-{end_month:02d}")
236-
log_debug(f"Previously downloaded: {len(sync_state.downloaded_files)} files")
234+
log_debug(f"[DOTNEWS] Output directory: {output_dir}")
235+
log_debug(f"[DOTNEWS] Date range: {start_year}-{start_month:02d} to {end_year}-{end_month:02d}")
236+
log_debug(f"[DOTNEWS] Previously downloaded: {len(sync_state.downloaded_files)} files")
237237

238238
downloaded_paths = []
239239
current = date(start_year, start_month, 1)
240240
end = date(end_year, end_month, 1)
241241

242242
while current <= end:
243243
year, month = current.year, current.month
244-
log_debug(f"Processing {year}-{month:02d}...")
244+
log_debug(f"[DOTNEWS] Processing {year}-{month:02d}...")
245245

246246
pdfs = list_pdfs_in_month(year, month)
247247
if not pdfs:
248-
log_debug(f" No PDFs found for {year}-{month:02d}")
248+
log_debug(f"[DOTNEWS] No PDFs found for {year}-{month:02d}")
249249
else:
250-
log_debug(f" Found {len(pdfs)} PDF(s)")
250+
log_debug(f"[DOTNEWS] Found {len(pdfs)} PDF(s)")
251251

252252
for original_filename, pdf_url in pdfs:
253253
# Check sync state BEFORE downloading - skip if we've seen this filename
254254
if sync_state.is_downloaded(original_filename):
255-
log_debug(f" Skipping {original_filename} (already downloaded)")
255+
log_debug(f"[DOTNEWS] Skipping {original_filename} (already downloaded)")
256256
continue
257257

258258
# Only download files we haven't seen before
259-
log_debug(f" Downloading {original_filename}...")
259+
log_debug(f"[DOTNEWS] Downloading {original_filename}...")
260260
pdf_content = download_pdf_content(pdf_url)
261261
if pdf_content is None:
262262
continue
263263

264264
# Extract date from PDF content for human-readable filename
265265
extracted_date = extract_date_from_pdf(pdf_content)
266266
if extracted_date:
267-
log_debug(f" Extracted date: {extracted_date.isoformat()}")
267+
log_debug(f"[DOTNEWS] Extracted date: {extracted_date.isoformat()}")
268268
else:
269-
log_debug(" Could not extract date, using fallback")
269+
log_debug(f"[DOTNEWS] Could not extract date, using fallback")
270270

271271
renamed_filename = generate_renamed_filename(extracted_date, year, month)
272272
output_path = output_dir / renamed_filename
273273

274274
with open(output_path, "wb") as f:
275275
f.write(pdf_content)
276276

277-
log_debug(f" Saved as: {output_path.name} ({len(pdf_content) / 1024:.1f} KB)")
277+
log_debug(f"[DOTNEWS] Saved as: {output_path.name} ({len(pdf_content) / 1024:.1f} KB)")
278278

279279
# Record original filename -> renamed filename mapping
280280
sync_state.mark_downloaded(original_filename, renamed_filename)
@@ -304,21 +304,21 @@ def download_pdfs(output_dir: Optional[Path] = None, start_year: Optional[int] =
304304
try:
305305
start_year, start_month = map(int, args.start.split("-"))
306306
except ValueError:
307-
log_error(f"Error: Invalid start date format '{args.start}'. Use YYYY-MM.")
307+
log_error(f"[DOTNEWS] Error: Invalid start date format '{args.start}'. Use YYYY-MM.")
308308
sys.exit(1)
309309

310310
end_year, end_month = None, None
311311
if args.end:
312312
try:
313313
end_year, end_month = map(int, args.end.split("-"))
314314
except ValueError:
315-
log_error(f"Error: Invalid end date format '{args.end}'. Use YYYY-MM.")
315+
log_error(f"[DOTNEWS] Error: Invalid end date format '{args.end}'. Use YYYY-MM.")
316316
sys.exit(1)
317317

318318
output_dir = Path(args.output_dir) if args.output_dir else None
319319
results = download_pdfs(output_dir=output_dir, start_year=start_year, start_month=start_month, end_year=end_year, end_month=end_month)
320320

321321
if results:
322-
log_success(f"Downloaded {len(results)} new files.")
322+
log_success(f"[DOTNEWS] Downloaded {len(results)} new files.")
323323
else:
324-
log_success("No new files downloaded.")
324+
log_success(f"[DOTNEWS] No new files downloaded.")

main_chat/data_ingestion/email_to_calendar_sql.py

Lines changed: 35 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ def get_gmail_credentials(interactive: bool = True) -> Credentials:
7878
try:
7979
creds = Credentials.from_authorized_user_file(str(token_path), SCOPES)
8080
except Exception as e:
81-
log_error(f"Could not load existing token: {e}")
81+
log_error(f"[GMAIL] Could not load existing token: {e}")
8282

8383
# If valid credentials exist, return them
8484
if creds and creds.valid:
@@ -89,10 +89,10 @@ def get_gmail_credentials(interactive: bool = True) -> Credentials:
8989
try:
9090
creds.refresh(Request())
9191
token_path.write_text(creds.to_json())
92-
log_debug("✔ Token refreshed successfully")
92+
log_debug(f"[GMAIL] ✔ Token refreshed successfully")
9393
return creds
9494
except Exception as e:
95-
log_error(f"Token refresh failed: {e}")
95+
log_error(f"[GMAIL] Token refresh failed: {e}")
9696
creds = None
9797

9898
# No valid credentials - need user interaction
@@ -109,10 +109,10 @@ def get_gmail_credentials(interactive: bool = True) -> Credentials:
109109
raise AuthenticationRequiredError(auth_url)
110110

111111
# Interactive mode: open browser for user to authorize
112-
log_info("Opening browser for Gmail authorization...")
112+
log_info(f"[GMAIL] Opening browser for Gmail authorization...")
113113
creds = flow.run_local_server(port=8080, access_type="offline", prompt="consent")
114114
token_path.write_text(creds.to_json())
115-
log_success("Authorization complete, token saved")
115+
log_success(f"[GMAIL] Authorization complete, token saved")
116116
return creds
117117

118118

@@ -286,7 +286,7 @@ def extract_events_with_llm(text: str, source: str, publication_date: str = None
286286
return events
287287

288288
except Exception as e:
289-
log_error(f"Error extracting events with LLM: {e}")
289+
log_error(f"[GMAIL] Error extracting events with LLM: {e}")
290290
return []
291291

292292

@@ -349,7 +349,7 @@ def insert_events_to_db(events: List[Dict]) -> int:
349349
)
350350
inserted_count += 1
351351
except Exception as e:
352-
log_error(f"Could not insert event '{event.get('event_name')}': {e}")
352+
log_error(f"[GMAIL] Could not insert event '{event.get('event_name')}': {e}")
353353

354354
conn.commit()
355355
finally:
@@ -374,7 +374,7 @@ def sync_email_newsletters_to_sql(interactive: bool = True) -> dict:
374374
# email_errors = [e for e in errors if "EMAIL" in e.upper()]
375375
# if email_errors:
376376
# for error in email_errors:
377-
# log_debug(f"✗ Configuration error: {error}")
377+
# log_debug(f"[GMAIL] ✗ Configuration error: {error}")
378378
# stats["errors"].append(error)
379379
# return stats
380380

@@ -383,28 +383,26 @@ def sync_email_newsletters_to_sql(interactive: bool = True) -> dict:
383383
processed_ids = state.get("processed_email_ids", [])
384384

385385
# Connect to Gmail API
386-
log_debug("Connecting to Gmail API...")
387386
try:
388387
service = get_gmail_service(interactive=interactive)
389388
except AuthenticationRequiredError as e:
390389
# Non-interactive mode and auth needed
391390
stats["auth_required"] = True
392391
stats["auth_url"] = e.auth_url
393392
error_msg = f"Gmail authentication required. Visit: {e.auth_url}"
394-
log_error(f"{error_msg}")
393+
log_error(f"[GMAIL] {error_msg}")
395394
stats["errors"].append(error_msg)
396395
return stats
397396

398-
log_success("Authenticated with Gmail.")
397+
log_success(f"[GMAIL] Authenticated.")
399398

400399
# Get recent newsletters
401-
log_debug(f"Scanning inbox for newsletters from last {config.EMAIL_LOOKBACK_DAYS} days...")
400+
log_debug(f"[GMAIL] Scanning inbox for newsletters from last {config.EMAIL_LOOKBACK_DAYS} days...")
402401
newsletters = get_recent_newsletters(service, processed_ids, days_back=config.EMAIL_LOOKBACK_DAYS)
403402

404-
log_info(f"Found {len(newsletters)} new newsletters to process.")
403+
log_info(f"[GMAIL] Found {len(newsletters)} new newsletters to process.")
405404

406405
if not newsletters:
407-
log_info("No new newsletters. Exiting.")
408406
return stats
409407

410408
all_events = []
@@ -422,60 +420,58 @@ def sync_email_newsletters_to_sql(interactive: bool = True) -> dict:
422420
except Exception:
423421
pub_date = datetime.now().strftime("%Y-%m-%d")
424422

425-
log_debug(f"[{i}/{len(newsletters)}] Processing: {subject[:60]}...")
423+
log_debug(f"[GMAIL] [{i}/{len(newsletters)}] Processing: {subject[:60]}...")
426424

427425
email_text = extract_text_from_email(msg)
428426
pdf_texts = extract_pdf_attachments(msg)
429427

430428
full_text = email_text
431429
if pdf_texts:
432430
full_text += "\n\n" + "\n\n".join(pdf_texts)
433-
log_debug(f"Found {len(pdf_texts)} PDF attachment(s)")
431+
log_debug(f"[GMAIL] Found {len(pdf_texts)} PDF attachment(s)")
434432

435433
if not full_text.strip():
436-
log_debug("No text content found")
434+
log_debug(f"[GMAIL] No text content found")
437435
continue
438436

439437
events = extract_events_with_llm(full_text, source=f"Email: {subject}", publication_date=pub_date)
440438

441439
if events:
442-
log_debug(f"Extracted {len(events)} events")
440+
log_debug(f"[GMAIL] Extracted {len(events)} events")
443441
all_events.extend(events)
444442
else:
445-
log_debug("No events found")
443+
log_debug(f"[GMAIL] No events found")
446444

447445
processed_ids.append(email_id)
448446
stats["emails_processed"] += 1
449447
stats["events_extracted"] += len(events)
450448

451449
except Exception as e:
452450
error_msg = f"Error processing email {email_id}: {str(e)}"
453-
log_error(f"{error_msg}")
451+
log_error(f"[GMAIL] {error_msg}")
454452
stats["errors"].append(error_msg)
455453

456454
if all_events:
457-
log_debug(f"Inserting {len(all_events)} events into database...")
455+
log_debug(f"[GMAIL] Inserting {len(all_events)} events into database...")
458456
inserted = insert_events_to_db(all_events)
459457
stats["events_inserted"] = inserted
460-
log_success(f"Inserted {inserted} events successfully")
458+
log_success(f"[GMAIL] Inserted {inserted} events successfully")
461459

462460
state["processed_email_ids"] = processed_ids[-1000:]
463461
save_email_sync_state(state)
464-
log_success("Sync state saved.")
462+
log_success(f"[GMAIL] Sync state saved.")
465463

466464
except Exception as e:
467465
error_msg = f"Fatal error during sync: {str(e)}"
468-
log_error(f"{error_msg}")
466+
log_error(f"[GMAIL] {error_msg}")
469467
stats["errors"].append(error_msg)
470468

471-
log("\n" + "=" * 60)
472-
log(" Gmail Sync Summary")
473-
log("=" * 60)
474-
log(f" Emails processed: {stats['emails_processed']}")
475-
log(f" Events extracted: {stats['events_extracted']}")
476-
log(f" Events inserted (SQL): {stats['events_inserted']}")
477-
log(f" Errors: {len(stats['errors'])}")
478-
log("=" * 60)
469+
log("[GMAIL] Sync Summary")
470+
log("[GMAIL] " + "=" * 60)
471+
log(f"[GMAIL] Emails processed: {stats['emails_processed']}")
472+
log(f"[GMAIL] Events extracted: {stats['events_extracted']}")
473+
log(f"[GMAIL] Events inserted (SQL): {stats['events_inserted']}")
474+
log(f"[GMAIL] Errors: {len(stats['errors'])}")
479475

480476
return stats
481477

@@ -491,21 +487,21 @@ def sync_email_newsletters_to_sql(interactive: bool = True) -> dict:
491487
try:
492488
if args.auth:
493489
# Just do authentication
494-
log_debug("Running Gmail OAuth authentication...")
490+
log_debug(f"[GMAIL] Running Gmail OAuth authentication...")
495491
get_gmail_credentials(interactive=True)
496-
log_success("Authentication successful! Token saved.")
492+
log_success(f"[GMAIL] Authentication successful! Token saved.")
497493
else:
498494
# Run full sync
499495
interactive = not args.non_interactive
500496
sync_email_newsletters_to_sql(interactive=interactive)
501497
except KeyboardInterrupt:
502-
log_warning("Interrupted by user. Exiting...")
498+
log_warning(f"[GMAIL] Interrupted by user. Exiting...")
503499
sys.exit(1)
504500
except AuthenticationRequiredError as e:
505-
log_warning(f"Authentication required!")
506-
log_info(f"Visit this URL to authorize: {e.auth_url}")
507-
log_info("\nOr run: python email_to_calendar_sql.py --auth")
501+
log_warning(f"[GMAIL] Authentication required!")
502+
log_info(f"[GMAIL] Visit this URL to authorize: {e.auth_url}")
503+
log_info(f"[GMAIL] \nOr run: python email_to_calendar_sql.py --auth")
508504
sys.exit(2)
509505
except Exception as e:
510-
log_error(f"Fatal error: {e}")
506+
log_error(f"[GMAIL] Fatal error: {e}")
511507
sys.exit(1)

0 commit comments

Comments (0)