Skip to content

Commit 989d391

Browse files
committed
Merge branch 'dev'
2 parents 1e6e9ea + de91d15 commit 989d391

4 files changed

Lines changed: 174 additions & 62 deletions

File tree

example_env.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
GEMINI_API_KEY=
88
GEMINI_MODEL=gemini-2.5-flash-lite
99
# Embedding model for vector search (RAG)
10-
GEMINI_EMBED_MODEL=text-embedding-004
10+
GEMINI_EMBED_MODEL=gemini-embedding-001
1111

1212
# ============================================================================
1313
# API / Flask / WebApp configuration

main_chat/data_ingestion/boston_data_sync/boston_data_sync.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,11 @@ def get_table_schema(self, df: pd.DataFrame, table_name: str, primary_key: str)
301301
else:
302302
# String type - estimate length
303303
max_len = df[col].astype(str).str.len().max()
304-
varchar_len = min(max(max_len * 2, 255), 65535) # Reasonable max
304+
# Handle NaN (empty columns) - default to 255
305+
if pd.isna(max_len) or max_len == 0:
306+
varchar_len = 255
307+
else:
308+
varchar_len = min(max(int(max_len) * 2, 255), 65535) # Reasonable max
305309
col_def = f"`{col_clean}` VARCHAR({varchar_len})"
306310

307311
columns.append(col_def)

main_chat/data_ingestion/email_to_calendar_sql.py

Lines changed: 137 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@
3030
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
3131

3232

33+
class AuthenticationRequiredError(Exception):
34+
"""Raised when user interaction is needed for OAuth."""
35+
36+
def __init__(self, auth_url: str):
37+
self.auth_url = auth_url
38+
super().__init__(f"Authentication required. Visit: {auth_url}")
39+
40+
3341
def load_email_sync_state() -> dict:
3442
"""Load state of which emails have been processed."""
3543
if config.EMAIL_SYNC_STATE_FILE.exists():
@@ -46,58 +54,96 @@ def save_email_sync_state(state: dict) -> None:
4654
config.EMAIL_SYNC_STATE_FILE.write_text(json.dumps(state, indent=2))
4755

4856

49-
def get_gmail_credentials() -> Credentials:
50-
"""Get or refresh Gmail OAuth 2.0 credentials."""
57+
def get_gmail_credentials(interactive: bool = True) -> Credentials:
58+
"""
59+
Get or refresh Gmail OAuth 2.0 credentials.
60+
61+
Args:
62+
interactive: If True, opens browser for OAuth if needed.
63+
If False, raises AuthenticationRequiredError instead.
64+
Set to False for cron/automated runs.
65+
66+
Returns:
67+
Valid Credentials object
68+
69+
Raises:
70+
AuthenticationRequiredError: When interactive=False and user auth is needed
71+
FileNotFoundError: When OAuth credentials file is missing
72+
"""
5173
creds = None
5274
token_path = Path(config.GMAIL_TOKEN_PATH)
5375

5476
# Load existing token if available
5577
if token_path.exists():
5678
try:
5779
creds = Credentials.from_authorized_user_file(str(token_path), SCOPES)
58-
except Exception:
59-
pass
80+
except Exception as e:
81+
print(f" ⚠ Could not load existing token: {e}")
6082

61-
# If no valid credentials, go through OAuth flow
62-
if not creds or not creds.valid:
63-
if creds and creds.expired and creds.refresh_token:
64-
# Refresh the token
65-
try:
66-
creds.refresh(Request())
67-
except Exception:
68-
# If refresh fails, re-authenticate
69-
creds = None
83+
# If valid credentials exist, return them
84+
if creds and creds.valid:
85+
return creds
7086

71-
if not creds:
72-
# Run OAuth flow (opens browser for user to authorize)
73-
credentials_path = Path(config.GMAIL_CREDENTIALS_PATH)
74-
if not credentials_path.exists():
75-
raise FileNotFoundError(f"Gmail OAuth credentials not found: {config.GMAIL_CREDENTIALS_PATH}\n" "Please download credentials from Google Cloud Console.")
87+
# Try to refresh expired credentials
88+
if creds and creds.expired and creds.refresh_token:
89+
try:
90+
creds.refresh(Request())
91+
token_path.write_text(creds.to_json())
92+
print(" ✓ Token refreshed successfully")
93+
return creds
94+
except Exception as e:
95+
print(f" ⚠ Token refresh failed: {e}")
96+
creds = None
97+
98+
# No valid credentials - need user interaction
99+
credentials_path = Path(config.GMAIL_CREDENTIALS_PATH)
100+
if not credentials_path.exists():
101+
raise FileNotFoundError(f"Gmail OAuth credentials not found: {config.GMAIL_CREDENTIALS_PATH}\n" "Please download credentials from Google Cloud Console.")
102+
103+
flow = InstalledAppFlow.from_client_secrets_file(str(credentials_path), SCOPES)
104+
105+
if not interactive:
106+
# Non-interactive mode: generate URL and raise error
107+
flow.redirect_uri = "http://localhost:8080/"
108+
auth_url, _ = flow.authorization_url(access_type="offline", prompt="consent") # Request refresh token # Force consent to ensure refresh token
109+
raise AuthenticationRequiredError(auth_url)
110+
111+
# Interactive mode: open browser for user to authorize
112+
print(" Opening browser for Gmail authorization...")
113+
creds = flow.run_local_server(port=8080, access_type="offline", prompt="consent")
114+
token_path.write_text(creds.to_json())
115+
print(" ✓ Authorization complete, token saved")
116+
return creds
76117

77-
flow = InstalledAppFlow.from_client_secrets_file(str(credentials_path), SCOPES)
78-
# Use fixed port 8080 so redirect URI is predictable
79-
creds = flow.run_local_server(port=8080)
80118

81-
# Save the credentials for next run
82-
token_path.write_text(creds.to_json())
119+
def get_gmail_service(interactive: bool = True):
120+
"""
121+
Get authenticated Gmail API service.
83122
84-
return creds
123+
Args:
124+
interactive: If True, allows browser-based OAuth if needed.
125+
If False, raises AuthenticationRequiredError instead.
85126
127+
Returns:
128+
Gmail API service object
86129
87-
def get_gmail_service():
88-
"""Get authenticated Gmail API service."""
130+
Raises:
131+
AuthenticationRequiredError: When interactive=False and user auth is needed
132+
ValueError: When EMAIL_ADDRESS is not configured
133+
RuntimeError: When connection to Gmail API fails
134+
"""
89135
if not config.EMAIL_ADDRESS:
90136
raise ValueError("NEWSLETTER_EMAIL_ADDRESS not configured")
91137

92-
try:
93-
# Get OAuth credentials
94-
creds = get_gmail_credentials()
138+
# Get OAuth credentials (may raise AuthenticationRequiredError)
139+
creds = get_gmail_credentials(interactive=interactive)
95140

96-
# Build Gmail API service
141+
# Build Gmail API service
142+
try:
97143
service = build("gmail", "v1", credentials=creds)
98144
return service
99145
except Exception as e:
100-
raise RuntimeError(f"Failed to connect to Gmail API: {e}")
146+
raise RuntimeError(f"Failed to build Gmail API service: {e}")
101147

102148

103149
def get_recent_newsletters(service, processed_ids: List[str], days_back: int = 7) -> List[tuple]:
@@ -148,9 +194,8 @@ def extract_events_with_llm(text: str, source: str, publication_date: str = None
148194
Args:
149195
text: Newsletter text content
150196
source: Source identifier
151-
publication_date: Newsletter publication date in YYYY-MM-DD format (used to infer exact dates from day names)
197+
publication_date: Newsletter publication date in YYYY-MM-DD format
152198
"""
153-
154199
# Truncate very long text to avoid token limits
155200
max_chars = 15000
156201
if len(text) > max_chars:
@@ -160,8 +205,6 @@ def extract_events_with_llm(text: str, source: str, publication_date: str = None
160205
date_context = ""
161206
if publication_date:
162207
try:
163-
from datetime import datetime
164-
165208
pub_dt = datetime.strptime(publication_date, "%Y-%m-%d")
166209
date_context = f"""
167210
IMPORTANT DATE CONTEXT:
@@ -237,7 +280,6 @@ def extract_events_with_llm(text: str, source: str, publication_date: str = None
237280
if not isinstance(events, list):
238281
return []
239282

240-
# Add source to each event
241283
for event in events:
242284
event["source"] = source
243285

@@ -258,7 +300,6 @@ def insert_events_to_db(events: List[Dict]) -> int:
258300

259301
try:
260302
with conn.cursor() as cur:
261-
# Ensure weekly_events table exists
262303
cur.execute(
263304
"""
264305
CREATE TABLE IF NOT EXISTS weekly_events (
@@ -294,7 +335,17 @@ def insert_events_to_db(events: List[Dict]) -> int:
294335
)
295336
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
296337
""",
297-
(event.get("source", "email_newsletter"), None, event.get("event_name", ""), event.get("event_date", ""), event.get("start_date"), event.get("end_date"), event.get("start_time"), event.get("end_time"), event.get("raw_text", "")), # page_number not applicable for emails
338+
(
339+
event.get("source", "email_newsletter"),
340+
None,
341+
event.get("event_name", ""),
342+
event.get("event_date", ""),
343+
event.get("start_date"),
344+
event.get("end_date"),
345+
event.get("start_time"),
346+
event.get("end_time"),
347+
event.get("raw_text", ""),
348+
),
298349
)
299350
inserted_count += 1
300351
except Exception as e:
@@ -308,16 +359,29 @@ def insert_events_to_db(events: List[Dict]) -> int:
308359
return inserted_count
309360

310361

311-
def sync_email_newsletters_to_sql() -> dict:
362+
def sync_email_newsletters_to_sql(interactive: bool = True) -> dict:
312363
"""
313364
Main function to sync email newsletters to calendar SQL database.
314-
Returns summary statistics.
365+
366+
Args:
367+
interactive: If True, allows browser-based OAuth if needed.
368+
If False, returns error in stats instead of blocking.
369+
370+
Returns:
371+
Dictionary with summary statistics
315372
"""
316373
print("=" * 80)
317374
print("Starting Email Newsletter → Calendar SQL Sync")
318375
print("=" * 80)
319376

320-
stats = {"emails_processed": 0, "events_extracted": 0, "events_inserted": 0, "errors": []}
377+
stats = {
378+
"emails_processed": 0,
379+
"events_extracted": 0,
380+
"events_inserted": 0,
381+
"errors": [],
382+
"auth_required": False,
383+
"auth_url": None,
384+
}
321385

322386
try:
323387
# Validate configuration
@@ -335,7 +399,17 @@ def sync_email_newsletters_to_sql() -> dict:
335399

336400
# Connect to Gmail API
337401
print("Connecting to Gmail API...")
338-
service = get_gmail_service()
402+
try:
403+
service = get_gmail_service(interactive=interactive)
404+
except AuthenticationRequiredError as e:
405+
# Non-interactive mode and auth needed
406+
stats["auth_required"] = True
407+
stats["auth_url"] = e.auth_url
408+
error_msg = f"Gmail authentication required. Visit: {e.auth_url}"
409+
print(f"⚠ {error_msg}")
410+
stats["errors"].append(error_msg)
411+
return stats
412+
339413
print("✓ Connected successfully")
340414

341415
# Get recent newsletters
@@ -353,11 +427,9 @@ def sync_email_newsletters_to_sql() -> dict:
353427
# Process each newsletter
354428
for i, (email_id, msg) in enumerate(newsletters, 1):
355429
try:
356-
# Get subject and date
357430
subject = get_email_subject(msg)
358431
email_date = get_email_date(msg)
359432

360-
# Try to parse publication date
361433
from email.utils import parsedate_to_datetime
362434

363435
try:
@@ -367,13 +439,9 @@ def sync_email_newsletters_to_sql() -> dict:
367439

368440
print(f"\n[{i}/{len(newsletters)}] Processing: {subject[:60]}...")
369441

370-
# Extract text from email body
371442
email_text = extract_text_from_email(msg)
372-
373-
# Extract text from PDF attachments
374443
pdf_texts = extract_pdf_attachments(msg)
375444

376-
# Combine all text
377445
full_text = email_text
378446
if pdf_texts:
379447
full_text += "\n\n" + "\n\n".join(pdf_texts)
@@ -383,7 +451,6 @@ def sync_email_newsletters_to_sql() -> dict:
383451
print(" ⚠ No text content found")
384452
continue
385453

386-
# Extract events using LLM (pass publication date for date inference)
387454
events = extract_events_with_llm(full_text, source=f"Email: {subject}", publication_date=pub_date)
388455

389456
if events:
@@ -392,7 +459,6 @@ def sync_email_newsletters_to_sql() -> dict:
392459
else:
393460
print(" ⚠ No events found")
394461

395-
# Mark as processed
396462
processed_ids.append(email_id)
397463
stats["emails_processed"] += 1
398464
stats["events_extracted"] += len(events)
@@ -402,18 +468,15 @@ def sync_email_newsletters_to_sql() -> dict:
402468
print(f" ✗ {error_msg}")
403469
stats["errors"].append(error_msg)
404470

405-
# Insert all events to database (SQL only - no vector DB for events)
406471
if all_events:
407472
print(f"\nInserting {len(all_events)} events into database...")
408473
inserted = insert_events_to_db(all_events)
409474
stats["events_inserted"] = inserted
410475
print(f"✓ Inserted {inserted} events successfully")
411476

412-
# Save updated sync state (keep last 1000 IDs to prevent file from growing too large)
413477
state["processed_email_ids"] = processed_ids[-1000:]
414478
save_email_sync_state(state)
415479
print("✓ Sync state saved")
416-
print("✓ Gmail API session completed")
417480

418481
except Exception as e:
419482
error_msg = f"Fatal error during sync: {str(e)}"
@@ -432,11 +495,31 @@ def sync_email_newsletters_to_sql() -> dict:
432495

433496

434497
if __name__ == "__main__":
498+
import argparse
499+
500+
parser = argparse.ArgumentParser(description="Sync email newsletters to calendar database")
501+
parser.add_argument("--auth", action="store_true", help="Run interactive OAuth flow to authenticate with Gmail")
502+
parser.add_argument("--non-interactive", action="store_true", help="Run in non-interactive mode (for cron jobs)")
503+
args = parser.parse_args()
504+
435505
try:
436-
sync_email_newsletters_to_sql()
506+
if args.auth:
507+
# Just do authentication
508+
print("Running Gmail OAuth authentication...")
509+
get_gmail_credentials(interactive=True)
510+
print("✓ Authentication successful! Token saved.")
511+
else:
512+
# Run full sync
513+
interactive = not args.non_interactive
514+
sync_email_newsletters_to_sql(interactive=interactive)
437515
except KeyboardInterrupt:
438516
print("\n\nInterrupted by user. Exiting...")
439517
sys.exit(1)
518+
except AuthenticationRequiredError as e:
519+
print(f"\n⚠ Authentication required!")
520+
print(f"Visit this URL to authorize: {e.auth_url}")
521+
print("\nOr run: python email_to_calendar_sql.py --auth")
522+
sys.exit(2)
440523
except Exception as e:
441524
print(f"\n\nFatal error: {e}")
442525
sys.exit(1)

0 commit comments

Comments
 (0)