3030SCOPES = ["https://www.googleapis.com/auth/gmail.readonly" ]
3131
3232
33+ class AuthenticationRequiredError (Exception ):
34+ """Raised when user interaction is needed for OAuth."""
35+
36+ def __init__ (self , auth_url : str ):
37+ self .auth_url = auth_url
38+ super ().__init__ (f"Authentication required. Visit: { auth_url } " )
39+
40+
3341def load_email_sync_state () -> dict :
3442 """Load state of which emails have been processed."""
3543 if config .EMAIL_SYNC_STATE_FILE .exists ():
@@ -46,58 +54,96 @@ def save_email_sync_state(state: dict) -> None:
4654 config .EMAIL_SYNC_STATE_FILE .write_text (json .dumps (state , indent = 2 ))
4755
4856
49- def get_gmail_credentials () -> Credentials :
50- """Get or refresh Gmail OAuth 2.0 credentials."""
57+ def get_gmail_credentials (interactive : bool = True ) -> Credentials :
58+ """
59+ Get or refresh Gmail OAuth 2.0 credentials.
60+
61+ Args:
62+ interactive: If True, opens browser for OAuth if needed.
63+ If False, raises AuthenticationRequiredError instead.
64+ Set to False for cron/automated runs.
65+
66+ Returns:
67+ Valid Credentials object
68+
69+ Raises:
70+ AuthenticationRequiredError: When interactive=False and user auth is needed
71+ FileNotFoundError: When OAuth credentials file is missing
72+ """
5173 creds = None
5274 token_path = Path (config .GMAIL_TOKEN_PATH )
5375
5476 # Load existing token if available
5577 if token_path .exists ():
5678 try :
5779 creds = Credentials .from_authorized_user_file (str (token_path ), SCOPES )
58- except Exception :
59- pass
80+ except Exception as e :
81+ print ( f" ⚠ Could not load existing token: { e } " )
6082
61- # If no valid credentials, go through OAuth flow
62- if not creds or not creds .valid :
63- if creds and creds .expired and creds .refresh_token :
64- # Refresh the token
65- try :
66- creds .refresh (Request ())
67- except Exception :
68- # If refresh fails, re-authenticate
69- creds = None
83+ # If valid credentials exist, return them
84+ if creds and creds .valid :
85+ return creds
7086
71- if not creds :
72- # Run OAuth flow (opens browser for user to authorize)
73- credentials_path = Path (config .GMAIL_CREDENTIALS_PATH )
74- if not credentials_path .exists ():
75- raise FileNotFoundError (f"Gmail OAuth credentials not found: { config .GMAIL_CREDENTIALS_PATH } \n " "Please download credentials from Google Cloud Console." )
87+ # Try to refresh expired credentials
88+ if creds and creds .expired and creds .refresh_token :
89+ try :
90+ creds .refresh (Request ())
91+ token_path .write_text (creds .to_json ())
92+ print (" ✓ Token refreshed successfully" )
93+ return creds
94+ except Exception as e :
95+ print (f" ⚠ Token refresh failed: { e } " )
96+ creds = None
97+
98+ # No valid credentials - need user interaction
99+ credentials_path = Path (config .GMAIL_CREDENTIALS_PATH )
100+ if not credentials_path .exists ():
101+ raise FileNotFoundError (f"Gmail OAuth credentials not found: { config .GMAIL_CREDENTIALS_PATH } \n " "Please download credentials from Google Cloud Console." )
102+
103+ flow = InstalledAppFlow .from_client_secrets_file (str (credentials_path ), SCOPES )
104+
105+ if not interactive :
106+ # Non-interactive mode: generate URL and raise error
107+ flow .redirect_uri = "http://localhost:8080/"
108+ auth_url , _ = flow .authorization_url (access_type = "offline" , prompt = "consent" ) # Request refresh token # Force consent to ensure refresh token
109+ raise AuthenticationRequiredError (auth_url )
110+
111+ # Interactive mode: open browser for user to authorize
112+ print (" Opening browser for Gmail authorization..." )
113+ creds = flow .run_local_server (port = 8080 , access_type = "offline" , prompt = "consent" )
114+ token_path .write_text (creds .to_json ())
115+ print (" ✓ Authorization complete, token saved" )
116+ return creds
76117
77- flow = InstalledAppFlow .from_client_secrets_file (str (credentials_path ), SCOPES )
78- # Use fixed port 8080 so redirect URI is predictable
79- creds = flow .run_local_server (port = 8080 )
80118
81- # Save the credentials for next run
82- token_path .write_text (creds .to_json ())
119+ def get_gmail_service (interactive : bool = True ):
120+ """
121+ Get authenticated Gmail API service.
83122
84- return creds
123+ Args:
124+ interactive: If True, allows browser-based OAuth if needed.
125+ If False, raises AuthenticationRequiredError instead.
85126
127+ Returns:
128+ Gmail API service object
86129
87- def get_gmail_service ():
88- """Get authenticated Gmail API service."""
130+ Raises:
131+ AuthenticationRequiredError: When interactive=False and user auth is needed
132+ ValueError: When EMAIL_ADDRESS is not configured
133+ RuntimeError: When connection to Gmail API fails
134+ """
89135 if not config .EMAIL_ADDRESS :
90136 raise ValueError ("NEWSLETTER_EMAIL_ADDRESS not configured" )
91137
92- try :
93- # Get OAuth credentials
94- creds = get_gmail_credentials ()
138+ # Get OAuth credentials (may raise AuthenticationRequiredError)
139+ creds = get_gmail_credentials (interactive = interactive )
95140
96- # Build Gmail API service
141+ # Build Gmail API service
142+ try :
97143 service = build ("gmail" , "v1" , credentials = creds )
98144 return service
99145 except Exception as e :
100- raise RuntimeError (f"Failed to connect to Gmail API: { e } " )
146+ raise RuntimeError (f"Failed to build Gmail API service : { e } " )
101147
102148
103149def get_recent_newsletters (service , processed_ids : List [str ], days_back : int = 7 ) -> List [tuple ]:
@@ -148,9 +194,8 @@ def extract_events_with_llm(text: str, source: str, publication_date: str = None
148194 Args:
149195 text: Newsletter text content
150196 source: Source identifier
151- publication_date: Newsletter publication date in YYYY-MM-DD format (used to infer exact dates from day names)
197+ publication_date: Newsletter publication date in YYYY-MM-DD format
152198 """
153-
154199 # Truncate very long text to avoid token limits
155200 max_chars = 15000
156201 if len (text ) > max_chars :
@@ -160,8 +205,6 @@ def extract_events_with_llm(text: str, source: str, publication_date: str = None
160205 date_context = ""
161206 if publication_date :
162207 try :
163- from datetime import datetime
164-
165208 pub_dt = datetime .strptime (publication_date , "%Y-%m-%d" )
166209 date_context = f"""
167210IMPORTANT DATE CONTEXT:
@@ -237,7 +280,6 @@ def extract_events_with_llm(text: str, source: str, publication_date: str = None
237280 if not isinstance (events , list ):
238281 return []
239282
240- # Add source to each event
241283 for event in events :
242284 event ["source" ] = source
243285
@@ -258,7 +300,6 @@ def insert_events_to_db(events: List[Dict]) -> int:
258300
259301 try :
260302 with conn .cursor () as cur :
261- # Ensure weekly_events table exists
262303 cur .execute (
263304 """
264305 CREATE TABLE IF NOT EXISTS weekly_events (
@@ -294,7 +335,17 @@ def insert_events_to_db(events: List[Dict]) -> int:
294335 )
295336 VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
296337 """ ,
297- (event .get ("source" , "email_newsletter" ), None , event .get ("event_name" , "" ), event .get ("event_date" , "" ), event .get ("start_date" ), event .get ("end_date" ), event .get ("start_time" ), event .get ("end_time" ), event .get ("raw_text" , "" )), # page_number not applicable for emails
338+ (
339+ event .get ("source" , "email_newsletter" ),
340+ None ,
341+ event .get ("event_name" , "" ),
342+ event .get ("event_date" , "" ),
343+ event .get ("start_date" ),
344+ event .get ("end_date" ),
345+ event .get ("start_time" ),
346+ event .get ("end_time" ),
347+ event .get ("raw_text" , "" ),
348+ ),
298349 )
299350 inserted_count += 1
300351 except Exception as e :
@@ -308,16 +359,29 @@ def insert_events_to_db(events: List[Dict]) -> int:
308359 return inserted_count
309360
310361
311- def sync_email_newsletters_to_sql () -> dict :
362+ def sync_email_newsletters_to_sql (interactive : bool = True ) -> dict :
312363 """
313364 Main function to sync email newsletters to calendar SQL database.
314- Returns summary statistics.
365+
366+ Args:
367+ interactive: If True, allows browser-based OAuth if needed.
368+ If False, returns error in stats instead of blocking.
369+
370+ Returns:
371+ Dictionary with summary statistics
315372 """
316373 print ("=" * 80 )
317374 print ("Starting Email Newsletter → Calendar SQL Sync" )
318375 print ("=" * 80 )
319376
320- stats = {"emails_processed" : 0 , "events_extracted" : 0 , "events_inserted" : 0 , "errors" : []}
377+ stats = {
378+ "emails_processed" : 0 ,
379+ "events_extracted" : 0 ,
380+ "events_inserted" : 0 ,
381+ "errors" : [],
382+ "auth_required" : False ,
383+ "auth_url" : None ,
384+ }
321385
322386 try :
323387 # Validate configuration
@@ -335,7 +399,17 @@ def sync_email_newsletters_to_sql() -> dict:
335399
336400 # Connect to Gmail API
337401 print ("Connecting to Gmail API..." )
338- service = get_gmail_service ()
402+ try :
403+ service = get_gmail_service (interactive = interactive )
404+ except AuthenticationRequiredError as e :
405+ # Non-interactive mode and auth needed
406+ stats ["auth_required" ] = True
407+ stats ["auth_url" ] = e .auth_url
408+ error_msg = f"Gmail authentication required. Visit: { e .auth_url } "
409+ print (f"⚠ { error_msg } " )
410+ stats ["errors" ].append (error_msg )
411+ return stats
412+
339413 print ("✓ Connected successfully" )
340414
341415 # Get recent newsletters
@@ -353,11 +427,9 @@ def sync_email_newsletters_to_sql() -> dict:
353427 # Process each newsletter
354428 for i , (email_id , msg ) in enumerate (newsletters , 1 ):
355429 try :
356- # Get subject and date
357430 subject = get_email_subject (msg )
358431 email_date = get_email_date (msg )
359432
360- # Try to parse publication date
361433 from email .utils import parsedate_to_datetime
362434
363435 try :
@@ -367,13 +439,9 @@ def sync_email_newsletters_to_sql() -> dict:
367439
368440 print (f"\n [{ i } /{ len (newsletters )} ] Processing: { subject [:60 ]} ..." )
369441
370- # Extract text from email body
371442 email_text = extract_text_from_email (msg )
372-
373- # Extract text from PDF attachments
374443 pdf_texts = extract_pdf_attachments (msg )
375444
376- # Combine all text
377445 full_text = email_text
378446 if pdf_texts :
379447 full_text += "\n \n " + "\n \n " .join (pdf_texts )
@@ -383,7 +451,6 @@ def sync_email_newsletters_to_sql() -> dict:
383451 print (" ⚠ No text content found" )
384452 continue
385453
386- # Extract events using LLM (pass publication date for date inference)
387454 events = extract_events_with_llm (full_text , source = f"Email: { subject } " , publication_date = pub_date )
388455
389456 if events :
@@ -392,7 +459,6 @@ def sync_email_newsletters_to_sql() -> dict:
392459 else :
393460 print (" ⚠ No events found" )
394461
395- # Mark as processed
396462 processed_ids .append (email_id )
397463 stats ["emails_processed" ] += 1
398464 stats ["events_extracted" ] += len (events )
@@ -402,18 +468,15 @@ def sync_email_newsletters_to_sql() -> dict:
402468 print (f" ✗ { error_msg } " )
403469 stats ["errors" ].append (error_msg )
404470
405- # Insert all events to database (SQL only - no vector DB for events)
406471 if all_events :
407472 print (f"\n Inserting { len (all_events )} events into database..." )
408473 inserted = insert_events_to_db (all_events )
409474 stats ["events_inserted" ] = inserted
410475 print (f"✓ Inserted { inserted } events successfully" )
411476
412- # Save updated sync state (keep last 1000 IDs to prevent file from growing too large)
413477 state ["processed_email_ids" ] = processed_ids [- 1000 :]
414478 save_email_sync_state (state )
415479 print ("✓ Sync state saved" )
416- print ("✓ Gmail API session completed" )
417480
418481 except Exception as e :
419482 error_msg = f"Fatal error during sync: { str (e )} "
@@ -432,11 +495,31 @@ def sync_email_newsletters_to_sql() -> dict:
432495
433496
434497if __name__ == "__main__" :
498+ import argparse
499+
500+ parser = argparse .ArgumentParser (description = "Sync email newsletters to calendar database" )
501+ parser .add_argument ("--auth" , action = "store_true" , help = "Run interactive OAuth flow to authenticate with Gmail" )
502+ parser .add_argument ("--non-interactive" , action = "store_true" , help = "Run in non-interactive mode (for cron jobs)" )
503+ args = parser .parse_args ()
504+
435505 try :
436- sync_email_newsletters_to_sql ()
506+ if args .auth :
507+ # Just do authentication
508+ print ("Running Gmail OAuth authentication..." )
509+ get_gmail_credentials (interactive = True )
510+ print ("✓ Authentication successful! Token saved." )
511+ else :
512+ # Run full sync
513+ interactive = not args .non_interactive
514+ sync_email_newsletters_to_sql (interactive = interactive )
437515 except KeyboardInterrupt :
438516 print ("\n \n Interrupted by user. Exiting..." )
439517 sys .exit (1 )
518+ except AuthenticationRequiredError as e :
519+ print (f"\n ⚠ Authentication required!" )
520+ print (f"Visit this URL to authorize: { e .auth_url } " )
521+ print ("\n Or run: python email_to_calendar_sql.py --auth" )
522+ sys .exit (2 )
440523 except Exception as e :
441524 print (f"\n \n Fatal error: { e } " )
442525 sys .exit (1 )
0 commit comments