22import datetime as dt
33import json
44import logging
5+ import os
56import re
67from collections .abc import AsyncIterator
78from pathlib import Path
@@ -68,6 +69,70 @@ def unescape_entities(text: str) -> str:
6869
6970
7071_RE_TRAILING_TCO = re .compile (r"(https?://t\.co/\S+)$" )
72+ _SENSITIVE_KEYS = {
73+ "authorization" ,
74+ "cookie" ,
75+ "x-csrf-token" ,
76+ "x-guest-token" ,
77+ "x-twitter-auth-type" ,
78+ "ct0" ,
79+ "auth_token" ,
80+ }
81+
82+
83+ def _mask_secret (value : str , keep : int = 4 ) -> str :
84+ if not value :
85+ return ""
86+ if len (value ) <= keep * 2 :
87+ return "*" * len (value )
88+ return f"{ value [:keep ]} ...{ value [- keep :]} "
89+
90+
91+ def _redact_mapping (data : dict [str , Any ] | None ) -> dict [str , Any ]:
92+ if not data :
93+ return {}
94+ out : dict [str , Any ] = {}
95+ for k , v in data .items ():
96+ key_l = str (k ).lower ()
97+ if key_l in _SENSITIVE_KEYS or "token" in key_l or "auth" in key_l :
98+ out [str (k )] = _mask_secret (str (v ))
99+ else :
100+ out [str (k )] = v
101+ return out
102+
103+
104+ def _preview_json (value : Any , limit : int = 1000 ) -> str :
105+ try :
106+ txt = json .dumps (value , ensure_ascii = False )
107+ except Exception :
108+ txt = str (value )
109+ return txt [:limit ]
110+
111+
112+ def _parse_cookie_string (cookie_blob : str ) -> dict [str , str ]:
113+ out : dict [str , str ] = {}
114+ for part in cookie_blob .split (";" ):
115+ token = part .strip ()
116+ if not token or "=" not in token :
117+ continue
118+ k , v = token .split ("=" , 1 )
119+ out [k .strip ()] = v .strip ().strip ('"' )
120+ return out
121+
122+
123+ def _extract_cookies_from_curl (raw_curl : str ) -> dict [str , str ]:
124+ collapsed = re .sub (r"\\\s*\n" , " " , raw_curl )
125+ out : dict [str , str ] = {}
126+ patterns = [
127+ r"(?:^|\s)-b\s+'([^']+)'" ,
128+ r'(?:^|\s)-b\s+"([^"]+)"' ,
129+ r"(?:^|\s)--cookie\s+'([^']+)'" ,
130+ r'(?:^|\s)--cookie\s+"([^"]+)"' ,
131+ ]
132+ for pat in patterns :
133+ for m in re .finditer (pat , collapsed ):
134+ out .update (_parse_cookie_string (m .group (1 )))
135+ return out
71136
72137
73138def strip_trailing_tco (text : str ) -> str :
@@ -93,12 +158,18 @@ def __init__(
93158 curl_path : str = "curl.txt" ,
94159 timeout_s : float = 30.0 ,
95160 persist_last_id_path : str | None = None ,
161+ debug_http : bool | None = None ,
96162 ) -> None :
97163 self .curl_path = Path (curl_path )
98164 self .timeout_s = timeout_s
99165 self ._session : aiohttp .ClientSession | None = None
100166 self ._req : dict [str , Any ] = {}
101167 self ._last_tweet_id : int = 0
168+ self .debug_http = (
169+ (os .getenv ("XCLIENT_DEBUG_HTTP" , "" ).lower () in {"1" , "true" , "yes" })
170+ if debug_http is None
171+ else debug_http
172+ )
102173 self .persist_last_id_path = (
103174 Path (persist_last_id_path ) if persist_last_id_path else None
104175 )
@@ -114,17 +185,95 @@ def _load_curl(self) -> None:
114185 ctx = uncurl .parse_context (
115186 "" .join (line .strip () for line in raw .splitlines ())
116187 )
188+ parsed_cookies = dict (ctx .cookies ) if ctx .cookies else {}
189+ if not parsed_cookies :
190+ parsed_cookies = _extract_cookies_from_curl (raw )
117191 self ._req = {
118192 "url" : ctx .url ,
119193 "headers" : dict (ctx .headers ) if ctx .headers else {},
120- "cookies" : dict ( ctx . cookies ) if ctx . cookies else {} ,
194+ "cookies" : parsed_cookies ,
121195 "json" : json .loads (ctx .data ) if ctx .data else None ,
122196 "method" : ctx .method .upper (),
123197 }
198+ self ._log_request_health_hint ()
199+ if self .debug_http :
200+ logger .warning (
201+ "Loaded cURL: method=%s url=%s headers=%s cookies=%s has_json=%s" ,
202+ self ._req ["method" ],
203+ self ._req ["url" ],
204+ sorted (_redact_mapping (self ._req ["headers" ]).keys ()),
205+ sorted (_redact_mapping (self ._req ["cookies" ]).keys ()),
206+ self ._req ["json" ] is not None ,
207+ )
124208 except Exception as e :
125209 logger .critical ("Error reading %s: %s" , self .curl_path , e )
126210 self ._req = {}
127211
212+ def _log_request_health_hint (self ) -> None :
213+ headers = {str (k ).lower (): str (v ) for k , v in self ._req .get ("headers" , {}).items ()}
214+ cookies = {str (k ).lower (): str (v ) for k , v in self ._req .get ("cookies" , {}).items ()}
215+ warnings : list [str ] = []
216+
217+ if "authorization" not in headers :
218+ warnings .append ("missing authorization header" )
219+ if "x-csrf-token" not in headers :
220+ warnings .append ("missing x-csrf-token header" )
221+ if "auth_token" not in cookies :
222+ warnings .append ("missing auth_token cookie" )
223+ if "ct0" not in cookies :
224+ warnings .append ("missing ct0 cookie" )
225+
226+ if warnings :
227+ logger .warning ("Request auth hints: %s" , "; " .join (warnings ))
228+
229+ def _log_request_debug (self , * , url : str , method : str , json_payload : Any ) -> None :
230+ if not self .debug_http :
231+ return
232+ logger .warning (
233+ "HTTP request %s %s\n headers=%s\n cookies=%s\n json=%s" ,
234+ method ,
235+ url ,
236+ _redact_mapping (self ._req .get ("headers" )),
237+ _redact_mapping (self ._req .get ("cookies" )),
238+ _preview_json (json_payload ),
239+ )
240+
def _save_http_error_snapshot(
    self,
    *,
    status: int,
    url: str,
    body: str,
    response_headers: dict[str, str],
    request_payload: Any,
) -> Path | None:
    """Write a JSON snapshot of a failed HTTP exchange under ``logs/``.

    Args:
        status: HTTP status code of the response.
        url: URL that was requested.
        body: Response body text.
        response_headers: Response headers.
        request_payload: JSON payload that was sent (or ``None``).

    Returns:
        The path of the written file, or ``None`` if the snapshot could not
        be created. This helper is best-effort and never raises: any failure
        (directory creation, serialization, write) is logged as a warning.
    """
    # One clock reading so the filename and the embedded timestamp agree.
    now = dt.datetime.now()
    stamp = now.strftime("%Y-%m-%d_%H-%M-%S")
    log_dir = Path("logs")
    out_path = log_dir / f"http_error_{status}_{stamp}.json"

    # Response headers can carry credentials too; ``set-cookie`` is masked
    # explicitly because its name matches none of the generic redaction rules.
    safe_response_headers = {
        name: _mask_secret(str(val)) if str(name).lower() == "set-cookie" else val
        for name, val in _redact_mapping(response_headers).items()
    }

    payload = {
        "timestamp": now.isoformat(),
        "status": status,
        "url": url,
        "request": {
            "method": self._req.get("method", "GET"),
            "headers": _redact_mapping(self._req.get("headers")),
            "cookies": _redact_mapping(self._req.get("cookies")),
            "json": request_payload,
        },
        "response": {
            "headers": safe_response_headers,
            "body": body,
        },
    }
    try:
        # Directory creation is inside the try so a read-only or missing
        # working directory cannot break the caller's error handling.
        log_dir.mkdir(parents=True, exist_ok=True)
        out_path.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        return out_path
    except Exception as e:
        logger.warning("Could not save HTTP error snapshot: %s", e)
        return None
276+
128277 def _load_last_id (self ) -> None :
129278 """Load last tweet id from disk (if configured)."""
130279 if not self .persist_last_id_path :
@@ -194,14 +343,30 @@ async def fetch_raw(self, *, text: bool = False) -> dict | str:
194343 await self ._ensure_session ()
195344 assert self ._session is not None
196345 url = self ._req ["url" ]
346+ method = self ._req .get ("method" , "GET" )
197347 json_payload = self ._req .get ("json" )
348+ self ._log_request_health_hint ()
349+ self ._log_request_debug (url = url , method = method , json_payload = json_payload )
198350
199351 try :
200- async with self ._session .get ( url , json = json_payload ) as resp :
352+ async with self ._session .request ( method , url , json = json_payload ) as resp :
201353 if resp .status >= 400 :
202354 body = await resp .text ()
355+ resp_headers = {k : v for k , v in resp .headers .items ()}
356+ snapshot = self ._save_http_error_snapshot (
357+ status = resp .status ,
358+ url = url ,
359+ body = body ,
360+ response_headers = resp_headers ,
361+ request_payload = json_payload ,
362+ )
203363 logger .error (
204- "HTTP %s for %s\n Response: %s" , resp .status , url , body [:2000 ]
364+ "HTTP %s for %s\n Response headers: %s\n Response: %s\n Snapshot: %s" ,
365+ resp .status ,
366+ url ,
367+ resp_headers ,
368+ body [:2000 ],
369+ snapshot ,
205370 )
206371 return "" if text else {}
207372
@@ -519,4 +684,5 @@ async def _example_stream():
519684
520685
521686# if __name__ == "__main__":
522- # asyncio.run(_example_stream())
687+ # asyncio.run(_example_stream())
688+ # asyncio.run(_example_once())
0 commit comments