2020
2121API_BASE = "https://api.pageindex.ai"
2222
23+ _INTERNAL_TOOLS = frozenset ({"ToolSearch" , "Read" , "Grep" , "Glob" , "Bash" , "Edit" , "Write" })
24+
2325
2426class CloudBackend :
2527 def __init__ (self , api_key : str ):
2628 self ._api_key = api_key
2729 self ._headers = {"api_key" : api_key }
30+ self ._folder_id_cache : dict [str , str | None ] = {}
2831
2932 # ── HTTP helpers ──────────────────────────────────────────────────────
3033
@@ -38,7 +41,8 @@ def _request(self, method: str, path: str, **kwargs) -> dict:
3841 time .sleep (2 ** attempt )
3942 continue
4043 if resp .status_code != 200 :
41- raise CloudAPIError (f"Cloud API error { resp .status_code } : { resp .text } " )
44+ body = resp .text [:500 ] if resp .text else ""
45+ raise CloudAPIError (f"Cloud API error { resp .status_code } : { body } " )
4246 return resp .json () if resp .content else {}
4347 except requests .RequestException as e :
4448 if attempt == 2 :
@@ -63,12 +67,14 @@ def _enc(value: str) -> str:
6367 def create_collection (self , name : str ) -> None :
6468 self ._validate_collection_name (name )
6569 try :
66- self ._request ("POST" , "/folder/" , json = {"name" : name })
70+ resp = self ._request ("POST" , "/folder/" , json = {"name" : name })
71+ self ._folder_id_cache [name ] = resp .get ("folder" , {}).get ("id" )
6772 except CloudAPIError as e :
6873 if "403" in str (e ):
6974 logger .warning (
7075 "Folders require a Max plan. Upgrade at https://dash.pageindex.ai/subscription"
7176 )
77+ self ._folder_id_cache [name ] = None
7278 else :
7379 raise
7480
@@ -78,31 +84,33 @@ def get_or_create_collection(self, name: str) -> None:
7884 data = self ._request ("GET" , "/folders/" )
7985 for folder in data .get ("folders" , []):
8086 if folder .get ("name" ) == name :
81- self ._folder_id_cache = folder ["id" ]
87+ self ._folder_id_cache [ name ] = folder ["id" ]
8288 return
8389 resp = self ._request ("POST" , "/folder/" , json = {"name" : name })
84- self ._folder_id_cache = resp .get ("folder" , {}).get ("id" )
90+ self ._folder_id_cache [ name ] = resp .get ("folder" , {}).get ("id" )
8591 except CloudAPIError as e :
8692 if "403" in str (e ):
8793 logger .warning (
8894 "Folders require a Max plan. Documents will be stored without folder organization. "
8995 "Upgrade at https://dash.pageindex.ai/subscription"
9096 )
91- self ._folder_id_cache = None
97+ self ._folder_id_cache [ name ] = None
9298 else :
9399 raise
94100
95101 def _get_folder_id (self , name : str ) -> str | None :
96102 """Resolve collection name to folder ID. Returns None if folders not available."""
97- if hasattr ( self , ' _folder_id_cache' ) :
98- return self ._folder_id_cache
103+ if name in self . _folder_id_cache :
104+ return self ._folder_id_cache . get ( name )
99105 try :
100106 data = self ._request ("GET" , "/folders/" )
101107 for folder in data .get ("folders" , []):
102108 if folder .get ("name" ) == name :
109+ self ._folder_id_cache [name ] = folder ["id" ]
103110 return folder ["id" ]
104111 except CloudAPIError :
105112 pass
113+ self ._folder_id_cache [name ] = None
106114 return None
107115
108116 def list_collections (self ) -> list [str ]:
@@ -204,67 +212,90 @@ async def query_stream(self, collection: str, question: str,
204212 - mcp_tool_use_start: tool call started (has tool_name, server_name)
205213 - tool_use: tool call argument delta
206214 - tool_use_stop: tool call ended
215+
216+ Note: Uses synchronous ``requests`` under the hood. The blocking
217+ HTTP call and line iteration are offloaded to a thread via
218+ ``asyncio.to_thread`` so the caller's event loop is not blocked.
219+ For full async streaming consider migrating to ``httpx.AsyncClient``.
207220 """
221+ import asyncio
222+
208223 doc_id = doc_ids if doc_ids else self ._get_all_doc_ids (collection )
209- resp = requests .post (
210- f"{ API_BASE } /chat/completions/" ,
211- headers = self ._headers ,
212- json = {
213- "messages" : [{"role" : "user" , "content" : question }],
214- "doc_id" : doc_id ,
215- "stream" : True ,
216- "stream_metadata" : True ,
217- },
218- stream = True ,
219- timeout = 120 ,
220- )
221- if resp .status_code != 200 :
222- raise CloudAPIError (f"Cloud streaming error { resp .status_code } : { resp .text } " )
223-
224- current_tool_name = None
225- current_tool_args = []
226-
227- for line in resp .iter_lines (decode_unicode = True ):
228- if not line or not line .startswith ("data: " ):
229- continue
230- data_str = line [6 :]
231- if data_str .strip () == "[DONE]" :
232- break
233- try :
234- chunk = json .loads (data_str )
235- except json .JSONDecodeError :
236- continue
237-
238- meta = chunk .get ("block_metadata" , {})
239- block_type = meta .get ("type" , "" )
240- choices = chunk .get ("choices" , [])
241- delta = choices [0 ].get ("delta" , {}) if choices else {}
242- content = delta .get ("content" , "" )
243-
244- if block_type == "mcp_tool_use_start" :
245- current_tool_name = meta .get ("tool_name" , "" )
246- current_tool_args = []
247-
248- elif block_type == "tool_use" :
249- if content :
250- current_tool_args .append (content )
251-
252- elif block_type == "tool_use_stop" :
253- # Skip internal tools (ToolSearch, Read, Grep, etc.)
254- _INTERNAL_TOOLS = {"ToolSearch" , "Read" , "Grep" , "Glob" , "Bash" , "Edit" , "Write" }
255- if current_tool_name and current_tool_name not in _INTERNAL_TOOLS :
256- args_str = "" .join (current_tool_args )
257- yield QueryEvent (type = "tool_call" , data = {
258- "name" : current_tool_name ,
259- "args" : args_str ,
260- })
261- current_tool_name = None
262- current_tool_args = []
263224
264- elif block_type == "text" and content :
265- yield QueryEvent (type = "answer_delta" , data = content )
225+ headers = self ._headers
226+
227+ def _stream () -> list [tuple [str , object ]]:
228+ """Execute the blocking SSE request in a worker thread."""
229+ resp = requests .post (
230+ f"{ API_BASE } /chat/completions/" ,
231+ headers = headers ,
232+ json = {
233+ "messages" : [{"role" : "user" , "content" : question }],
234+ "doc_id" : doc_id ,
235+ "stream" : True ,
236+ "stream_metadata" : True ,
237+ },
238+ stream = True ,
239+ timeout = 120 ,
240+ )
241+ try :
242+ if resp .status_code != 200 :
243+ body = resp .text [:500 ] if resp .text else ""
244+ raise CloudAPIError (
245+ f"Cloud streaming error { resp .status_code } : { body } "
246+ )
266247
267- yield QueryEvent (type = "answer_done" , data = "" )
248+ events : list [tuple [str , object ]] = []
249+ current_tool_name = None
250+ current_tool_args : list [str ] = []
251+
252+ for line in resp .iter_lines (decode_unicode = True ):
253+ if not line or not line .startswith ("data: " ):
254+ continue
255+ data_str = line [6 :]
256+ if data_str .strip () == "[DONE]" :
257+ break
258+ try :
259+ chunk = json .loads (data_str )
260+ except json .JSONDecodeError :
261+ continue
262+
263+ meta = chunk .get ("block_metadata" , {})
264+ block_type = meta .get ("type" , "" )
265+ choices = chunk .get ("choices" , [])
266+ delta = choices [0 ].get ("delta" , {}) if choices else {}
267+ content = delta .get ("content" , "" )
268+
269+ if block_type == "mcp_tool_use_start" :
270+ current_tool_name = meta .get ("tool_name" , "" )
271+ current_tool_args = []
272+
273+ elif block_type == "tool_use" :
274+ if content :
275+ current_tool_args .append (content )
276+
277+ elif block_type == "tool_use_stop" :
278+ # Skip internal tools (ToolSearch, Read, Grep, etc.)
279+ if current_tool_name and current_tool_name not in _INTERNAL_TOOLS :
280+ args_str = "" .join (current_tool_args )
281+ events .append (("tool_call" , {
282+ "name" : current_tool_name ,
283+ "args" : args_str ,
284+ }))
285+ current_tool_name = None
286+ current_tool_args = []
287+
288+ elif block_type == "text" and content :
289+ events .append (("answer_delta" , content ))
290+
291+ events .append (("answer_done" , "" ))
292+ return events
293+ finally :
294+ resp .close ()
295+
296+ events = await asyncio .to_thread (_stream )
297+ for event_type , event_data in events :
298+ yield QueryEvent (type = event_type , data = event_data )
268299
269300 def _get_all_doc_ids (self , collection : str ) -> list [str ]:
270301 """Get all document IDs in a collection."""
0 commit comments