feat: update web search (#1101)

earayu · web-flow · commit 6bc5b6227e9c · 2025-07-11T22:41:22.000+08:00
diff --git a/aperag/api/components/schemas/web.yaml b/aperag/api/components/schemas/web.yaml
@@ -103,15 +103,12 @@ webReadRequest:
   type: object
   description: Web content reading request
   properties:
-    urls:
-      oneOf:
-        - type: string
-          description: Single URL to read
-        - type: array
-          description: List of URLs to read
-          items:
-            type: string
-      description: URL or list of URLs to read
+    url_list:
+      type: array
+      description: List of URLs to read (for single URL, use array with one element)
+      items:
+        type: string
+      example: ["https://example.com/article"]
     timeout:
       type: integer
       description: Request timeout in seconds
@@ -128,7 +125,7 @@ webReadRequest:
       default: 3
       example: 3
   required:
-    - urls
+    - url_list
 
 webReadResultItem:
   type: object
diff --git a/aperag/mcp/server.py b/aperag/mcp/server.py
@@ -211,15 +211,15 @@ async def web_search(
 
 @mcp_server.tool
 async def web_read(
-    urls: str | list[str],
+    url_list: list[str],
     timeout: int = 30,
     locale: str = "en-US",
     max_concurrent: int = 5,
 ) -> Dict[str, Any]:
     """Read and extract content from web pages.
 
     Args:
-        urls: URL (string) or list of URLs to read content from
+        url_list: List of URLs to read content from (for single URL, use array with one element)
         timeout: Request timeout in seconds (default: 30)
         locale: Browser locale (default: en-US)
         max_concurrent: Maximum concurrent requests for multiple URLs (default: 5)
@@ -233,9 +233,13 @@ async def web_read(
     try:
         api_key = get_api_key()
 
-        # Build read request
+        # Validate url_list parameter
+        if not url_list or len(url_list) == 0:
+            return {"error": "url_list parameter is required and must contain at least one URL"}
+
+        # Build the request payload as a plain dict whose keys follow the WebReadRequest schema
         read_data = {
-            "urls": urls,
+            "url_list": url_list,
             "timeout": timeout,
             "locale": locale,
             "max_concurrent": max_concurrent,
@@ -373,15 +377,15 @@ async def aperag_usage_guide() -> str:
 
 ### Web Content Reading Example:
 ```
-# Read content from web pages
+# Read content from web pages (single URL - use array with one element)
 content = web_read(
-    urls="https://example.com/article",  # single URL
+    url_list=["https://example.com/article"],  # single URL in array
     timeout=30
 )
 
-# Or read from multiple URLs
+# Read from multiple URLs
 content = web_read(
-    urls=["https://example.com/page1", "https://example.com/page2"],  # multiple URLs
+    url_list=["https://example.com/page1", "https://example.com/page2"],  # multiple URLs
     max_concurrent=2
 )
 
@@ -407,7 +411,7 @@ async def aperag_usage_guide() -> str:
 urls = [result.url for result in web_results.results]
 
 # 3. Read full content from those pages
-web_content = web_read(urls=urls, max_concurrent=2)
+web_content = web_read(url_list=urls, max_concurrent=2)
 
 # 4. Search your internal knowledge base for related information
 collections = list_collections()
diff --git a/aperag/schema/view_models.py b/aperag/schema/view_models.py
@@ -14,7 +14,7 @@
 
 # generated by datamodel-codegen:
 #   filename:  openapi.merged.yaml
-#   timestamp: 2025-07-11T10:39:35+00:00
+#   timestamp: 2025-07-11T14:29:54+00:00
 
 from __future__ import annotations
 
@@ -1772,7 +1772,11 @@ class WebReadRequest(BaseModel):
     Web content reading request
     """
 
-    urls: Union[str, list[str]] = Field(..., description='URL or list of URLs to read')
+    url_list: list[str] = Field(
+        ...,
+        description='List of URLs to read (for single URL, use array with one element)',
+        example=['https://example.com/article'],
+    )
     timeout: Optional[int] = Field(
         30, description='Request timeout in seconds', example=30
     )
diff --git a/aperag/views/web.py b/aperag/views/web.py
@@ -18,7 +18,8 @@
 
 from fastapi import APIRouter, HTTPException
 
-from aperag.schema.view_models import WebSearchRequest, WebSearchResponse
+from aperag.schema.view_models import WebReadRequest, WebReadResponse, WebSearchRequest, WebSearchResponse
+from aperag.websearch.reader.reader_service import ReaderService
 from aperag.websearch.search.search_service import SearchService
 
 logger = logging.getLogger(__name__)
@@ -182,3 +183,37 @@ async def web_search_endpoint(request: WebSearchRequest) -> WebSearchResponse:
     Results are merged and ranked automatically.
     """
     return await web_search_view(request)
+
+
+@router.post("/web/read", response_model=WebReadResponse, tags=["websearch"])
+async def web_read_endpoint(request: WebReadRequest) -> WebReadResponse:
+    """
+    Read and extract content from web pages.
+
+    Supports:
+    - One or more URLs, always passed as the url_list array (a single URL is a one-element array)
+    - Concurrent processing for multiple URLs
+    - Configurable timeout and locale settings
+    - Multiple reader providers (Trafilatura, JINA)
+    """
+    try:
+        # Validate url_list parameter
+        if not request.url_list or len(request.url_list) == 0:
+            raise HTTPException(
+                status_code=400, detail="url_list parameter is required and must contain at least one URL"
+            )
+
+        # Create reader service with default provider (Trafilatura)
+        reader_service = ReaderService()
+
+        # Hand the whole request to the reader service, which takes the URLs from request.url_list
+        response = await reader_service.read(request)
+
+        return response
+
+    except HTTPException:
+        # Re-raise HTTP exceptions as-is
+        raise
+    except Exception as e:
+        logger.error(f"Web read endpoint failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Web read failed: {str(e)}")
diff --git a/aperag/websearch/reader/reader_service.py b/aperag/websearch/reader/reader_service.py
@@ -87,11 +87,17 @@ async def read(self, request: WebReadRequest) -> WebReadResponse:
             ReaderProviderError: If reading fails
         """
         try:
-            # Normalize URLs input
-            if isinstance(request.urls, str):
-                urls = [request.urls]
+            # Normalize URLs input from the new url_list attribute
+            if hasattr(request, "url_list") and request.url_list:
+                urls = request.url_list
+            elif hasattr(request, "urls"):
+                # Backward compatibility for old urls attribute
+                if isinstance(request.urls, str):
+                    urls = [request.urls]
+                else:
+                    urls = request.urls
             else:
-                urls = request.urls
+                raise ReaderProviderError("No URLs provided in request")
 
             if not urls:
                 raise ReaderProviderError("URLs list cannot be empty")
@@ -160,7 +166,7 @@ async def read_simple(
             ReaderProviderError: If reading fails
         """
         request = WebReadRequest(
-            urls=url,
+            url_list=[url],  # Use url_list with single URL
             timeout=timeout,
             locale=locale,
         )
@@ -191,7 +197,7 @@ async def read_batch_simple(
             ReaderProviderError: If reading fails
         """
         request = WebReadRequest(
-            urls=urls,
+            url_list=urls,  # Use url_list directly
             timeout=timeout,
             locale=locale,
             max_concurrent=max_concurrent,
diff --git a/frontend/src/api/models/index.ts b/frontend/src/api/models/index.ts
@@ -115,7 +115,6 @@ export * from './user';
 export * from './user-list';
 export * from './vector-search-params';
 export * from './web-read-request';
-export * from './web-read-request-urls';
 export * from './web-read-response';
 export * from './web-read-result-item';
 export * from './web-search-request';
diff --git a/frontend/src/api/models/web-read-request-urls.ts b/frontend/src/api/models/web-read-request-urls.ts
diff --git a/frontend/src/api/models/web-read-request.ts b/frontend/src/api/models/web-read-request.ts
@@ -13,9 +13,6 @@
  */
 
 
-// May contain unused imports in some cases
-// @ts-ignore
-import type { WebReadRequestUrls } from './web-read-request-urls';
 
 /**
  * Web content reading request
@@ -24,11 +21,11 @@ import type { WebReadRequestUrls } from './web-read-request-urls';
  */
 export interface WebReadRequest {
     /**
-     * 
-     * @type {WebReadRequestUrls}
+     * List of URLs to read (for single URL, use array with one element)
+     * @type {Array<string>}
      * @memberof WebReadRequest
      */
-    'urls': WebReadRequestUrls;
+    'url_list': Array<string>;
     /**
      * Request timeout in seconds
      * @type {number}
diff --git a/frontend/src/api/openapi.merged.yaml b/frontend/src/api/openapi.merged.yaml
@@ -5184,15 +5184,13 @@ components:
       type: object
       description: Web content reading request
       properties:
-        urls:
-          oneOf:
-            - type: string
-              description: Single URL to read
-            - type: array
-              description: List of URLs to read
-              items:
-                type: string
-          description: URL or list of URLs to read
+        url_list:
+          type: array
+          description: List of URLs to read (for single URL, use array with one element)
+          items:
+            type: string
+          example:
+            - https://example.com/article
         timeout:
           type: integer
           description: Request timeout in seconds
@@ -5209,7 +5207,7 @@ components:
           default: 3
           example: 3
       required:
-        - urls
+        - url_list
     webReadResultItem:
       type: object
       description: Individual web content reading result
diff --git a/tests/unit_test/websearch/test_llm_txt_parsing.py b/tests/unit_test/websearch/test_llm_txt_parsing.py