Skip to content

Commit 6bc5b62

Browse files
authored
feat: update web search (#1101)
1 parent ad6cc0a commit 6bc5b62

10 files changed

Lines changed: 310 additions & 69 deletions

File tree

aperag/api/components/schemas/web.yaml

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -103,15 +103,12 @@ webReadRequest:
103103
type: object
104104
description: Web content reading request
105105
properties:
106-
urls:
107-
oneOf:
108-
- type: string
109-
description: Single URL to read
110-
- type: array
111-
description: List of URLs to read
112-
items:
113-
type: string
114-
description: URL or list of URLs to read
106+
url_list:
107+
type: array
108+
description: List of URLs to read (for single URL, use array with one element)
109+
items:
110+
type: string
111+
example: ["https://example.com/article"]
115112
timeout:
116113
type: integer
117114
description: Request timeout in seconds
@@ -128,7 +125,7 @@ webReadRequest:
128125
default: 3
129126
example: 3
130127
required:
131-
- urls
128+
- url_list
132129

133130
webReadResultItem:
134131
type: object

aperag/mcp/server.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -211,15 +211,15 @@ async def web_search(
211211

212212
@mcp_server.tool
213213
async def web_read(
214-
urls: str | list[str],
214+
url_list: list[str],
215215
timeout: int = 30,
216216
locale: str = "en-US",
217217
max_concurrent: int = 5,
218218
) -> Dict[str, Any]:
219219
"""Read and extract content from web pages.
220220
221221
Args:
222-
urls: URL (string) or list of URLs to read content from
222+
url_list: List of URLs to read content from (for single URL, use array with one element)
223223
timeout: Request timeout in seconds (default: 30)
224224
locale: Browser locale (default: en-US)
225225
max_concurrent: Maximum concurrent requests for multiple URLs (default: 5)
@@ -233,9 +233,13 @@ async def web_read(
233233
try:
234234
api_key = get_api_key()
235235

236-
# Build read request
236+
# Validate url_list parameter
237+
if not url_list or len(url_list) == 0:
238+
return {"error": "url_list parameter is required and must contain at least one URL"}
239+
240+
# Build read request using the correct WebReadRequest model
237241
read_data = {
238-
"urls": urls,
242+
"url_list": url_list,
239243
"timeout": timeout,
240244
"locale": locale,
241245
"max_concurrent": max_concurrent,
@@ -373,15 +377,15 @@ async def aperag_usage_guide() -> str:
373377
374378
### Web Content Reading Example:
375379
```
376-
# Read content from web pages
380+
# Read content from web pages (single URL - use array with one element)
377381
content = web_read(
378-
urls="https://example.com/article", # single URL
382+
url_list=["https://example.com/article"], # single URL in array
379383
timeout=30
380384
)
381385
382-
# Or read from multiple URLs
386+
# Read from multiple URLs
383387
content = web_read(
384-
urls=["https://example.com/page1", "https://example.com/page2"], # multiple URLs
388+
url_list=["https://example.com/page1", "https://example.com/page2"], # multiple URLs
385389
max_concurrent=2
386390
)
387391
@@ -407,7 +411,7 @@ async def aperag_usage_guide() -> str:
407411
urls = [result.url for result in web_results.results]
408412
409413
# 3. Read full content from those pages
410-
web_content = web_read(urls=urls, max_concurrent=2)
414+
web_content = web_read(url_list=urls, max_concurrent=2)
411415
412416
# 4. Search your internal knowledge base for related information
413417
collections = list_collections()

aperag/schema/view_models.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
# generated by datamodel-codegen:
1616
# filename: openapi.merged.yaml
17-
# timestamp: 2025-07-11T10:39:35+00:00
17+
# timestamp: 2025-07-11T14:29:54+00:00
1818

1919
from __future__ import annotations
2020

@@ -1772,7 +1772,11 @@ class WebReadRequest(BaseModel):
17721772
Web content reading request
17731773
"""
17741774

1775-
urls: Union[str, list[str]] = Field(..., description='URL or list of URLs to read')
1775+
url_list: list[str] = Field(
1776+
...,
1777+
description='List of URLs to read (for single URL, use array with one element)',
1778+
example=['https://example.com/article'],
1779+
)
17761780
timeout: Optional[int] = Field(
17771781
30, description='Request timeout in seconds', example=30
17781782
)

aperag/views/web.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818

1919
from fastapi import APIRouter, HTTPException
2020

21-
from aperag.schema.view_models import WebSearchRequest, WebSearchResponse
21+
from aperag.schema.view_models import WebReadRequest, WebReadResponse, WebSearchRequest, WebSearchResponse
22+
from aperag.websearch.reader.reader_service import ReaderService
2223
from aperag.websearch.search.search_service import SearchService
2324

2425
logger = logging.getLogger(__name__)
@@ -182,3 +183,37 @@ async def web_search_endpoint(request: WebSearchRequest) -> WebSearchResponse:
182183
Results are merged and ranked automatically.
183184
"""
184185
return await web_search_view(request)
186+
187+
188+
@router.post("/web/read", response_model=WebReadResponse, tags=["websearch"])
async def web_read_endpoint(request: WebReadRequest) -> WebReadResponse:
    """
    Read and extract content from web pages.

    Supports:
    - Single URL or multiple URLs (use url_list array)
    - Concurrent processing for multiple URLs
    - Configurable timeout and locale settings
    - Multiple reader providers (Trafilatura, JINA)

    Args:
        request: Web read request; ``url_list`` must contain at least one URL.

    Returns:
        The response returned by ``ReaderService.read`` for this request.

    Raises:
        HTTPException: 400 when ``url_list`` is empty, 500 when reading fails
            for any other reason.
    """
    # Reject an empty list explicitly before any reader work is attempted.
    # Kept outside the try block so the broad handler below only wraps the
    # reader call.
    if not request.url_list:
        raise HTTPException(
            status_code=400, detail="url_list parameter is required and must contain at least one URL"
        )

    try:
        # Create reader service with default provider (Trafilatura)
        reader_service = ReaderService()

        # Use the reader service directly with the URL list
        return await reader_service.read(request)
    except HTTPException:
        # Re-raise HTTP exceptions as-is so their status codes are preserved
        raise
    except Exception as e:
        # logger.exception records the traceback alongside the message
        logger.exception("Web read endpoint failed: %s", e)
        raise HTTPException(status_code=500, detail=f"Web read failed: {e!s}") from e

aperag/websearch/reader/reader_service.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,17 @@ async def read(self, request: WebReadRequest) -> WebReadResponse:
8787
ReaderProviderError: If reading fails
8888
"""
8989
try:
90-
# Normalize URLs input
91-
if isinstance(request.urls, str):
92-
urls = [request.urls]
90+
# Normalize URLs input from the new url_list attribute
91+
if hasattr(request, "url_list") and request.url_list:
92+
urls = request.url_list
93+
elif hasattr(request, "urls"):
94+
# Backward compatibility for old urls attribute
95+
if isinstance(request.urls, str):
96+
urls = [request.urls]
97+
else:
98+
urls = request.urls
9399
else:
94-
urls = request.urls
100+
raise ReaderProviderError("No URLs provided in request")
95101

96102
if not urls:
97103
raise ReaderProviderError("URLs list cannot be empty")
@@ -160,7 +166,7 @@ async def read_simple(
160166
ReaderProviderError: If reading fails
161167
"""
162168
request = WebReadRequest(
163-
urls=url,
169+
url_list=[url], # Use url_list with single URL
164170
timeout=timeout,
165171
locale=locale,
166172
)
@@ -191,7 +197,7 @@ async def read_batch_simple(
191197
ReaderProviderError: If reading fails
192198
"""
193199
request = WebReadRequest(
194-
urls=urls,
200+
url_list=urls, # Use url_list directly
195201
timeout=timeout,
196202
locale=locale,
197203
max_concurrent=max_concurrent,

frontend/src/api/models/index.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ export * from './user';
115115
export * from './user-list';
116116
export * from './vector-search-params';
117117
export * from './web-read-request';
118-
export * from './web-read-request-urls';
119118
export * from './web-read-response';
120119
export * from './web-read-result-item';
121120
export * from './web-search-request';

frontend/src/api/models/web-read-request-urls.ts

Lines changed: 0 additions & 24 deletions
This file was deleted.

frontend/src/api/models/web-read-request.ts

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@
1313
*/
1414

1515

16-
// May contain unused imports in some cases
17-
// @ts-ignore
18-
import type { WebReadRequestUrls } from './web-read-request-urls';
1916

2017
/**
2118
* Web content reading request
@@ -24,11 +21,11 @@ import type { WebReadRequestUrls } from './web-read-request-urls';
2421
*/
2522
export interface WebReadRequest {
2623
/**
27-
*
28-
* @type {WebReadRequestUrls}
24+
* List of URLs to read (for single URL, use array with one element)
25+
* @type {Array<string>}
2926
* @memberof WebReadRequest
3027
*/
31-
'urls': WebReadRequestUrls;
28+
'url_list': Array<string>;
3229
/**
3330
* Request timeout in seconds
3431
* @type {number}

frontend/src/api/openapi.merged.yaml

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5184,15 +5184,13 @@ components:
51845184
type: object
51855185
description: Web content reading request
51865186
properties:
5187-
urls:
5188-
oneOf:
5189-
- type: string
5190-
description: Single URL to read
5191-
- type: array
5192-
description: List of URLs to read
5193-
items:
5194-
type: string
5195-
description: URL or list of URLs to read
5187+
url_list:
5188+
type: array
5189+
description: List of URLs to read (for single URL, use array with one element)
5190+
items:
5191+
type: string
5192+
example:
5193+
- https://example.com/article
51965194
timeout:
51975195
type: integer
51985196
description: Request timeout in seconds
@@ -5209,7 +5207,7 @@ components:
52095207
default: 3
52105208
example: 3
52115209
required:
5212-
- urls
5210+
- url_list
52135211
webReadResultItem:
52145212
type: object
52155213
description: Individual web content reading result

0 commit comments

Comments
 (0)