-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchain_4_1.py
More file actions
65 lines (55 loc) · 2.54 KB
/
chain_4_1.py
File metadata and controls
65 lines (55 loc) · 2.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from llm_models import get_llm
from web_scraping import web_scrape_many
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnableParallel
from prompts import SUMMARY_PROMPT_TEMPLATE
# A  Upper bound, in characters, on the scraped text kept per URL
#    (reduced from 10000).
RESULT_TEXT_MAX_CHARACTERS = 5_000
def _batch_scrape(url_dicts: list) -> list: # B
"""
Receives a list of dicts, each with keys:
result_url, search_query, user_question
Scrapes all URLs concurrently in one async batch,
then returns enriched dicts with search_result_text added.
"""
urls = [x['result_url'] for x in url_dicts]
scraped_texts = web_scrape_many(urls) # C all URLs fetched simultaneously
return [
{
'search_result_text': text[:RESULT_TEXT_MAX_CHARACTERS],
'result_url': x['result_url'],
'search_query': x['search_query'],
'user_question': x['user_question']
}
for x, text in zip(url_dicts, scraped_texts)
]
def _format_summary(x: dict) -> dict: # D
return {
'summary': f"Source Url: {x['result_url']}\nSummary: {x['text_summary']}",
'user_question': x['user_question']
}
# E  Summarization chain for ONE enriched dict. The three branches all receive
#    the same input dict: one produces the LLM summary text, the other two pass
#    result_url and user_question through. _format_summary then folds the
#    branch outputs into the final record.
_per_item_summary_chain = RunnableParallel(
    text_summary=SUMMARY_PROMPT_TEMPLATE | get_llm() | StrOutputParser(),
    result_url=lambda x: x['result_url'],
    user_question=lambda x: x['user_question'],
) | RunnableLambda(_format_summary)
# F  Full chain, two stages:
# G    1. _batch_scrape takes the whole list of URL dicts and returns the
#         enriched list, so scraping finishes before any summarization starts.
# H    2. .map() applies the per-item summarization chain to every element of
#         that list.
search_result_text_and_summary_chain = RunnableLambda(_batch_scrape) | _per_item_summary_chain.map()
# A Halved character budget per LLM summarization call (the cap is applied to
# characters, which in turn shrinks the token count) — speeds up each call,
# reduces cost, and smarter content extraction means less boilerplate anyway
# B Takes the full list of URL dicts from chain_3_1 in one shot
# C web_scrape_many uses httpx + asyncio.gather — all URLs fetched in parallel,
# not sequentially — this is the biggest performance win in the entire pipeline
# D Helper to format the final summary dict cleanly
# E Summarization sub-chain — unchanged from original logic
# F The new combined chain replaces the old per-URL RunnableLambda approach
# G _batch_scrape runs before .map() so all HTTP I/O is done before LLM calls start
# H .map() parallelizes the LLM summarization across the scraped items