Skip to content

Commit d596e4b

Browse files
committed
add agg scripts
1 parent a567663 commit d596e4b

22 files changed

+3613
-0
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
55

66
# Custom
7+
chat*.json
8+
QueryTechnology.json
9+
hn_top.json
10+
reddit_top.json
11+
completed/
12+
failed/
713
*.xml
814
dist/
915
App_Data/

scripts/README.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
Run:
2+
3+
```bash
4+
./process_news.sh
5+
```
6+
7+
If everything looks good, then run:
8+
9+
```bash
10+
./publish_posts.sh
11+
```
12+
13+
---
14+
15+
Open https://llmspy.org type CTRL+K to open the search box, then type "latest features" into Search, then navigate to the first link in the search results
16+
---
17+
18+
Update hn_top.json with the latest top 30 HN posts
19+
20+
```bash
21+
./hn_top.py
22+
```
23+
Create posts/*.json for all new posts in hn_top.json, then run:
24+
25+
```bash
26+
./process_posts.py
27+
```
28+
29+
List all new technologies found in the new posts:
30+
31+
```bash
32+
./process_technologies.py
33+
```
34+
35+
Check the new technologies against the existing ones in data/all-technologies.json and data/alias-technologies.json, then add any new ones to data/new-technologies.json. Finally, run:
36+
37+
```bash
38+
./create_technology.py "Technology Name"
39+
```
40+
41+
If no new technologies are needed, you can skip the last step.
42+
43+
```bash
44+
./publish_posts.sh
45+
```

scripts/analyze_hn_comments.py

Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
#!/usr/bin/env python3
2+
"""
3+
HN Post Comment Analyzer
4+
========================
5+
Extracts the first comment tree from a Hacker News post and generates
6+
a sentiment analysis of the entire comment thread.
7+
8+
Usage:
9+
python analyze_hn_comments.py <HN_COMMENTS_URL> [--model <MODEL>]
10+
11+
Example:
12+
python analyze_hn_comments.py https://news.ycombinator.com/item?id=46978710
13+
14+
Requirements:
15+
pip install requests
16+
"""
17+
18+
import argparse
19+
import html
20+
import json
21+
import os
22+
import re
23+
import subprocess
24+
import sys
25+
from concurrent.futures import ThreadPoolExecutor, as_completed
26+
27+
import requests
28+
from utils import USER_AGENT
29+
30+
# Paths resolved relative to this script so it works from any working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
REPO_ROOT = os.path.dirname(os.path.dirname(SCRIPT_DIR))
# llms.sh is the CLI wrapper used to invoke the LLM backend.
LLMS_SH = os.path.join(REPO_ROOT, "llms.sh")
# Default model, overridable via the LLMS_MODEL environment variable.
LLMS_MODEL = os.getenv("LLMS_MODEL", "moonshotai/kimi-k2.5")

# Hacker News Firebase API endpoint for a single item (story or comment).
HN_API = "https://hacker-news.firebaseio.com/v0/item/{}.json"
# One shared session: connection pooling plus a consistent User-Agent.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": USER_AGENT})
38+
39+
40+
def parse_hn_id(url: str) -> int:
    """Extract the item ID from a Hacker News URL or raw ID.

    Accepts a bare numeric ID, or any URL containing ``item?id=<digits>``.
    Raises ValueError when neither form matches.
    """
    candidate = url.strip().strip("'\"")
    if candidate.isdigit():
        return int(candidate)
    match = re.search(r"item\?id=(\d+)", candidate)
    if match is not None:
        return int(match.group(1))
    raise ValueError(f"Cannot extract HN item ID from: {candidate}")
49+
50+
51+
def fetch_item(item_id: int) -> dict | None:
    """Fetch a single HN item by ID.

    Returns None on a non-200 response, or when the item payload is
    empty, dead, or deleted.
    """
    response = SESSION.get(HN_API.format(item_id), timeout=15)
    if response.status_code != 200:
        return None
    payload = response.json()
    if not payload:
        return None
    if payload.get("dead") or payload.get("deleted"):
        return None
    return payload
60+
61+
62+
def clean_html(text: str) -> str:
    """Convert HN comment HTML to plain text.

    HN escapes user-typed text as entities (&lt;, &amp;, ...) and marks up
    structure with real tags (<p>, <a>, <i>, <code>).  Tag handling therefore
    runs on the raw markup FIRST and entities are decoded LAST.  The previous
    order (unescape first) turned user text such as "&lt;b&gt;" into a real
    <b> tag that was then silently stripped from the output.
    """
    if not text:
        return ""
    # Structural tags -> plain-text equivalents.
    text = re.sub(r"<p>", "\n\n", text)
    text = re.sub(r"<br\s*/?>", "\n", text)
    # Links become "label (url)".
    text = re.sub(r'<a\s+href="([^"]*)"[^>]*>([^<]*)</a>', r"\2 (\1)", text)
    # Italics -> markdown emphasis.
    text = re.sub(r"<i>([^<]*)</i>", r"*\1*", text)
    # Code blocks and inline code -> markdown fences / backticks.
    text = re.sub(r"<pre><code>([\s\S]*?)</code></pre>", r"\n```\n\1\n```\n", text)
    text = re.sub(r"<code>([^<]*)</code>", r"`\1`", text)
    # Drop any remaining tags, then decode entities in the surviving text.
    text = re.sub(r"<[^>]+>", "", text)
    return html.unescape(text).strip()
75+
76+
77+
def fetch_comment_tree(item_id: int, max_depth: int = 50) -> dict | None:
    """Recursively fetch a comment and all of its descendants.

    Returns None when the item cannot be fetched or is not a comment.
    Children are fetched concurrently, but reassembled in the original
    HN listing order.
    """
    item = fetch_item(item_id)
    if not item or item.get("type") != "comment":
        return None

    node = {
        "id": item["id"],
        "by": item.get("by", "[deleted]"),
        "text": clean_html(item.get("text", "")),
        "time": item.get("time", 0),
        "children": [],
    }

    kid_ids = item.get("kids", [])
    if not kid_ids or max_depth <= 0:
        return node

    fetched = {}
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = {pool.submit(fetch_comment_tree, kid, max_depth - 1): kid for kid in kid_ids}
        for future in as_completed(pending):
            try:
                subtree = future.result()
            except Exception:
                continue  # a failed child is simply omitted
            if subtree:
                fetched[pending[future]] = subtree

    # Reattach children in the order HN listed them, skipping failures.
    node["children"] = [fetched[kid] for kid in kid_ids if kid in fetched]
    return node
110+
111+
112+
def collect_all_comments(post_data: dict) -> list[dict]:
    """Fetch all top-level comment trees from a post, preserving HN order."""
    kid_ids = post_data.get("kids", [])
    if not kid_ids:
        return []

    total = len(kid_ids)
    fetched = {}
    completed = 0
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = {pool.submit(fetch_comment_tree, kid): kid for kid in kid_ids}
        for future in as_completed(pending):
            completed += 1
            try:
                tree = future.result()
            except Exception:
                tree = None  # drop threads that failed to fetch
            if tree:
                fetched[pending[future]] = tree
            # Progress report every 10 threads, and once at the end.
            if completed % 10 == 0 or completed == total:
                print(f" Fetched {completed}/{total} top-level threads", file=sys.stderr)

    # Return trees in the original HN listing order.
    return [fetched[kid] for kid in kid_ids if kid in fetched]
140+
141+
142+
def flatten_comments(tree: dict, depth: int = 0) -> list[dict]:
    """Flatten a comment tree into a pre-order list annotated with depth."""
    flat = [{"by": tree["by"], "text": tree["text"], "depth": depth}]
    for child in tree.get("children", []):
        flat += flatten_comments(child, depth + 1)
    return flat
148+
149+
150+
def comments_to_text(comments: list[dict], max_comments: int = 200, max_chars: int = 30000) -> str:
    """Render comment trees as an indented text block for the LLM.

    Emits at most *max_comments* comments (pre-order across all trees) and
    truncates the final text at *max_chars* characters.
    """
    rendered = []
    emitted = 0
    for tree in comments:
        if emitted >= max_comments:
            break
        for comment in flatten_comments(tree):
            if emitted >= max_comments:
                break
            prefix = " " * comment["depth"]
            rendered.append(f"{prefix}[{comment['by']}]: {comment['text']}")
            rendered.append("")  # blank line between comments
            emitted += 1
    text = "\n".join(rendered)
    if len(text) > max_chars:
        text = text[:max_chars] + "\n\n[...comments truncated...]"
    return text
168+
169+
170+
# System prompt for the sentiment-analysis call.  The model must reply with a
# raw JSON object whose single "sentiment" key holds a markdown summary.
SENTIMENT_PROMPT = """\
You are an expert at analyzing online discussion threads. You will receive \
the full comment thread from a Hacker News post. Analyze the overall sentiment \
and key themes, then produce a markdown summary.

Your output must be a JSON object with exactly this schema (no markdown fences, just raw JSON):

{
  "sentiment": "string — markdown-formatted sentiment analysis"
}

The "sentiment" field should contain well-structured markdown with these sections:

## Overall Sentiment
A 1-2 sentence summary of the overall tone (positive, negative, mixed, etc.) \
with an approximate breakdown (e.g. "~60% negative, ~30% neutral, ~10% positive").

## Key Themes
Bullet points covering the main topics and arguments being discussed.

## Notable Perspectives
2-4 standout comments or viewpoints that represent the range of opinions, \
paraphrased and attributed by username.

## Consensus & Disagreements
What do commenters generally agree on? Where are the main fault lines?

Rules:
- Be objective and balanced — represent all sides fairly
- Use specific examples and usernames from the comments
- Keep the total output under 500 words
- Return ONLY valid JSON"""
202+
203+
204+
def parse_json_response(text: str) -> dict:
    """Parse a JSON object out of an LLM response.

    Tries, in order: the raw text, the text with markdown code fences
    stripped, and finally the widest ``{...}`` span found anywhere in the
    text.  Raises ValueError when no candidate parses.
    """
    # Models often wrap JSON in ```json ... ``` fences; strip them up front.
    unfenced = re.sub(r"^```(?:json)?\s*", "", text.strip())
    unfenced = re.sub(r"\s*```$", "", unfenced)
    for candidate in (text, unfenced):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    # Last resort: grab the outermost brace span and hope it is the payload.
    embedded = re.search(r"(\{[\s\S]*\})", text)
    if embedded is not None:
        return json.loads(embedded.group(1))
    raise ValueError("Could not parse JSON from LLM response")
220+
221+
222+
def analyze_sentiment(post_title: str, comments_text: str, model: str) -> str:
    """Run the LLM sentiment analysis and return the markdown summary.

    Writes the chat request to scripts/chat.post.comments.json, invokes
    llms.sh on it, and parses the JSON reply.  Exits the process when
    llms.sh fails or returns an empty response.  Falls back to the raw
    reply text if the parsed JSON lacks a "sentiment" field.
    """
    request_payload = {
        "model": model,
        "temperature": 0.3,
        "messages": [
            {"role": "system", "content": SENTIMENT_PROMPT},
            {"role": "user", "content": f"Post Title: {post_title}\n\n--- COMMENTS ---\n{comments_text}"},
        ],
    }

    request_path = os.path.join(SCRIPT_DIR, "chat.post.comments.json")
    with open(request_path, "w") as f:
        json.dump(request_payload, f, indent=2)

    proc = subprocess.run(
        [LLMS_SH, "--model", model, "--chat", request_path, "--nohistory"],
        capture_output=True,
        text=True,
        cwd=REPO_ROOT,
    )
    reply = proc.stdout.strip()

    if proc.returncode != 0:
        print(f"Error from llms.sh ({proc.returncode}):\n{proc.stderr}", file=sys.stderr)
        if reply:
            print(f"stdout: {reply}", file=sys.stderr)
        sys.exit(1)
    if not reply:
        print("Error: llms.sh returned empty response", file=sys.stderr)
        sys.exit(1)

    parsed = parse_json_response(reply)
    return parsed.get("sentiment", reply)
258+
259+
260+
def main():
    """CLI entry point: enrich an existing posts/<id>.json with sentiment data."""
    parser = argparse.ArgumentParser(description="Analyze comments from a Hacker News post.")
    parser.add_argument("url", help="HN comments URL or item ID (e.g. https://news.ycombinator.com/item?id=46978710)")
    parser.add_argument("--model", default=LLMS_MODEL, help=f"Model name (default: {LLMS_MODEL})")
    parser.add_argument(
        "--max-chars", type=int, default=30000, help="Max chars of comments to send to LLM (default: 30000)"
    )
    args = parser.parse_args()

    item_id = parse_hn_id(args.url)

    # The post file must already exist; this script only enriches it.
    post_path = os.path.join(SCRIPT_DIR, "posts", f"{item_id}.json")
    if not os.path.exists(post_path):
        print(f"Error: Post file not found at {post_path}", file=sys.stderr)
        sys.exit(1)

    with open(post_path, "r", encoding="utf-8") as f:
        post_info = json.load(f)

    # Idempotency: skip posts a previous run has already processed.
    if post_info.get("sentiment") and post_info.get("top_comment"):
        print(f"Post {item_id} already has sentiment and top_comment", file=sys.stderr)
        sys.exit(0)

    print(f"Fetching HN post {item_id} ...", file=sys.stderr)
    post_data = fetch_item(item_id)
    if not post_data:
        print(f"Error: Could not fetch item {item_id}", file=sys.stderr)
        sys.exit(1)

    post_title = post_data.get("title", f"HN Post {item_id}")
    kid_ids = post_data.get("kids", [])
    print(f"Post: {post_title} ({len(kid_ids)} top-level comments)", file=sys.stderr)

    # The first top-level thread is stored verbatim as "top_comment".
    first_tree = None
    if kid_ids:
        print("Fetching first comment tree ...", file=sys.stderr)
        first_tree = fetch_comment_tree(kid_ids[0])

    # Everything else feeds the sentiment analysis.
    print("Fetching all comment threads ...", file=sys.stderr)
    all_comments = collect_all_comments(post_data)
    total_flat = sum(len(flatten_comments(c)) for c in all_comments)
    print(f"Total comments fetched: {total_flat}", file=sys.stderr)

    sentiment_count = min(total_flat, 200)
    comments_text = comments_to_text(all_comments, max_comments=200, max_chars=args.max_chars)
    print(f"Using first {sentiment_count} comments for sentiment analysis", file=sys.stderr)

    print(f"Analyzing sentiment with {args.model} ...", file=sys.stderr)
    sentiment_md = analyze_sentiment(post_title, comments_text, args.model)

    post_info["sentiment"] = sentiment_md
    post_info["top_comment"] = first_tree

    # Echo the new fields to stdout, then persist them back into the post file.
    print(json.dumps({"sentiment": sentiment_md, "top_comment": first_tree}, indent=2))

    with open(post_path, "w", encoding="utf-8") as f:
        json.dump(post_info, f, indent=2)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)