Skip to content

Commit 814bafb

Browse files
committed
feat(server): update full_content scraping
1 parent 05a1daf commit 814bafb

23 files changed

Lines changed: 245 additions & 12 deletions
-2.94 KB
Binary file not shown.
-13.5 KB
Binary file not shown.
-6.48 KB
Binary file not shown.
-13.1 KB
Binary file not shown.

server/database.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ def setup_database(self):
4242
source_site TEXT NOT NULL,
4343
title TEXT NOT NULL,
4444
description TEXT,
45+
full_content TEXT,
46+
content_hash TEXT UNIQUE,
4547
author_info TEXT,
4648
keywords TEXT,
4749
content_url TEXT NOT NULL,
@@ -74,6 +76,9 @@ def setup_database(self):
7476
)
7577
""")
7678

79+
cur.execute("CREATE INDEX IF NOT EXISTS idx_content_hash ON articles(content_hash)")
80+
cur.execute("CREATE INDEX IF NOT EXISTS idx_source_site ON articles(source_site)")
81+
7782
conn.commit()
7883

7984
def save_article(self, item: Dict, conn: Optional[sqlite3.Connection] = None) -> bool:
@@ -87,6 +92,8 @@ def save_article(self, item: Dict, conn: Optional[sqlite3.Connection] = None) ->
8792
Returns:
8893
True if inserted, False if already exists
8994
"""
95+
import hashlib
96+
9097
should_close = False
9198
if conn is None:
9299
conn = sqlite3.connect(self.db_path)
@@ -95,15 +102,20 @@ def save_article(self, item: Dict, conn: Optional[sqlite3.Connection] = None) ->
95102
try:
96103
cur = conn.cursor()
97104

105+
full_content = item.get("full_content", item.get("description", ""))
106+
content_hash = hashlib.sha256(full_content.encode()).hexdigest()
107+
98108
cur.execute("""
99109
INSERT OR IGNORE INTO articles
100-
(id, source_site, title, description, author_info, keywords, content_url, published_date, item_type, created_at)
101-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
110+
(id, source_site, title, description, full_content, content_hash, author_info, keywords, content_url, published_date, item_type, created_at)
111+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
102112
""", (
103113
item["id"],
104114
item["source_site"],
105115
item["title"],
106116
item.get("description", ""),
117+
full_content,
118+
content_hash,
107119
item.get("author_info", ""),
108120
item.get("keywords", ""),
109121
item["content_url"],
@@ -137,12 +149,43 @@ def save_articles_batch(self, items: List[Dict]) -> int:
137149
return count
138150

139151
def article_exists(self, article_id: str) -> bool:
140-
"""Check if article already exists."""
152+
"""Check if article already exists by ID."""
141153
with self.get_connection() as conn:
142154
cur = conn.cursor()
143155
cur.execute("SELECT 1 FROM articles WHERE id = ?", (article_id,))
144156
return cur.fetchone() is not None
145157

158+
def article_exists_by_hash(self, content_hash: str) -> bool:
159+
"""
160+
Check if article already exists by content hash.
161+
162+
Args:
163+
content_hash: SHA256 hash of article content
164+
165+
Returns:
166+
True if article with same content exists
167+
"""
168+
with self.get_connection() as conn:
169+
cur = conn.cursor()
170+
cur.execute("SELECT 1 FROM articles WHERE content_hash = ?", (content_hash,))
171+
return cur.fetchone() is not None
172+
173+
def get_article_by_hash(self, content_hash: str) -> Optional[Dict]:
174+
"""
175+
Get article by content hash.
176+
177+
Args:
178+
content_hash: SHA256 hash of article content
179+
180+
Returns:
181+
Article dict if found, None otherwise
182+
"""
183+
with self.get_connection() as conn:
184+
cur = conn.cursor()
185+
cur.execute("SELECT * FROM articles WHERE content_hash = ?", (content_hash,))
186+
row = cur.fetchone()
187+
return dict(row) if row else None
188+
146189
def save_embedding(self, article_id: str, embedding: bytes, model: str = "default") -> bool:
147190
"""
148191
Save article embedding.

server/embeddings.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,15 +111,21 @@ def embed_article(self, article: dict) -> bytes:
111111
"""
112112
Generate embedding for complete article.
113113
114+
Uses full_content if available, otherwise combines title and description.
115+
114116
Args:
115-
article: Dict with title and description
117+
article: Dict with title, description, and optional full_content
116118
117119
Returns:
118120
Serialized embedding in bytes
119121
"""
120-
title = article.get("title", "")
121-
description = article.get("description", "")
122-
text = f"{title}\n{description}"
122+
full_content = article.get("full_content")
123+
if full_content:
124+
text = full_content
125+
else:
126+
title = article.get("title", "")
127+
description = article.get("description", "")
128+
text = f"{title}\n{description}"
123129

124130
return self.embed_text(text)
125131

server/main.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,12 +105,21 @@ def _process_articles(self, articles: List[Dict]) -> int:
105105
Returns:
106106
Number of new articles processed
107107
"""
108+
import hashlib
109+
108110
new_count = 0
109111

110112
for article in articles:
111113
if self.db_manager.article_exists(article["id"]):
112114
continue
113115

116+
full_content = article.get("full_content", article.get("description", ""))
117+
content_hash = hashlib.sha256(full_content.encode()).hexdigest()
118+
119+
if self.db_manager.article_exists_by_hash(content_hash):
120+
logger.debug(f"Article {article['id']} is duplicate (same content hash)")
121+
continue
122+
114123
if self.db_manager.save_article(article):
115124
new_count += 1
116125

server/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@ feedparser==6.0.10
22
requests==2.31.0
33
arxiv==1.4.8
44
numpy==1.26.4
5+
beautifulsoup4==4.12.2
6+
PyPDF2==3.0.1
-156 Bytes
Binary file not shown.
-4.2 KB
Binary file not shown.

0 commit comments

Comments
 (0)