@@ -42,6 +42,8 @@ def setup_database(self):
4242 source_site TEXT NOT NULL,
4343 title TEXT NOT NULL,
4444 description TEXT,
45+ full_content TEXT,
46+ content_hash TEXT UNIQUE,
4547 author_info TEXT,
4648 keywords TEXT,
4749 content_url TEXT NOT NULL,
@@ -74,6 +76,9 @@ def setup_database(self):
7476 )
7577 """ )
7678
79+ cur .execute ("CREATE INDEX IF NOT EXISTS idx_content_hash ON articles(content_hash)" )
80+ cur .execute ("CREATE INDEX IF NOT EXISTS idx_source_site ON articles(source_site)" )
81+
7782 conn .commit ()
7883
7984 def save_article (self , item : Dict , conn : Optional [sqlite3 .Connection ] = None ) -> bool :
@@ -87,6 +92,8 @@ def save_article(self, item: Dict, conn: Optional[sqlite3.Connection] = None) ->
8792 Returns:
8893 True if inserted, False if already exists
8994 """
95+ import hashlib
96+
9097 should_close = False
9198 if conn is None :
9299 conn = sqlite3 .connect (self .db_path )
@@ -95,15 +102,20 @@ def save_article(self, item: Dict, conn: Optional[sqlite3.Connection] = None) ->
95102 try :
96103 cur = conn .cursor ()
97104
105+ full_content = item .get ("full_content" , item .get ("description" , "" ))
106+ content_hash = hashlib .sha256 (full_content .encode ()).hexdigest ()
107+
98108 cur .execute ("""
99109 INSERT OR IGNORE INTO articles
100- (id, source_site, title, description, author_info, keywords, content_url, published_date, item_type, created_at)
101- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
110+ (id, source_site, title, description, full_content, content_hash, author_info, keywords, content_url, published_date, item_type, created_at)
111+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )
102112 """ , (
103113 item ["id" ],
104114 item ["source_site" ],
105115 item ["title" ],
106116 item .get ("description" , "" ),
117+ full_content ,
118+ content_hash ,
107119 item .get ("author_info" , "" ),
108120 item .get ("keywords" , "" ),
109121 item ["content_url" ],
@@ -137,12 +149,43 @@ def save_articles_batch(self, items: List[Dict]) -> int:
137149 return count
138150
def article_exists(self, article_id: str) -> bool:
    """
    Check if article already exists by ID.

    Args:
        article_id: Value matched against the ``id`` column of ``articles``

    Returns:
        True if a row with that id exists, False otherwise
    """
    with self.get_connection() as conn:
        cur = conn.cursor()
        cur.execute("SELECT 1 FROM articles WHERE id = ?", (article_id,))
        # Any returned row means a match; the selected value is irrelevant.
        return cur.fetchone() is not None
145157
def article_exists_by_hash(self, content_hash: str) -> bool:
    """
    Check if article already exists by content hash.

    Args:
        content_hash: SHA256 hash of article content, matched against the
            ``content_hash`` column of ``articles``

    Returns:
        True if article with same content exists, False otherwise
    """
    with self.get_connection() as conn:
        cur = conn.cursor()
        cur.execute("SELECT 1 FROM articles WHERE content_hash = ?", (content_hash,))
        # Any returned row means a match; the selected value is irrelevant.
        return cur.fetchone() is not None
172+
def get_article_by_hash(self, content_hash: str) -> Optional[Dict]:
    """
    Get article by content hash.

    Args:
        content_hash: SHA256 hash of article content, matched against the
            ``content_hash`` column of ``articles``

    Returns:
        Article dict (column name -> value) if found, None otherwise
    """
    with self.get_connection() as conn:
        cur = conn.cursor()
        cur.execute("SELECT * FROM articles WHERE content_hash = ?", (content_hash,))
        row = cur.fetchone()
        if row is None:
            return None
        # Pair values with column names from the cursor metadata rather than
        # calling dict(row): that only works for mapping-like rows such as
        # sqlite3.Row and fails on the default tuple rows.
        column_names = [column[0] for column in cur.description]
        return dict(zip(column_names, row))
188+
146189 def save_embedding (self , article_id : str , embedding : bytes , model : str = "default" ) -> bool :
147190 """
148191 Save article embedding.
0 commit comments