Skip to content

Commit 6a98ca8

Browse files
committed
Fix timeline scroll early stop
1 parent fa661e6 commit 6a98ca8

6 files changed

Lines changed: 194 additions & 81 deletions

File tree

main.py

Lines changed: 19 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -464,7 +464,24 @@ def main():
464464

465465
print()
466466
print("=" * 60)
467-
print("TAMAMLANDI!")
467+
partial_count = (
468+
mode.get("mode") == "count"
469+
and len(tweets) < mode.get("count", len(tweets))
470+
)
471+
if partial_count:
472+
print("KISMİ TAMAMLANDI!")
473+
print(f"İstenen: {mode['count']} tweet, toplanan: {len(tweets)} tweet.")
474+
print("Timeline daha fazla yeni tweet yüklemedi; run log detaylarına bakın.")
475+
record_event(
476+
run_log,
477+
"timeline_loading",
478+
"warning",
479+
"Count scrape ended before requested tweet count",
480+
collected=len(tweets),
481+
target=mode["count"],
482+
)
483+
else:
484+
print("TAMAMLANDI!")
468485
print(f"Toplam {len(tweets)} tweet toplandı.")
469486
print(f"Dosya: {output_path}")
470487
print("=" * 60)
@@ -477,7 +494,7 @@ def main():
477494
format=output_format,
478495
total_tweets=len(tweets),
479496
)
480-
save_cli_run_log(run_log, "completed")
497+
save_cli_run_log(run_log, "partial" if partial_count else "completed")
481498

482499
# Devam etmek istiyor mu?
483500
if ask_continue():

python_sidecar/scraper.py

Lines changed: 0 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1519,10 +1519,6 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
15191519
self._emit("log", level="info", message=f"Collecting {count} tweets...")
15201520
self.tweets_collected = []
15211521
self._skipped_tweet_ids = set()
1522-
stale_scroll_count = 0
1523-
max_stale_scrolls = 15
1524-
last_height = 0
1525-
same_height_count = 0
15261522
no_new_tweets_count = 0
15271523
max_no_new_tweets = 20
15281524

@@ -1673,38 +1669,6 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
16731669
)
16741670
self._scroll_down()
16751671

1676-
new_height = self.driver.execute_script(
1677-
"return document.body.scrollHeight"
1678-
)
1679-
if new_height == last_height:
1680-
same_height_count += 1
1681-
else:
1682-
same_height_count = 0
1683-
last_height = new_height
1684-
1685-
if same_height_count >= 3:
1686-
stale_scroll_count += 1
1687-
if stale_scroll_count <= 3:
1688-
time.sleep(3)
1689-
else:
1690-
stale_scroll_count = 0
1691-
1692-
if stale_scroll_count >= max_stale_scrolls:
1693-
self._emit(
1694-
"log",
1695-
level="info",
1696-
message=f"End of timeline reached. Found {collected_after} of {count} requested tweets.",
1697-
)
1698-
record_event(
1699-
self.run_log,
1700-
"timeline_loading",
1701-
"warning",
1702-
"Timeline stopped loading new tweet articles",
1703-
reason="timeline_empty" if not self.tweets_collected else None,
1704-
collected=len(self.tweets_collected),
1705-
)
1706-
break
1707-
17081672
except KeyboardInterrupt:
17091673
self._emit(
17101674
"log",

python_sidecar/service.py

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -222,6 +222,20 @@ def _do_scrape():
222222
"Scrape completed without collected tweets",
223223
reason="timeline_empty",
224224
)
225+
partial_count = (
226+
mode == "count"
227+
and cmd.get("count") is not None
228+
and len(tweets) < int(cmd.get("count", len(tweets)))
229+
)
230+
if partial_count:
231+
record_event(
232+
self.current_run_log,
233+
"timeline_loading",
234+
"warning",
235+
"Count scrape ended before requested tweet count",
236+
collected=len(tweets),
237+
target=cmd.get("count"),
238+
)
225239

226240
# Send complete event FIRST (lightweight, won't block pipe)
227241
# Tweet updates are large and can block stdout pipe via IPC backpressure
@@ -234,8 +248,11 @@ def _do_scrape():
234248
failure_reason=self.current_run_log.failure_reason
235249
if self.current_run_log
236250
else None,
251+
partial=partial_count,
252+
)
253+
self.save_current_run_log(
254+
"partial" if partial_count else "completed" if tweets else "failed"
237255
)
238-
self.save_current_run_log("completed" if tweets else "failed")
239256
self.emit(
240257
"log",
241258
level="info",

scraper.py

Lines changed: 100 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -758,20 +758,82 @@ def _get_article_content(self, tweet_url: str) -> str:
758758
pass
759759

760760
def _scroll_down(self):
761-
"""Sayfayı aşağı kaydır ve yeni içerik yüklenmesini bekle"""
762-
# Scroll öncesi tweet sayısı
763-
old_count = len(self.driver.find_elements(By.XPATH, XPATHS["tweet_article"]))
761+
"""Sayfayı aşağı kaydır ve X'in sanal timeline DOM'unu tetikle."""
762+
old_articles = self.driver.find_elements(By.XPATH, XPATHS["tweet_article"])
763+
old_count = len(old_articles)
764+
old_ids = self._get_article_ids_fast(old_articles)
765+
766+
# X aynı sayıda article tutup içerikleri değiştirebildiği için sadece
767+
# article sayısına veya scroll height'a bakmak erken "sayfa sonu" üretir.
768+
try:
769+
if old_articles:
770+
self.driver.execute_script(
771+
"arguments[0].scrollIntoView({block: 'end', behavior: 'instant'});",
772+
old_articles[-1],
773+
)
774+
time.sleep(0.25)
775+
except Exception:
776+
pass
777+
778+
try:
779+
for _ in range(3):
780+
self.driver.execute_script("window.scrollBy(0, window.innerHeight);")
781+
time.sleep(0.2)
782+
except Exception:
783+
self.driver.execute_script("window.scrollBy(0, 1400);")
784+
785+
try:
786+
body = self.driver.find_element(By.TAG_NAME, "body")
787+
body.send_keys(Keys.PAGE_DOWN)
788+
time.sleep(0.2)
789+
except Exception:
790+
pass
764791

765-
# Scroll yap
766-
self.driver.execute_script("window.scrollBy(0, 1000);")
767792
time.sleep(random.uniform(SCROLL_PAUSE_MIN, SCROLL_PAUSE_MAX))
768793

769-
# Yeni tweet yüklenmesini bekle (max 5 saniye)
770-
for _ in range(10):
771-
new_count = len(self.driver.find_elements(By.XPATH, XPATHS["tweet_article"]))
772-
if new_count > old_count:
773-
break
774-
time.sleep(0.5)
794+
for _ in range(18):
795+
new_articles = self.driver.find_elements(By.XPATH, XPATHS["tweet_article"])
796+
new_ids = self._get_article_ids_fast(new_articles)
797+
if len(new_articles) > old_count or (new_ids - old_ids - self.collected_tweet_ids):
798+
return True
799+
time.sleep(0.35)
800+
801+
return False
802+
803+
def _get_article_ids_fast(self, articles) -> set:
    """Extract tweet IDs from the given timeline article elements.

    For each article, a ``time`` element is looked up, then its ancestor
    ``a`` element, and the ID is taken from that link's ``href``: the text
    after the last ``/status/``, cut at the first ``?`` and the first ``/``.

    Args:
        articles: Iterable of element-like objects exposing
            ``find_element(by, selector)``.

    Returns:
        A set of tweet ID strings. Articles that raise during lookup, or
        whose link has no ``/status/`` part, contribute nothing.
    """
    ids = set()
    for article in articles:
        try:
            time_element = article.find_element(By.TAG_NAME, "time")
            parent_link = time_element.find_element(By.XPATH, "./ancestor::a")
            href = parent_link.get_attribute("href")
            if href and "/status/" in href:
                tweet_id = href.split("/status/")[-1].split("?")[0].split("/")[0]
                if tweet_id:
                    ids.add(tweet_id)
        except Exception:
            # Best effort by design: any failure on one article (missing
            # element, unexpected object) skips that article only.
            continue
    return ids
818+
819+
def _scroll_recovery(self):
    """Try stronger native scroll gestures when the timeline appears stuck.

    Two independent best-effort attempts are made:

    1. Click ``body``, send PAGE_DOWN six times with a short pause after
       each, then send END and wait one second.
    2. Scroll the window by four times the document's client height via
       JavaScript and wait one second.

    A failure in either attempt is swallowed so the other still runs and
    the caller's scrape loop is not interrupted. Returns ``None``.
    """
    try:
        body = self.driver.find_element(By.TAG_NAME, "body")
        body.click()
        # NOTE(review): indentation was lost in the source text; END and the
        # 1.0s wait are assumed to run once after the loop — confirm.
        for _ in range(6):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.35)
        body.send_keys(Keys.END)
        time.sleep(1.0)
    except Exception:
        # Deliberately ignored: keyboard scrolling is only one of two
        # recovery strategies.
        pass

    try:
        self.driver.execute_script("window.scrollBy(0, document.documentElement.clientHeight * 4);")
        time.sleep(1.0)
    except Exception:
        # Deliberately ignored: recovery is best effort.
        pass
775837

776838
def _scroll_to_bottom(self):
777839
"""Sayfanın en altına git"""
@@ -791,16 +853,12 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
791853
print(f"{count} tweet toplanıyor...")
792854
print("(İptal etmek için Ctrl+C - toplananlar kaydedilecek)\n")
793855
self.tweets_collected = [] # Instance variable olarak sakla
794-
stale_scroll_count = 0 # Scroll yapıp DOM'da yeni article gelmeyen sayı
795-
max_stale_scrolls = 10 # Ardışık 10 scroll'da DOM'da yeni element yoksa dur
796-
last_height = 0
797-
same_height_count = 0
856+
no_new_tweets_count = 0
857+
max_no_new_tweets = 25
798858

799859
try:
800860
while len(self.tweets_collected) < count:
801-
# Scroll öncesi DOM'daki article sayısı
802-
articles_before = len(self.driver.find_elements(By.XPATH, XPATHS["tweet_article"]))
803-
861+
collected_before = len(self.tweets_collected)
804862
# Mevcut tweetleri topla
805863
articles = self.driver.find_elements(By.XPATH, XPATHS["tweet_article"])
806864

@@ -820,41 +878,42 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
820878
show_more_tag = " [SHOW MORE]" if tweet.needs_full_text else ""
821879
print(f" [{len(self.tweets_collected)}/{count}] Tweet toplandı: {tweet.date_str}{article_tag}{show_more_tag}")
822880

823-
# Aşağı kaydır
824-
self._scroll_down()
825-
826-
# Scroll sonrası DOM'daki article sayısı
827-
articles_after = len(self.driver.find_elements(By.XPATH, XPATHS["tweet_article"]))
828-
829-
# Sayfa sonu tespiti: scroll height değişmedi mi?
830-
new_height = self.driver.execute_script("return document.body.scrollHeight")
831-
if new_height == last_height:
832-
same_height_count += 1
881+
collected_after = len(self.tweets_collected)
882+
if collected_after > collected_before:
883+
no_new_tweets_count = 0
833884
else:
834-
same_height_count = 0
835-
last_height = new_height
885+
no_new_tweets_count += 1
836886

837-
# DOM'da yeni article geldi mi?
838-
if articles_after <= articles_before and same_height_count >= 3:
839-
stale_scroll_count += 1
840-
# Ekstra bekleme ile bir şans daha ver
841-
if stale_scroll_count <= 3:
842-
time.sleep(3)
843-
else:
844-
stale_scroll_count = 0
887+
if no_new_tweets_count in (5, 12, 18):
888+
print("Timeline takıldı gibi görünüyor, scroll recovery deneniyor...")
889+
record_event(
890+
self.run_log,
891+
"timeline_loading",
892+
"warning",
893+
"Timeline produced no new parsed tweets; trying scroll recovery",
894+
collected=len(self.tweets_collected),
895+
target=count,
896+
no_new_cycles=no_new_tweets_count,
897+
)
898+
self._scroll_recovery()
845899

846-
if stale_scroll_count >= max_stale_scrolls:
847-
print("Sayfa sonuna ulaşıldı, daha fazla tweet yüklenmiyor.")
900+
if no_new_tweets_count >= max_no_new_tweets:
901+
print(f"{max_no_new_tweets} scroll denemesinden sonra yeni tweet gelmedi. Kısmi sonuçla duruluyor.")
848902
record_event(
849903
self.run_log,
850904
"timeline_loading",
851905
"warning",
852-
"Timeline stopped loading new tweet articles",
906+
"Timeline stopped producing new parsed tweets after recovery attempts",
853907
reason="timeline_empty" if not self.tweets_collected else None,
854908
collected=len(self.tweets_collected),
909+
target=count,
910+
no_new_cycles=no_new_tweets_count,
855911
)
856912
break
857913

914+
# Aşağı kaydır
915+
self._scroll_down()
916+
858917
except KeyboardInterrupt:
859918
print(f"\n\nDurduruldu! {len(self.tweets_collected)} tweet toplandı.")
860919
raise # Ana programa ilet

src-tauri/src/commands/export.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,8 @@ fn safe_filename(filename: &str) -> String {
149149
.unwrap_or("export")
150150
.to_string();
151151

152-
let stem = PathBuf::from(&name)
152+
let name_path = PathBuf::from(&name);
153+
let stem = name_path
153154
.file_stem()
154155
.and_then(|v| v.to_str())
155156
.unwrap_or("export");

tests/test_scroll_helpers.py

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
import unittest
2+
3+
from selenium.webdriver.common.by import By
4+
5+
from scraper import XScraper
6+
7+
8+
class FakeLink:
    """Test double for a link element that exposes only an ``href``."""

    def __init__(self, href):
        # Value handed back when the "href" attribute is requested.
        self.href = href

    def get_attribute(self, name):
        """Return the stored href for ``"href"``; ``None`` for any other name."""
        return self.href if name == "href" else None
14+
15+
16+
class FakeTime:
    """Test double for a ``time`` element whose ancestor link carries ``href``."""

    def __init__(self, href):
        # href given to the FakeLink returned by the ancestor lookup.
        self.href = href

    def find_element(self, by, selector):
        """Return a ``FakeLink`` for the ``./ancestor::a`` XPath lookup.

        Raises:
            LookupError: For any other locator/selector combination.
        """
        if by == By.XPATH and selector == "./ancestor::a":
            return FakeLink(self.href)
        raise LookupError(selector)
24+
25+
26+
class FakeArticle:
    """Test double for a tweet article containing a single ``time`` element."""

    def __init__(self, href):
        # href passed down to the FakeTime returned by the tag lookup.
        self.href = href

    def find_element(self, by, selector):
        """Return a ``FakeTime`` for the ``time`` tag-name lookup.

        Raises:
            LookupError: For any other locator/selector combination.
        """
        if by == By.TAG_NAME and selector == "time":
            return FakeTime(self.href)
        raise LookupError(selector)
34+
35+
36+
class ScrollHelperTests(unittest.TestCase):
    """Unit tests for ``XScraper._get_article_ids_fast`` using fake elements."""

    def test_get_article_ids_fast_extracts_status_ids(self):
        """Status IDs are extracted, de-duplicated, and stripped of query strings."""
        scraper = XScraper(headless=True)
        articles = [
            FakeArticle("https://x.com/user/status/111"),
            FakeArticle("https://x.com/user/status/222?ref=profile"),
            FakeArticle("https://x.com/user/status/111"),
            # No "/status/" segment: must not produce an ID.
            FakeArticle("https://x.com/user/with_replies"),
        ]

        self.assertEqual(scraper._get_article_ids_fast(articles), {"111", "222"})

    def test_get_article_ids_fast_ignores_bad_articles(self):
        """Objects without ``find_element`` are skipped instead of raising."""
        scraper = XScraper(headless=True)

        self.assertEqual(scraper._get_article_ids_fast([object()]), set())
52+
53+
54+
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)