Skip to content

Commit 6088121

Browse files
committed
Cap timeline recovery attempts
1 parent 5021021 commit 6088121

1 file changed

Lines changed: 38 additions & 5 deletions

File tree

scraper.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -949,7 +949,11 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
949949
print("(İptal etmek için Ctrl+C - toplananlar kaydedilecek)\n")
950950
self.tweets_collected = [] # Instance variable olarak sakla
951951
no_progress_count = 0
952+
no_new_collected_count = 0
953+
recovery_attempts = 0
952954
max_no_progress = 8
955+
max_no_new_collected = 14
956+
max_recovery_attempts = 3
953957
scan_cycles = 0
954958
max_scan_cycles = max(60, count * 8)
955959

@@ -982,13 +986,22 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
982986

983987
scroll_advanced = self._scroll_down()
984988

985-
if collected_after > collected_before or scroll_advanced:
989+
if collected_after > collected_before:
986990
no_progress_count = 0
991+
no_new_collected_count = 0
987992
else:
988-
no_progress_count += 1
989-
990-
if no_progress_count in (2, 4, 6):
991-
print("Timeline takıldı gibi görünüyor, scroll recovery deneniyor...")
993+
no_new_collected_count += 1
994+
if scroll_advanced:
995+
no_progress_count = 0
996+
else:
997+
no_progress_count += 1
998+
999+
if (
1000+
no_progress_count in (2, 4, 6)
1001+
and recovery_attempts < max_recovery_attempts
1002+
):
1003+
recovery_attempts += 1
1004+
print(f"Timeline takıldı gibi görünüyor, scroll recovery deneniyor ({recovery_attempts}/{max_recovery_attempts})...")
9921005
record_event(
9931006
self.run_log,
9941007
"timeline_loading",
@@ -997,10 +1010,28 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
9971010
collected=len(self.tweets_collected),
9981011
target=count,
9991012
no_progress_cycles=no_progress_count,
1013+
no_new_collected_cycles=no_new_collected_count,
1014+
recovery_attempts=recovery_attempts,
10001015
scan_cycles=scan_cycles,
10011016
)
10021017
self._scroll_recovery()
10031018

1019+
if no_new_collected_count >= max_no_new_collected:
1020+
print(f"{max_no_new_collected} tarama turunda yeni tweet parse edilemedi. Kısmi sonuçla duruluyor.")
1021+
record_event(
1022+
self.run_log,
1023+
"timeline_loading",
1024+
"warning",
1025+
"Timeline advanced or scanned but produced no new parsed tweets",
1026+
reason="timeline_empty" if not self.tweets_collected else "partial_target_not_met",
1027+
collected=len(self.tweets_collected),
1028+
target=count,
1029+
no_new_collected_cycles=no_new_collected_count,
1030+
recovery_attempts=recovery_attempts,
1031+
scan_cycles=scan_cycles,
1032+
)
1033+
break
1034+
10041035
if no_progress_count >= max_no_progress:
10051036
print(f"Timeline {max_no_progress} denemede ilerlemedi. Kısmi sonuçla duruluyor.")
10061037
record_event(
@@ -1012,6 +1043,8 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
10121043
collected=len(self.tweets_collected),
10131044
target=count,
10141045
no_progress_cycles=no_progress_count,
1046+
no_new_collected_cycles=no_new_collected_count,
1047+
recovery_attempts=recovery_attempts,
10151048
scan_cycles=scan_cycles,
10161049
)
10171050
break

0 commit comments

Comments
 (0)