@@ -758,20 +758,82 @@ def _get_article_content(self, tweet_url: str) -> str:
758758 pass
759759
760760 def _scroll_down (self ):
761- """Sayfayı aşağı kaydır ve yeni içerik yüklenmesini bekle"""
762- # Scroll öncesi tweet sayısı
763- old_count = len (self .driver .find_elements (By .XPATH , XPATHS ["tweet_article" ]))
761+ """Scroll the page down and trigger X's virtual timeline DOM; return True if new content showed up, else False."""
762+ old_articles = self .driver .find_elements (By .XPATH , XPATHS ["tweet_article" ])  # articles present before scrolling
763+ old_count = len (old_articles )
764+ old_ids = self ._get_article_ids_fast (old_articles )  # tweet IDs readable before scrolling
765+
766+ # X can keep the same number of articles while swapping their contents, so looking only
767+ # at the article count or the scroll height produces a premature "end of page".
768+ try :
769+ if old_articles :  # scroll the last known article into view (block: 'end')
770+ self .driver .execute_script (
771+ "arguments[0].scrollIntoView({block: 'end', behavior: 'instant'});" ,
772+ old_articles [- 1 ],
773+ )
774+ time .sleep (0.25 )
775+ except Exception :  # best effort: any failure in this step is ignored
776+ pass
777+
778+ try :
779+ for _ in range (3 ):  # three viewport-height scroll steps, 0.2 s apart
780+ self .driver .execute_script ("window.scrollBy(0, window.innerHeight);" )
781+ time .sleep (0.2 )
782+ except Exception :  # NOTE(review): the fallback below is unguarded; a second driver error propagates to the caller
783+ self .driver .execute_script ("window.scrollBy(0, 1400);" )
784+
785+ try :
786+ body = self .driver .find_element (By .TAG_NAME , "body" )
787+ body .send_keys (Keys .PAGE_DOWN )  # additionally send a Page Down key press to <body>
788+ time .sleep (0.2 )
789+ except Exception :  # best effort: key press failures are ignored
790+ pass
764791
765- # Scroll yap
766- self .driver .execute_script ("window.scrollBy(0, 1000);" )
767792 time .sleep (random .uniform (SCROLL_PAUSE_MIN , SCROLL_PAUSE_MAX ))
768793
769- # Yeni tweet yüklenmesini bekle (max 5 saniye)
770- for _ in range (10 ):
771- new_count = len (self .driver .find_elements (By .XPATH , XPATHS ["tweet_article" ]))
772- if new_count > old_count :
773- break
774- time .sleep (0.5 )
794+ for _ in range (18 ):  # poll up to 18 times, sleeping 0.35 s after each miss
795+ new_articles = self .driver .find_elements (By .XPATH , XPATHS ["tweet_article" ])
796+ new_ids = self ._get_article_ids_fast (new_articles )
797+ if len (new_articles ) > old_count or (new_ids - old_ids - self .collected_tweet_ids ):  # more articles, or an ID that is neither pre-scroll nor in self.collected_tweet_ids
798+ return True
799+ time .sleep (0.35 )
800+
801+ return False  # nothing new seen while polling; NOTE(review): the call site visible in this diff ignores this return value
802+
803+ def _get_article_ids_fast (self , articles ) -> set :
804+ """Quickly extract tweet IDs from the given DOM article elements; returns a set of ID strings."""
805+ ids = set ()
806+ for article in articles :
807+ try :
808+ time_element = article .find_element (By .TAG_NAME , "time" )  # the article's <time> element
809+ parent_link = time_element .find_element (By .XPATH , "./ancestor::a" )  # an <a> ancestor of <time>; presumably the status permalink - confirm
810+ href = parent_link .get_attribute ("href" )
811+ if href and "/status/" in href :
812+ tweet_id = href .split ("/status/" )[- 1 ].split ("?" )[0 ].split ("/" )[0 ]  # text after the last "/status/", cut at the first "?" and then at the first "/"
813+ if tweet_id :  # skip empty IDs
814+ ids .add (tweet_id )
815+ except Exception :  # any lookup or parsing failure just skips this article
816+ continue
817+ return ids
818+
819+ def _scroll_recovery (self ):
820+ """Make stronger native scroll attempts when the timeline appears stuck; all failures are swallowed."""
821+ try :
822+ body = self .driver .find_element (By .TAG_NAME , "body" )
823+ body .click ()  # presumably gives <body> keyboard focus before the key presses - confirm
824+ for _ in range (6 ):  # six Page Down presses, 0.35 s apart
825+ body .send_keys (Keys .PAGE_DOWN )
826+ time .sleep (0.35 )
827+ body .send_keys (Keys .END )  # then the End key, followed by a 1 s pause
828+ time .sleep (1.0 )
829+ except Exception :  # best effort: key-based scrolling failures are ignored
830+ pass
831+
832+ try :  # runs regardless of the step above: JS scroll by 4x documentElement.clientHeight
833+ self .driver .execute_script ("window.scrollBy(0, document.documentElement.clientHeight * 4);" )
834+ time .sleep (1.0 )
835+ except Exception :  # best effort: script failures are ignored
836+ pass
775837
776838 def _scroll_to_bottom (self ):
777839 """Sayfanın en altına git"""
@@ -791,16 +853,12 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
791853 print (f"{ count } tweet toplanıyor..." )
792854 print ("(İptal etmek için Ctrl+C - toplananlar kaydedilecek)\n " )
793855 self .tweets_collected = [] # Instance variable olarak sakla
794- stale_scroll_count = 0 # Scroll yapıp DOM'da yeni article gelmeyen sayı
795- max_stale_scrolls = 10 # Ardışık 10 scroll'da DOM'da yeni element yoksa dur
796- last_height = 0
797- same_height_count = 0
856+ no_new_tweets_count = 0
857+ max_no_new_tweets = 25
798858
799859 try :
800860 while len (self .tweets_collected ) < count :
801- # Scroll öncesi DOM'daki article sayısı
802- articles_before = len (self .driver .find_elements (By .XPATH , XPATHS ["tweet_article" ]))
803-
861+ collected_before = len (self .tweets_collected )
804862 # Mevcut tweetleri topla
805863 articles = self .driver .find_elements (By .XPATH , XPATHS ["tweet_article" ])
806864
@@ -820,41 +878,42 @@ def scrape_by_count(self, count: int) -> List[Tweet]:
820878 show_more_tag = " [SHOW MORE]" if tweet .needs_full_text else ""
821879 print (f" [{ len (self .tweets_collected )} /{ count } ] Tweet toplandı: { tweet .date_str } { article_tag } { show_more_tag } " )
822880
823- # Aşağı kaydır
824- self ._scroll_down ()
825-
826- # Scroll sonrası DOM'daki article sayısı
827- articles_after = len (self .driver .find_elements (By .XPATH , XPATHS ["tweet_article" ]))
828-
829- # Sayfa sonu tespiti: scroll height değişmedi mi?
830- new_height = self .driver .execute_script ("return document.body.scrollHeight" )
831- if new_height == last_height :
832- same_height_count += 1
881+ collected_after = len (self .tweets_collected )
882+ if collected_after > collected_before :
883+ no_new_tweets_count = 0
833884 else :
834- same_height_count = 0
835- last_height = new_height
885+ no_new_tweets_count += 1
836886
837- # DOM'da yeni article geldi mi?
838- if articles_after <= articles_before and same_height_count >= 3 :
839- stale_scroll_count += 1
840- # Ekstra bekleme ile bir şans daha ver
841- if stale_scroll_count <= 3 :
842- time .sleep (3 )
843- else :
844- stale_scroll_count = 0
887+ if no_new_tweets_count in (5 , 12 , 18 ):
888+ print ("Timeline takıldı gibi görünüyor, scroll recovery deneniyor..." )
889+ record_event (
890+ self .run_log ,
891+ "timeline_loading" ,
892+ "warning" ,
893+ "Timeline produced no new parsed tweets; trying scroll recovery" ,
894+ collected = len (self .tweets_collected ),
895+ target = count ,
896+ no_new_cycles = no_new_tweets_count ,
897+ )
898+ self ._scroll_recovery ()
845899
846- if stale_scroll_count >= max_stale_scrolls :
847- print ("Sayfa sonuna ulaşıldı, daha fazla tweet yüklenmiyor ." )
900+ if no_new_tweets_count >= max_no_new_tweets :
901+ print (f" { max_no_new_tweets } scroll denemesinden sonra yeni tweet gelmedi. Kısmi sonuçla duruluyor ." )
848902 record_event (
849903 self .run_log ,
850904 "timeline_loading" ,
851905 "warning" ,
852- "Timeline stopped loading new tweet articles " ,
906+ "Timeline stopped producing new parsed tweets after recovery attempts " ,
853907 reason = "timeline_empty" if not self .tweets_collected else None ,
854908 collected = len (self .tweets_collected ),
909+ target = count ,
910+ no_new_cycles = no_new_tweets_count ,
855911 )
856912 break
857913
914+ # Aşağı kaydır
915+ self ._scroll_down ()
916+
858917 except KeyboardInterrupt :
859918 print (f"\n \n Durduruldu! { len (self .tweets_collected )} tweet toplandı." )
860919 raise # Ana programa ilet
0 commit comments