Skip to content

Commit c2ac975

Browse files
Fix URLs
1 parent 5cc0194 commit c2ac975

3 files changed

Lines changed: 146 additions & 1 deletion

File tree

src/tweet.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ class Tweet:
5454
hashtags: list[str]
5555
title: str
5656
media_types: list[str]
57+
created_at: str = ""
58+
likes: int = 0
59+
retweets: int = 0
60+
replies: int = 0
61+
views: int = 0
5762

5863
def to_dict(self) -> dict:
5964
"""Serialize to a plain dict safe for JSON."""

src/xclient.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,16 @@ def strip_trailing_tco(text: str) -> str:
139139
return _RE_TRAILING_TCO.sub("", text)
140140

141141

142+
def expand_tco_urls(text: str, url_entities: list[dict]) -> str:
    """Replace t.co shortlinks in *text* with their expanded URLs.

    Each usable entry of *url_entities* (the ``entities.urls`` list of a tweet)
    maps its ``"url"`` value to its ``"expanded_url"`` value.  All
    substitutions happen in one pass over the original text, so:

    * a shortlink that is a prefix of another shortlink cannot corrupt the
      longer one (longer links are matched first), and
    * text inserted by one expansion is never rescanned and re-expanded by
      another entry.

    Args:
        text: Tweet text that may contain shortlinks.
        url_entities: Entity dicts carrying ``"url"`` and ``"expanded_url"``.
            ``None``, an empty list, non-dict entries, and entries with a
            missing/empty/non-string field are tolerated and ignored.

    Returns:
        The text with every occurrence of each known shortlink expanded, or
        *text* unchanged when there is nothing to expand.
    """
    # Function-scope import keeps this block self-contained.
    import re

    if not text or not url_entities:
        return text

    replacements: dict[str, str] = {}
    for entry in url_entities:
        if not isinstance(entry, dict):
            continue
        short = entry.get("url")
        expanded = entry.get("expanded_url")
        if not (isinstance(short, str) and isinstance(expanded, str)):
            continue
        if short and expanded:
            # First entry wins for a duplicated shortlink, as with the
            # sequential-replace behaviour this function used to have.
            replacements.setdefault(short, expanded)

    if not replacements:
        return text

    # Longest-first alternation: "https://t.co/abcd" must be tried before its
    # prefix "https://t.co/abc".
    ordered = sorted(replacements, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(short) for short in ordered))
    return pattern.sub(lambda match: replacements[match.group(0)], text)
150+
151+
142152
class XTimelineClient:
143153
"""
144154
Minimal client for polling an X/Twitter timeline endpoint described by a cURL.
@@ -528,6 +538,8 @@ def _parse_single_tweet(
528538
# text & entities
529539
legacy = tw.get("legacy", {})
530540
text = legacy.get("full_text", "")
541+
url_entities = legacy.get("entities", {}).get("urls", [])
542+
text = expand_tco_urls(text, url_entities)
531543
text = unescape_entities(strip_trailing_tco(text))
532544

533545
tickers = [t.upper() for t in self._entities(tw, "symbols") if t]
@@ -543,6 +555,19 @@ def _parse_single_tweet(
543555
user_img = self._user_field(tw, "profile_image_url_https")
544556
url = self._tweet_url(tid)
545557

558+
# engagement metrics
559+
raw_created_at = legacy.get("created_at", "")
560+
try:
561+
created_at = dt.datetime.strptime(
562+
raw_created_at, "%a %b %d %H:%M:%S +0000 %Y"
563+
).strftime("%Y-%m-%dT%H:%M:%SZ")
564+
except (ValueError, TypeError):
565+
created_at = raw_created_at
566+
likes = int(legacy.get("favorite_count", 0) or 0)
567+
retweets = int(legacy.get("retweet_count", 0) or 0)
568+
replies = int(legacy.get("reply_count", 0) or 0)
569+
views = int(tw.get("views", {}).get("count", 0) or 0)
570+
546571
# media
547572
media_items, media_types = self._collect_media(tw)
548573

@@ -615,6 +640,11 @@ def _parse_nested(n: dict) -> Tweet | None:
615640
hashtags=sorted(set(hashtags)),
616641
title=title,
617642
media_types=[m.type for m in uniq_media],
643+
created_at=created_at,
644+
likes=likes,
645+
retweets=retweets,
646+
replies=replies,
647+
views=views,
618648
)
619649

620650
# ---------- public APIs ----------

tests/test_xclient.py

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import pytest
1111

12-
from xclient import XTimelineClient
12+
from xclient import XTimelineClient, expand_tco_urls
1313

1414
# ---------------------------------------------------------------------------
1515
# Helpers
@@ -131,6 +131,22 @@ def _wrap_user(name: str, screen_name: str, image_url: str = "") -> dict:
131131
},
132132
}
133133

134+
# Fixture: a tweet that carries a creation timestamp and every engagement
# counter.  Note the view count is supplied as a string while the other
# counters are ints.
TWEET_WITH_METRICS: dict = {
    "__typename": "Tweet",
    "rest_id": "2039000000000000003",
    "core": _wrap_user("Someone", "someone"),
    # Views sit at the top level of the tweet object, outside "legacy".
    "views": {
        "count": "24248",
        "state": "Enabled",
    },
    "legacy": {
        "id_str": "2039000000000000003",
        "full_text": "Some tweet",
        "created_at": "Wed Apr 01 19:15:49 +0000 2026",
        "favorite_count": 120,
        "retweet_count": 45,
        "reply_count": 10,
        "entities": {
            "hashtags": [],
            "symbols": [],
        },
    },
}
149+
134150
TWEET_WITH_TICKERS_AND_HASHTAGS: dict = {
135151
"__typename": "Tweet",
136152
"rest_id": "2039000000000000002",
@@ -297,6 +313,100 @@ def test_media_types_mirror_media(self, client):
297313
# Tickers and hashtags
298314
# ---------------------------------------------------------------------------
299315

316+
class TestMetrics:
    """Checks the timestamp and engagement counters exposed on a parsed tweet."""

    def test_created_at_parsed_to_iso(self, client):
        """The raw created_at string is re-emitted in ISO-8601 form with a Z suffix."""
        parsed = _parse(client, TWEET_WITH_METRICS)
        assert parsed.created_at == "2026-04-01T19:15:49Z"

    def test_likes(self, client):
        """favorite_count from the fixture surfaces as likes."""
        parsed = _parse(client, TWEET_WITH_METRICS)
        assert parsed.likes == 120

    def test_retweets(self, client):
        """retweet_count from the fixture surfaces as retweets."""
        parsed = _parse(client, TWEET_WITH_METRICS)
        assert parsed.retweets == 45

    def test_replies(self, client):
        """reply_count from the fixture surfaces as replies."""
        parsed = _parse(client, TWEET_WITH_METRICS)
        assert parsed.replies == 10

    def test_views(self, client):
        """The string view count in the fixture comes back as an int."""
        parsed = _parse(client, TWEET_WITH_METRICS)
        assert parsed.views == 24248

    def test_missing_metrics_default_to_zero(self, client):
        """Counters are zero when parsing PLAIN_TWEET."""
        parsed = _parse(client, PLAIN_TWEET)
        assert parsed.likes == 0
        assert parsed.retweets == 0
        assert parsed.views == 0

    def test_missing_created_at_defaults_to_empty(self, client):
        """created_at is the empty string when parsing PLAIN_TWEET."""
        parsed = _parse(client, PLAIN_TWEET)
        assert parsed.created_at == ""
346+
347+
348+
# Fixture: tweet text ending in a t.co shortlink that has a matching
# entities.urls entry carrying its expanded form.
TWEET_WITH_URL: dict = {
    "__typename": "Tweet",
    "rest_id": "2039000000000000004",
    "core": _wrap_user("FlappyBert", "flappybert"),
    "legacy": {
        "id_str": "2039000000000000004",
        "full_text": "Play now 👉 https://t.co/tOjx6u4o0o",
        "entities": {
            "hashtags": [],
            "symbols": [],
            "urls": [
                {
                    # "url" is the shortlink exactly as it appears in full_text.
                    "url": "https://t.co/tOjx6u4o0o",
                    "expanded_url": "http://t.me/FlappyBertBot",
                    "display_url": "t.me/FlappyBertBot",
                },
            ],
        },
    },
}
368+
369+
# Fixture: tweet whose trailing t.co link belongs to attached media, so
# entities.urls is empty and there is nothing to expand it with.
TWEET_WITH_MEDIA_URL: dict = {
    "__typename": "Tweet",
    "rest_id": "2039000000000000005",
    "core": _wrap_user("Someone", "someone"),
    "legacy": {
        "id_str": "2039000000000000005",
        "full_text": "Check this out https://t.co/medialink",
        # media t.co links are NOT in entities.urls — only in entities.media
        "entities": {
            "hashtags": [],
            "symbols": [],
            "urls": [],
        },
        "extended_entities": {
            "media": [
                {
                    "media_url_https": "https://pbs.twimg.com/media/photo.jpg",
                    "type": "photo",
                },
            ],
        },
    },
}
385+
386+
387+
class TestUrlExpansion:
    """Covers t.co expansion through the parser and via expand_tco_urls directly."""

    def test_tco_replaced_with_expanded(self, client):
        """A shortlink listed in entities.urls is swapped for its expanded URL."""
        parsed = _parse(client, TWEET_WITH_URL)
        assert "http://t.me/FlappyBertBot" in parsed.text
        assert "t.co" not in parsed.text

    def test_media_tco_still_stripped(self, client):
        """A media shortlink without an entities.urls entry does not survive parsing."""
        # With no entry to expand it, strip_trailing_tco is what removes the link.
        parsed = _parse(client, TWEET_WITH_MEDIA_URL)
        assert "t.co" not in parsed.text

    def test_expand_tco_urls_replaces_all_occurrences(self):
        """Every occurrence of the same shortlink is expanded, not only the first."""
        url_entities = [{"url": "https://t.co/abc", "expanded_url": "https://example.com"}]
        expanded_text = expand_tco_urls("see https://t.co/abc and https://t.co/abc", url_entities)
        assert expanded_text == "see https://example.com and https://example.com"

    def test_expand_tco_urls_ignores_missing_fields(self):
        """Entries lacking a usable url or expanded_url leave the text untouched."""
        incomplete_entities = [
            {"url": "", "expanded_url": "https://example.com"},
            {"url": "https://t.co/abc"},
        ]
        # Must not raise; neither entry is complete enough to drive a replacement.
        expanded_text = expand_tco_urls("https://t.co/abc", incomplete_entities)
        assert expanded_text == "https://t.co/abc"
408+
409+
300410
class TestEntities:
301411
def test_ticker_uppercased(self, client):
302412
t = _parse(client, TWEET_WITH_TICKERS_AND_HASHTAGS)

0 commit comments

Comments
 (0)