-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathget_symfony_titles.py
More file actions
114 lines (81 loc) · 3.45 KB
/
get_symfony_titles.py
File metadata and controls
114 lines (81 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/python
"""
Simple script to get the latest three blog post titles from symfony.com/blog
This script demonstrates simple web scraping with:
- requests (third-party; install with `pip install requests`) for HTTP requests
- re (standard library) for regular expressions
Titles are pulled out of the HTML with regular expressions; html.parser is not used.
Usage: python3 get_symfony_titles.py
"""
import html
import re

import requests
_SYMFONY_BLOG_URL = 'https://symfony.com/blog'

# Candidate title patterns, tried in order. The first pattern that yields at
# least one usable title wins; later ones are only fallbacks in case the
# page markup differs from what the earlier patterns expect.
_TITLE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE | re.DOTALL)
    for pattern in (
        # H2 tags with links (common pattern)
        r'<h2[^>]*>\s*<a[^>]*>(.*?)</a>\s*</h2>',
        # H2 tags without links
        r'<h2[^>]*class="[^"]*title[^"]*"[^>]*>(.*?)</h2>',
        # Any element with title-related class
        r'class="[^"]*(?:post-title|entry-title|blog-title)[^"]*"[^>]*>(.*?)<',
        # Article headings
        r'<article[^>]*>.*?<h[23][^>]*>(.*?)</h[23]>',
    )
)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')

# Captures of this length or shorter are discarded as empty/short noise
# rather than treated as blog post titles.
_NOISE_MAX_LENGTH = 10


def _clean_title(fragment):
    """Convert a raw regex capture into plain, single-line title text."""
    text = _TAG_RE.sub('', fragment)
    # Decode entities only AFTER stripping tags, so an escaped "&lt;b&gt;"
    # that is part of the title text is not mistaken for markup.
    text = html.unescape(text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def _extract_titles(html_content, limit):
    """Return up to ``limit`` titles from the first pattern that yields any."""
    for pattern in _TITLE_PATTERNS:
        titles = []
        for match in pattern.finditer(html_content):
            title = _clean_title(match.group(1))
            if len(title) > _NOISE_MAX_LENGTH:
                titles.append(title)
                if len(titles) >= limit:
                    return titles
        if titles:
            # Stop at the first pattern that produced something usable.
            return titles
    return []


def _fetch_blog_html():
    """Download the blog page; return its HTML, or None after printing why not."""
    print("Fetching Symfony blog page...")
    try:
        response = requests.get(_SYMFONY_BLOG_URL, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Network error: {e}")
        return None
    if response.status_code != 200:
        print(f"Error: HTTP {response.status_code}")
        return None
    print("Page fetched successfully, extracting titles...")
    return response.text


def get_latest_symfony_blog_titles(*, html_content=None, limit=3):
    """
    Fetch and extract the latest blog post titles from symfony.com/blog.

    Args:
        html_content: Optional HTML of the blog page as a string. When given,
            no network request is made and the titles are extracted from this
            text instead (useful for offline use).
        limit: Maximum number of titles to return (default 3).

    Returns:
        list: Blog post titles in page order (at most ``limit``), with markup
        removed, HTML entities decoded and whitespace normalized. An empty
        list is returned when the page cannot be fetched, the server does not
        answer with HTTP 200, no pattern matches, or ``limit`` is not positive.
    """
    if limit <= 0:
        return []
    if html_content is None:
        html_content = _fetch_blog_html()
        if html_content is None:
            return []
    return _extract_titles(html_content, limit)
def main():
    """Print a banner, fetch the blog titles and report the outcome."""
    print("Symfony Blog Title Extractor")
    print("=" * 30)

    titles = get_latest_symfony_blog_titles()

    if not titles:
        # Nothing was extracted: list likely causes and suggested next steps.
        troubleshooting = (
            "\n❌ Could not extract blog titles.",
            "\nPossible reasons:",
            "- Network connectivity issues",
            "- Website structure has changed",
            "- Site is blocking automated requests",
            "- Temporary server issues",
            "\nTo fix this issue:",
            "1. Check your internet connection",
            "2. Try running the script again later",
            "3. Inspect the page HTML to update the regex patterns",
        )
        for line in troubleshooting:
            print(line)
        return

    separator = "-" * 40
    print(f"\nFound {len(titles)} blog post titles:")
    print(separator)
    # Number the titles starting from 1 for display.
    for position, title in enumerate(titles, start=1):
        print(f"{position}. {title}")
    print(separator)
    print("✅ Successfully extracted Symfony blog titles!")
# Run the extractor only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()