-
Notifications
You must be signed in to change notification settings - Fork 2k
Expand file tree
/
Copy pathextract_video_info.py
More file actions
150 lines (120 loc) · 6.15 KB
/
extract_video_info.py
File metadata and controls
150 lines (120 loc) · 6.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import requests
from bs4 import BeautifulSoup
import re
import json
import argparse
def _find_renderer(contents, key):
    """Return ``item[key]`` for the first dict in *contents* holding *key*, else None."""
    for item in contents:
        if isinstance(item, dict) and key in item:
            return item[key]
    return None


def _dig(node, *keys):
    """Follow *keys* through nested dicts; return None as soon as one is missing."""
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    return node


def get_video_info(url, *, timeout=10):
    """Extract video information from a YouTube watch page.

    The page is downloaded and the ``ytInitialData`` JSON object embedded in
    its HTML is decoded; all fields except the duration are read from that
    object.

    Args:
        url: URL of the YouTube video page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block forever.

    Returns:
        A dict with the following keys:

        * ``title``, ``views``, ``date_published`` - only present when a
          ``videoPrimaryInfoRenderer`` is found.
        * ``channel`` - dict with ``name``, ``url`` and ``subscribers``;
          only present when a ``videoSecondaryInfoRenderer`` is found.
        * ``description`` - the attributed description text, or
          ``"Description not available"``.
        * ``duration`` - ``"M:SS"`` built from ``approxDurationMs`` (minutes
          are not folded into hours), or ``"Duration not available"``.
        * ``tags`` - comma-separated keywords, or ``"No tags available"``.
        * ``likes`` - accessibility label of the like button, or
          ``"Likes count not available"``.
        * ``dislikes`` - always ``"UNKNOWN"``.

    Raises:
        Exception: On any failure (HTTP error, missing or malformed
            ``ytInitialData``, unexpected JSON layout). The message is
            prefixed with ``"Error extracting video info: "`` and the
            original exception is chained as ``__cause__``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Download the HTML; the timeout keeps a dead connection from hanging.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        html = response.text

        # Locate the start of the embedded JSON object. The lookahead keeps
        # the original requirement that the assignment is an object literal.
        marker = re.search(r'var ytInitialData\s*=\s*(?=\{)', html)
        if not marker:
            raise Exception("Could not find ytInitialData in page")
        # Let the JSON decoder find the end of the object instead of a
        # non-greedy regex, which would stop at the first "};" even when
        # that sequence sits inside a string value.
        data_json, _ = json.JSONDecoder().raw_decode(html[marker.end():])

        # Main content sections of the watch page.
        contents = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents']

        result = {}

        # Title, view count and publish date. The renderer is searched for
        # rather than assumed to sit at a fixed index.
        primary = _find_renderer(contents, 'videoPrimaryInfoRenderer')
        if primary is not None:
            result["title"] = primary['title']['runs'][0]['text']
            result["views"] = primary['viewCount']['videoViewCountRenderer']['viewCount']['simpleText']
            result["date_published"] = primary['dateText']['simpleText']

        # Channel information.
        secondary = _find_renderer(contents, 'videoSecondaryInfoRenderer')
        if secondary is not None:
            owner = secondary['owner']['videoOwnerRenderer']
            channel_id = owner['navigationEndpoint']['browseEndpoint']['browseId']
            result['channel'] = {
                'name': owner['title']['runs'][0]['text'],
                'url': f"https://www.youtube.com/channel/{channel_id}",
                'subscribers': owner['subscriberCountText']['accessibility']['accessibilityData']['label'],
            }

        # Video description.
        if secondary and 'attributedDescription' in secondary:
            result["description"] = secondary['attributedDescription']['content']
        else:
            result["description"] = "Description not available"

        # Duration is taken from the raw page text, not from ytInitialData.
        duration_match = re.search(r'"approxDurationMs":"(\d+)"', html)
        if duration_match:
            minutes, remainder_ms = divmod(int(duration_match.group(1)), 60000)
            result["duration"] = f"{minutes}:{remainder_ms // 1000:02d}"
        else:
            result["duration"] = "Duration not available"

        # Video tags, if available.
        video_tags = _dig(data_json, 'metadata', 'videoMetadataRenderer', 'keywords')
        result["tags"] = ', '.join(video_tags) if video_tags else "No tags available"

        # Likes: default first, then overwrite from any matching renderer.
        result["likes"] = "Likes count not available"
        result["dislikes"] = "UNKNOWN"  # YouTube no longer shows dislikes
        for content in contents:
            label = _dig(
                content,
                'compositeVideoPrimaryInfoRenderer',
                'likeButton',
                'toggleButtonRenderer',
                'defaultText',
                'accessibility',
                'accessibilityData',
                'label',
            )
            if isinstance(label, str) and 'like' in label.lower():
                result["likes"] = label

        return result
    except Exception as e:
        # Keep the original message format but preserve the cause for debugging.
        raise Exception(f"Error extracting video info: {str(e)}") from e
if __name__ == "__main__":
    # Command-line entry point: fetch one video's details and print them.
    import sys

    parser = argparse.ArgumentParser(description="YouTube Video Data Extractor")
    parser.add_argument("url", help="URL of the YouTube video")
    args = parser.parse_args()

    try:
        data = get_video_info(args.url)
    except Exception as e:
        # Report failures on stderr and exit non-zero so callers can detect them.
        print(f"Error: {e}", file=sys.stderr)
        print("\nNote: YouTube frequently changes its structure, so this script may need updates.", file=sys.stderr)
        sys.exit(1)

    # get_video_info only sets title/views/date_published/channel when the
    # corresponding renderer was found, so read every field defensively
    # instead of failing with a KeyError halfway through the output.
    missing = "Not available"
    channel = data.get('channel') or {}

    # print in nice format
    print(f"Title: {data.get('title', missing)}")
    print(f"Views: {data.get('views', missing)}")
    print(f"Published at: {data.get('date_published', missing)}")
    print(f"Video Duration: {data.get('duration', missing)}")
    print(f"Video tags: {data.get('tags', missing)}")
    print(f"Likes: {data.get('likes', missing)}")
    print(f"Dislikes: {data.get('dislikes', missing)}")
    print(f"\nDescription: {data.get('description', missing)}\n")
    print(f"\nChannel Name: {channel.get('name', missing)}")
    print(f"Channel URL: {channel.get('url', missing)}")
    print(f"Channel Subscribers: {channel.get('subscribers', missing)}")