-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathyoutube-transcript.py
More file actions
86 lines (76 loc) · 3.24 KB
/
youtube-transcript.py
File metadata and controls
86 lines (76 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import html
import sys

import click
from youtube_transcript_api import YouTubeTranscriptApi, FetchedTranscriptSnippet
# Return aggregated lines at the specified interval duration
def lines_by_dur(fetched_transcript, int_dur):
    """Yield transcript snippets merged into windows of ``int_dur`` seconds.

    The first window covers ``[0, int_dur)``. Every later window opens at the
    first snippet that fell outside the previous window and extends
    ``int_dur`` past that snippet's start.

    Args:
        fetched_transcript: Iterable of snippets exposing ``start`` and
            ``text`` attributes.
        int_dur: Window length, in the same unit as ``start``.

    Yields:
        FetchedTranscriptSnippet: One merged snippet per non-empty window.
        ``start`` is the start of the window's first snippet, ``text`` is the
        window's texts joined by single spaces, and ``duration`` is always 0.
    """
    # ``None`` (not 0) marks "no start recorded yet": a snippet that really
    # starts at 0 must not be mistaken for an unset value.
    group_start = None
    parts = []
    window_end = int_dur

    for snippet in fetched_transcript:
        if snippet.start < window_end:
            if group_start is None:
                group_start = snippet.start
            parts.append(snippet.text)
            continue

        # The snippet lies beyond the current window: flush what we have
        # (if anything) and open a new window anchored at this snippet.
        if parts:
            yield FetchedTranscriptSnippet(
                start=group_start, text=" ".join(parts), duration=0
            )
        group_start = snippet.start
        parts = [snippet.text]
        window_end = group_start + int_dur

    # Emit the trailing, possibly partial, window.
    if parts:
        yield FetchedTranscriptSnippet(
            start=group_start, text=" ".join(parts), duration=0
        )
def lines_by_words(fetched_transcript, word_count):
    """Yield transcript snippets merged into groups of fewer than ``word_count`` words.

    Snippets are accumulated until adding the next one would bring the group
    to ``word_count`` words or more; the group is then emitted and the next
    snippet starts a new group. A single snippet that alone reaches
    ``word_count`` forms a group by itself.

    Args:
        fetched_transcript: Iterable of snippets exposing ``start`` and
            ``text`` attributes.
        word_count: Word limit per group (words are whitespace-separated).

    Yields:
        FetchedTranscriptSnippet: One merged snippet per group. ``start`` is
        the start of the group's first snippet, ``text`` is the group's texts
        joined by single spaces, and ``duration`` is always 0.
    """
    group_start = None
    parts = []
    words_in_group = 0

    for snippet in fetched_transcript:
        snippet_words = len(snippet.text.split())

        # Flush only a non-empty group, so an oversized first snippet does
        # not produce an empty entry.
        if parts and words_in_group + snippet_words >= word_count:
            yield FetchedTranscriptSnippet(
                start=group_start, text=" ".join(parts), duration=0
            )
            parts = []
            words_in_group = 0

        if not parts:
            group_start = snippet.start
        parts.append(snippet.text)
        words_in_group += snippet_words

    # Emit the trailing group; without this the end of the transcript is lost.
    if parts:
        yield FetchedTranscriptSnippet(
            start=group_start, text=" ".join(parts), duration=0
        )
@click.command()
@click.option("--interval-duration", "--id", "-d", default=60, help="Interval duration for generating timestamps, use 0 to disable and produce raw output")
@click.option("--word-count", "--wc", "-w", default=0, help="Number of words for generating timestamps, applicable when --interval-duration=0")
@click.option("--timestamp-in-seconds", "--tis", "-s", is_flag=True, default=False, help="Specify to see duration in seconds instead of min:sec")
@click.argument("video-id")
def main(interval_duration, timestamp_in_seconds, word_count, video_id):
    """Print the transcript of YouTube video VIDEO_ID as an HTML definition list.

    Each entry's timestamp links to that moment in the video.
    """
    # NOTE: click displays the docstring above as the command's --help text.

    # Retrieve the transcript for the requested video.
    ytt_api = YouTubeTranscriptApi()
    fetched_transcript = ytt_api.fetch(video_id)

    # Pick the grouping strategy: time windows take precedence, then word
    # counts, otherwise the raw snippets are printed as fetched.
    if interval_duration > 0:
        lines_iter = lines_by_dur(fetched_transcript, interval_duration)
    elif word_count > 0:
        lines_iter = lines_by_words(fetched_transcript, word_count)
    else:
        lines_iter = fetched_transcript

    print("<html><body><dl>")
    for snippet in lines_iter:
        start_sec = int(snippet.start)
        link_to_tstmp = f"https://youtu.be/{video_id}?t={start_sec}"
        if timestamp_in_seconds:
            tstmp_str = str(start_sec)
        else:
            st_min, st_sec = divmod(start_sec, 60)
            # Zero-pad seconds so 1:05 is not rendered as "1:5"; pad minutes
            # with non-breaking spaces so the alignment survives HTML
            # whitespace collapsing.
            tstmp_str = f"{st_min:2d}:{st_sec:02d}".replace(" ", "&nbsp;")
        # Escape everything interpolated into markup: transcript text may
        # contain characters such as "&" or "<".
        print(
            '<dt><a href="%s">%s</a></dt><dd>%s</dd>'
            % (
                html.escape(link_to_tstmp, quote=True),
                tstmp_str,
                html.escape(snippet.text),
            )
        )
    # Close the tags in the reverse order of opening.
    print("</dl></body></html>")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()