-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmain.py
More file actions
executable file
·108 lines (90 loc) · 3.47 KB
/
main.py
File metadata and controls
executable file
·108 lines (90 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
import redis
import requests
import json
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import math
from punctuators.models import PunctCapSegModelONNX
import os
from dotenv import load_dotenv
# Load variables from a .env file (when one exists) into the process
# environment before the settings below are read.
load_dotenv()

# Runtime configuration taken from the environment. Each value is None when
# the corresponding variable is not set.
ARGUFLOW_API_KEY = os.getenv("ARGUFLOW_API_KEY")
ARGUFLOW_API_URL = os.getenv("ARGUFLOW_API_URL")
REDIS_URL = os.getenv("REDIS_URL")
class Card:
    """A transcript excerpt plus the video metadata that is posted with it.

    Every instance attribute is serialized by ``to_json`` and sent to the
    ``/card`` endpoint by ``send_post_request``.
    """

    # Seconds to wait for the API before giving up. Without a timeout
    # ``requests`` waits indefinitely and a stalled server would hang the
    # worker. Kept on the class (not the instance) so it is not serialized
    # by ``to_json``, which dumps the instance ``__dict__``.
    REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self, card_html, link, metadata_dict, time_stamp):
        """Store the card fields.

        Args:
            card_html: Text content of the card.
            link: URL the card points to.
            metadata_dict: Non-empty mapping of metadata for the card.
            time_stamp: Timestamp value sent along with the card.

        Raises:
            SystemExit: With exit status 1 when ``metadata_dict`` is empty or
                otherwise falsy.
        """
        self.card_html = card_html
        self.link = link
        self.metadata = metadata_dict
        self.time_stamp = time_stamp
        if not self.metadata:
            print("Missing metadata.")
            # The bare ``exit`` helper is injected by the ``site`` module for
            # interactive use and is not guaranteed to exist; raising
            # SystemExit directly terminates with the same status.
            raise SystemExit(1)

    def to_json(self):
        """Return the instance attributes as a JSON string with sorted keys.

        ``None`` and NaN values are replaced by empty strings at any nesting
        depth inside dicts, lists and tuples. Values ``json`` cannot encode
        natively are converted with ``str``.
        """

        def replace_nan_none(obj):
            if obj is None:
                return ""
            if isinstance(obj, float) and math.isnan(obj):
                return ""
            if isinstance(obj, dict):
                return {key: replace_nan_none(value) for key, value in obj.items()}
            # Tuples are emitted as JSON arrays too, so they need the same
            # clean-up; otherwise a nested NaN is written as a bare ``NaN``
            # token, which is not valid JSON.
            if isinstance(obj, (list, tuple)):
                return [replace_nan_none(item) for item in obj]
            return obj

        json_dict = {
            key: replace_nan_none(value) for key, value in self.__dict__.items()
        }
        return json.dumps(json_dict, sort_keys=True, default=str)

    def send_post_request(self):
        """POST the serialized card to ``{ARGUFLOW_API_URL}/card``.

        A response whose status code is not 200 is reported by printing the
        response body; nothing is raised for it and nothing is returned.

        Raises:
            requests.exceptions.RequestException: If the request cannot be
                completed, including when no response arrives within
                ``REQUEST_TIMEOUT_SECONDS``.
        """
        url = f"{ARGUFLOW_API_URL}/card"
        payload = self.to_json()
        headers = {"Content-Type": "application/json", "Authorization": ARGUFLOW_API_KEY}
        req_result = requests.post(
            url,
            data=payload,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        if req_result.status_code != 200:
            print(req_result.text)
# Punctuation/capitalization model, created once at module load and used by
# the processing loop (via `m.infer`) to punctuate raw transcript text.
# NOTE(review): from_pretrained presumably fetches/loads the named pretrained
# model, which may be slow on first run — confirm against the punctuators docs.
m: PunctCapSegModelONNX = PunctCapSegModelONNX.from_pretrained(
"1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"
)
def pop_in_progress():
    """Move one member from the "in-progress" set to the "completed" set.

    The pop and the add are done inside a single Lua script evaluated through
    the module-level Redis client ``r``, so both steps run as one server-side
    script call.

    Returns:
        The member that was popped, or a falsy value when the "in-progress"
        set had nothing to pop.
    """
    # KEYS[1] is the set popped from, KEYS[2] the set the member is added to.
    move_script = (
        "\n"
        "local item = redis.call('spop', KEYS[1])\n"
        "if item then\n"
        "redis.call('sadd', KEYS[2], item)\n"
        "end\n"
        "return item\n"
    )
    return r.eval(move_script, 2, "in-progress", "completed")
# Number of consecutive transcript segments merged into a single card.
SEGMENTS_PER_CARD = 30

# Shared Redis client; pop_in_progress() reads this module-level name.
r = redis.from_url(url=REDIS_URL, decode_responses=True, db=0)
# Check the connection up front so a bad REDIS_URL fails before any work starts.
r.ping()

# Worker loop: take queue entries one at a time until none are left, and
# post one card per group of transcript segments for each video.
video_data = pop_in_progress()
while video_data:
    # Entries have the form "<video id>||<title>". partition() never raises on
    # an entry without the separator (the title is then empty) and keeps any
    # further "||" that is part of the title itself.
    video_id, _, video_title = video_data.partition("||")
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    print(f"Processing {video_id}||{video_title}")
    try:
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id=video_id, languages=("en", "en-US")
        )
        info = YouTube(video_url)

        # Video-level metadata is identical for every card of this video, so
        # it is built once instead of once per chunk.
        # NOTE(review): if pytube yields no publish date this raises and the
        # whole video is skipped by the handler below — confirm that is the
        # intended outcome.
        uploaded_at = info.publish_date.strftime("%Y-%m-%d %H:%M:%S")
        metadata = {
            "Title": info.title,
            "Description": info.description,
            "Thumbnail": info.thumbnail_url,
            "Channel": info.author,
            "Duration": info.length,
            "Uploaded At": uploaded_at,
        }

        for start in range(0, len(transcript), SEGMENTS_PER_CARD):
            # Slicing clamps at the end of the list, so the final chunk may
            # simply be shorter.
            chunk = transcript[start:start + SEGMENTS_PER_CARD]
            text = " ".join(segment["text"] for segment in chunk).replace("\n", " ")
            # One input text is passed, so the first result is the one wanted.
            punctuated_text = m.infer(texts=[text], apply_sbd=False)[0]
            # Link to the moment in the video where this chunk begins.
            link = video_url + f"&t={math.floor(chunk[0]['start'])}"
            card = Card(punctuated_text, link, metadata, uploaded_at)
            card.send_post_request()
    except Exception as e:
        # Best-effort worker: report the failure and move on to the next
        # video rather than stopping the whole run.
        print("Error: " + str(e))
    video_data = pop_in_progress()