-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathcache.py
More file actions
167 lines (148 loc) · 6.09 KB
/
Copy pathcache.py
File metadata and controls
167 lines (148 loc) · 6.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
from collections import OrderedDict
from dataclasses import dataclass
import logging
import os
import uuid
import json
import time
from pytubefix import YouTube
from modules.metrics import MetricsHandler
from urllib.parse import urlparse, parse_qs
@dataclass
class VideoInfo:
    """Metadata describing one downloaded video held by the cache."""

    # Location of the downloaded video file on disk.
    file_path: str
    # Thumbnail reference for the video (the cache stores the thumbnail URL).
    thumbnail: str
    # Title of the video.
    title: str
    # Size of the downloaded file in bytes; used for cache size accounting.
    size_bytes: int

    def __str__(self):
        """Return a short, log-friendly description of this entry."""
        # Only reference fields that exist on the dataclass: the previous
        # implementation read ``self.video_id``, which is not a field and
        # raised AttributeError whenever the object was printed or logged.
        return (
            f"VideoInfo(title={self.title}, file_path={self.file_path}, "
            f"size_bytes={self.size_bytes})"
        )
class Cache:
    """Size-bounded, least-recently-used cache of downloaded videos on disk.

    Entries live in an ``OrderedDict`` keyed by video id. ``find`` moves a hit
    to the end of the mapping and eviction pops from the front, so the front
    always holds the least recently used entry.
    """

    def __init__(
        self,
        file_path: str,
        cache_file: str = None,
        max_size_bytes: int = 2_000_000_000,
    ) -> None:
        """Create an empty cache.

        Args:
            file_path: Directory that downloaded video files are written to.
            cache_file: Path of the JSON file used by ``populate_cache`` and
                ``write_cache``. When ``None`` those two methods do nothing.
            max_size_bytes: Upper bound on the summed size of cached videos.
        """
        self.file_path = file_path
        self.max_size_bytes = max_size_bytes
        self.current_size_bytes = 0
        self.cache_file = cache_file
        self.video_id_to_path = OrderedDict()

    def add(self, url: str):
        """Download the 360p progressive stream of ``url`` into the cache.

        Older entries are evicted first when the new file would not fit. The
        download is skipped (with a log line) when no matching stream exists
        or when the file alone is larger than the whole cache.

        Args:
            url: URL of the video to download.

        Returns:
            Always ``None``; use ``find`` to obtain the cached file path.

        Raises:
            KeyError: If no video id can be extracted from ``url``.
        """
        youtube = YouTube(url)
        # Download video of set resolution
        stream = (
            youtube.streams.filter(
                resolution="360p",
                progressive=True,
            )
            .order_by("resolution")
            .desc()
            .first()
        )
        # ``first()`` yields None when the filter matched nothing.
        if stream is None:
            logging.info(
                f"No 360p progressive stream available for {url}. Caching cancelled."
            )
            return None

        filesize = stream.filesize
        if filesize > self.max_size_bytes:
            logging.info(
                f"Video size ({filesize} bytes) exceeds max cache size ({self.max_size_bytes} bytes). Caching cancelled."
            )
            return None

        # Resolve the key before downloading so that a URL we cannot key
        # fails without leaving an orphaned file on disk.
        video_id = self.get_video_id(url)

        if self.current_size_bytes + filesize > self.max_size_bytes:
            self._downsize_cache_to_target_bytes(self.max_size_bytes - filesize)
            self._publish_size_metrics()

        # A monotonic clock cannot go backwards while the download runs.
        start_time = time.monotonic()
        with MetricsHandler.download_time.time():
            stream.download(self.file_path)
        elapsed_seconds = time.monotonic() - start_time
        MetricsHandler.data_downloaded.inc(filesize)
        MetricsHandler.video_download_count.inc()

        # Store the file under a unique name so titles can never collide.
        video_file_path = os.path.join(self.file_path, str(uuid.uuid4()) + ".mp4")
        os.rename(
            os.path.join(self.file_path, stream.default_filename),
            video_file_path,
        )
        logging.info(f"downloaded {url} to path {video_file_path}")

        # Re-adding an id that is already cached must release the old file
        # and its bytes, otherwise both leak and the size counter drifts.
        previous = self.video_id_to_path.pop(video_id, None)
        if previous is not None:
            self.current_size_bytes -= previous.size_bytes
            self._remove_file(previous.file_path)

        # Reuse the YouTube object created above instead of building new ones
        # just to read the thumbnail and the title.
        video_info = VideoInfo(
            file_path=video_file_path,
            thumbnail=youtube.thumbnail_url,
            title=youtube.title,
            size_bytes=filesize,
        )
        self.video_id_to_path[video_id] = video_info
        self.current_size_bytes += video_info.size_bytes
        self._publish_size_metrics()

        # download rate is currently reported in bytes / second
        if elapsed_seconds > 0:
            MetricsHandler.download_rate.observe(filesize / elapsed_seconds)
        return None

    def find(self, video_id: str):
        """Return the cached file path for ``video_id``, or ``None`` on a miss.

        A hit marks the entry as most recently used. Hit and miss counters are
        updated accordingly.
        """
        if video_id in self.video_id_to_path:
            self.video_id_to_path.move_to_end(video_id)
            MetricsHandler.cache_hit_count.inc()
            return self.video_id_to_path[video_id].file_path
        MetricsHandler.cache_miss_count.inc()
        return None

    def _publish_size_metrics(self):
        """Report the current entry count and byte total to the metrics."""
        MetricsHandler.cache_size.set(len(self.video_id_to_path))
        MetricsHandler.cache_size_bytes.set(self.current_size_bytes)

    def _remove_file(self, path):
        """Delete ``path``, tolerating a file that is already gone."""
        try:
            os.remove(path)
        except FileNotFoundError:
            logging.warning(f"{path} was already missing from disk")

    def _downsize_cache_to_target_bytes(self, target_bytes: int):
        """Evict least recently used entries until at most ``target_bytes`` remain.

        Stops early when the cache runs out of entries, so a drifted byte
        counter cannot make the eviction loop raise.
        """
        logging.info(
            f"current size {self.current_size_bytes}, downsizing to {target_bytes}"
        )
        while self.video_id_to_path and self.current_size_bytes > target_bytes:
            _, evicted = self.video_id_to_path.popitem(last=False)
            self.current_size_bytes -= evicted.size_bytes
            self._remove_file(evicted.file_path)

    def clear(self):
        """Evict entries, deleting their files, until the tracked size is zero."""
        self._downsize_cache_to_target_bytes(0)

    def populate_cache(self):
        """Load cache entries from ``self.cache_file`` (best effort).

        Entries whose file no longer exists on disk are skipped. Any failure
        while reading or parsing is logged and otherwise ignored.
        """
        if self.cache_file is None:
            logging.info("no cache file configured, skipping cache load")
            return
        try:
            # open the file and read the data
            with open(self.cache_file, "r", encoding="utf-8") as f:
                # json.load converts the json data into python dictionary
                dict_data = json.load(f)
            # populate the cache
            for video_key, video_info in dict_data.items():
                if not os.path.exists(video_info["file_path"]):
                    logging.info(f"{video_info['file_path']} was not found on disk")
                    continue
                self.video_id_to_path[video_key] = VideoInfo(
                    file_path=video_info["file_path"],
                    thumbnail=video_info["thumbnail"],
                    title=video_info["title"],
                    size_bytes=video_info["size_bytes"],
                )
                self.current_size_bytes += video_info["size_bytes"]
            self._publish_size_metrics()
            logging.info(
                f"Read {len(self.video_id_to_path)} items from cache file {self.cache_file}"
            )
        except Exception:
            # Deliberately broad: a bad cache file must not stop the caller.
            logging.exception(f"unable to read cache data from {self.cache_file}")

    def write_cache(self):
        """Write the current cache entries to ``self.cache_file`` as JSON (best effort).

        Any failure while serializing or writing is logged and otherwise
        ignored.
        """
        if self.cache_file is None:
            logging.info("no cache file configured, skipping cache write")
            return
        try:
            # cache state
            cache_state = {
                video_id: {
                    "file_path": video_info.file_path,
                    "thumbnail": video_info.thumbnail,
                    "title": video_info.title,
                    "size_bytes": video_info.size_bytes,
                }
                for video_id, video_info in self.video_id_to_path.items()
            }
            # Serialize before opening the file so that a serialization
            # error cannot truncate an existing cache file.
            json_data = json.dumps(cache_state, indent=4)
            # open the file and write the data
            with open(self.cache_file, "w", encoding="utf-8") as f:
                f.write(json_data)
            logging.info(
                f"Wrote {len(cache_state)} items to cache file {self.cache_file}"
            )
        except Exception:
            # Deliberately broad: failing to persist must not stop the caller.
            logging.exception(f"unable to write cache data to {self.cache_file}")

    @staticmethod
    def get_video_id(url) -> str:
        """Extract the video id from a video URL.

        The ``v`` query parameter is preferred. As a fallback the id is read
        from the path of ``youtu.be/<id>`` links and of ``/shorts/<id>``,
        ``/embed/<id>`` and ``/live/<id>`` URLs.

        Raises:
            KeyError: If no video id can be found in ``url``.
        """
        parsed_url = urlparse(url)
        query_ids = parse_qs(parsed_url.query).get("v")
        if query_ids:
            return query_ids[0]

        path_parts = [part for part in parsed_url.path.split("/") if part]
        host = (parsed_url.hostname or "").lower()
        if host in ("youtu.be", "www.youtu.be"):
            if path_parts:
                return path_parts[0]
        elif len(path_parts) >= 2 and path_parts[0] in ("shorts", "embed", "live"):
            return path_parts[1]
        # Same exception the plain ``["v"]`` lookup used to raise.
        raise KeyError("v")