Skip to content

Commit 033ef53

Browse files
committed
Even more reliable latest updates search + BUILD
1 parent b588f8f commit 033ef53

4 files changed

Lines changed: 122 additions & 111 deletions

File tree

indexer/f95zone.py

Lines changed: 63 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import datetime as dt
55
import logging
66
import os
7+
import re
78
import sys
89

910
import aiohttp
@@ -47,52 +48,17 @@
4748

4849
HOST = "https://f95zone.to"
4950
THREAD_URL = f"{HOST}/threads/{{thread}}"
50-
VERCHK_URL = f"{HOST}/sam/checker.php?threads={{threads}}"
51-
SEARCH_URL = f"{HOST}/sam/latest_alpha/latest_data.php?cmd={{cmd}}&cat={{cat}}&page={{page}}&{{search}}={{query}}&sort={{sort}}&rows={{rows}}&_={{ts}}"
52-
LATEST_URL = f"{HOST}/sam/latest_alpha/latest_data.php?cmd={{cmd}}&cat={{cat}}&page={{page}}&sort={{sort}}&rows={{rows}}&_={{ts}}"
53-
LATEST_CATEGORIES = (
51+
BULK_VERSION_CHECK_URL = f"{HOST}/sam/checker.php?threads={{threads}}"
52+
LATEST_UPDATES_SEARCH_URL = f"{HOST}/sam/latest_alpha/latest_data.php?cmd={{cmd}}&cat={{cat}}&page={{page}}&{{search}}={{query}}&sort={{sort}}&rows={{rows}}&_={{ts}}"
53+
LATEST_UPDATES_URL = f"{HOST}/sam/latest_alpha/latest_data.php?cmd={{cmd}}&cat={{cat}}&page={{page}}&sort={{sort}}&rows={{rows}}&_={{ts}}"
54+
LATEST_UPDATES_CATEGORIES = (
5455
"games",
5556
"comics",
5657
"animations",
5758
"assets",
5859
# Doesn't seem to work
5960
# "mods",
6061
)
61-
LATEST_STOPWORDS = (
62-
"a",
63-
"is",
64-
"the",
65-
"an",
66-
"and",
67-
"are",
68-
"as",
69-
"at",
70-
"be",
71-
"but",
72-
"by",
73-
"for",
74-
"if",
75-
"in",
76-
"into",
77-
"it",
78-
"no",
79-
"not",
80-
"of",
81-
"on",
82-
"or",
83-
"such",
84-
"that",
85-
"their",
86-
"then",
87-
"there",
88-
"these",
89-
"they",
90-
"this",
91-
"to",
92-
"was",
93-
"will",
94-
"with",
95-
)
9662

9763

9864
@dataclasses.dataclass(slots=True)
@@ -181,3 +147,61 @@ def check_error(
181147
if isinstance(res, (asyncio.TimeoutError, aiohttp.ClientConnectionError)):
182148
logger.warning("F95zone temporarily unreachable")
183149
return ERROR_F95ZONE_UNAVAILABLE
150+
151+
152+
def latest_updates_search_sanitize_query(query: str) -> str:
    """Turn free text (e.g. a thread name) into a short latest updates search query.

    Steps, in order:
      1. Non-ASCII characters are replaced (they become "?").
      2. Punctuation in ``?&/':;-.+!~()`` is blanked out and whitespace runs
         are collapsed.
      3. Words found in ``redis_stopwords`` are dropped (case-insensitive).
      4. Remaining words are joined with single spaces until the result would
         exceed 30 characters. The word that crosses the limit is cut to fit
         and kept only if the cut piece (including its leading space) is
         longer than 3 characters and is not itself a stopword.

    Args:
        query: Raw text to sanitize.

    Returns:
        The sanitized query, at most 30 characters long; may be empty.
    """
    max_length = 30
    # NOTE(review): presumably these are words the search backend ignores,
    # going by the name — confirm against the backend's stopword list.
    redis_stopwords = frozenset(
        (
            "a",
            "is",
            "the",
            "an",
            "and",
            "are",
            "as",
            "at",
            "be",
            "but",
            "by",
            "for",
            "if",
            "in",
            "into",
            "it",
            "no",
            "not",
            "of",
            "on",
            "or",
            "such",
            "that",
            "their",
            "then",
            "there",
            "these",
            "they",
            "this",
            "to",
            "was",
            "will",
            "with",
        )
    )
    punctuation = "?&/':;-.+!~()"

    # Non-ASCII characters turn into "?" here, which is blanked out right below.
    query = query.encode("ascii", errors="replace").decode()
    # Single pass instead of one str.replace() per character. Dots need no
    # special handling: they become spaces and split() collapses the runs.
    query = query.translate(str.maketrans(punctuation, " " * len(punctuation)))
    words = [word for word in query.split() if word.lower() not in redis_stopwords]

    query = ""
    for word in words:
        append = f"{' ' if query else ''}{word}"
        if len(query) + len(append) > max_length:
            # This word crosses the limit: keep a cut-down piece only if it is
            # still meaningful, then stop regardless.
            append = append[: max_length - len(query)]
            if len(append) > 3 and append.strip().lower() not in redis_stopwords:
                query += append
            break
        query += append
    return query

indexer/scraper.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import dataclasses
33
import json
44
import logging
5-
import re
65
import time
76

87
from common import parser
@@ -53,7 +52,7 @@ async def thread(id: int) -> dict[str, str] | f95zone.IndexerError | None:
5352
version = ""
5453
try:
5554
async with f95zone.session.get(
56-
f95zone.VERCHK_URL.format(threads=id),
55+
f95zone.BULK_VERSION_CHECK_URL.format(threads=id),
5756
) as req:
5857
res = await req.read()
5958
except Exception as exc:
@@ -79,22 +78,12 @@ async def thread(id: int) -> dict[str, str] | f95zone.IndexerError | None:
7978

8079
# If tracked by latest updates, try to search the thread there to get more precise details
8180
if version:
82-
query = ret.name.encode("ascii", errors="replace").decode()
83-
query = re.sub(r"\.+ | \.+", " ", query)
84-
for char in "?&/':;-":
85-
query = query.replace(char, " ")
86-
query = re.sub(r"\s+", " ", query).strip()[:30]
87-
words = query.split(" ")
88-
for stopword in f95zone.LATEST_STOPWORDS:
89-
for word in words.copy():
90-
if word.lower() == stopword:
91-
words.remove(word)
92-
query = " ".join(words)
93-
for category in f95zone.LATEST_CATEGORIES:
81+
query = f95zone.latest_updates_search_sanitize_query(ret.name)
82+
for category in f95zone.LATEST_UPDATES_CATEGORIES:
9483

9584
try:
9685
async with f95zone.session.get(
97-
f95zone.SEARCH_URL.format(
86+
f95zone.LATEST_UPDATES_SEARCH_URL.format(
9887
cmd="list",
9988
cat=category,
10089
page=1,

indexer/watcher.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
)
1515

1616
WATCH_UPDATES_INTERVAL = dt.timedelta(minutes=5).total_seconds()
17-
WATCH_UPDATES_CATEGORIES = f95zone.LATEST_CATEGORIES
17+
WATCH_UPDATES_CATEGORIES = f95zone.LATEST_UPDATES_CATEGORIES
1818
WATCH_UPDATES_PAGES = 4
1919
WATCH_VERSIONS_INTERVAL = dt.timedelta(hours=12).total_seconds()
2020
WATCH_VERSIONS_CHUNK_SIZE = 1000
@@ -55,7 +55,7 @@ async def poll_updates():
5555

5656
try:
5757
async with f95zone.session.get(
58-
f95zone.LATEST_URL.format(
58+
f95zone.LATEST_UPDATES_URL.format(
5959
cmd="list",
6060
cat=category,
6161
page=page,
@@ -77,9 +77,7 @@ async def poll_updates():
7777
try:
7878
updates = json.loads(res)
7979
except Exception:
80-
raise Exception(
81-
f"Latest updates returned invalid JSON: {res}"
82-
)
80+
raise Exception(f"Latest updates returned invalid JSON: {res}")
8381
if index_error := f95zone.check_error(updates, logger):
8482
raise Exception(index_error)
8583

@@ -162,9 +160,7 @@ async def poll_updates():
162160
# )
163161
# continue
164162
else:
165-
logger.error(
166-
f"Error polling updates: {error.text()}\n{error.traceback()}"
167-
)
163+
logger.error(f"Error polling updates: {error.text()}\n{error.traceback()}")
168164

169165

170166
async def watch_updates():
@@ -179,9 +175,7 @@ async def poll_versions():
179175
try:
180176
logger.info("Poll versions start")
181177

182-
names = [
183-
n async for n in cache.redis.scan_iter("thread:*", 10000, "hash")
184-
]
178+
names = [n async for n in cache.redis.scan_iter("thread:*", 10000, "hash")]
185179
invalidate_cache = cache.redis.pipeline()
186180

187181
for names_chunk in chunks(names, WATCH_VERSIONS_CHUNK_SIZE):
@@ -198,7 +192,7 @@ async def poll_versions():
198192

199193
try:
200194
async with f95zone.session.get(
201-
f95zone.VERCHK_URL.format(threads=csv),
195+
f95zone.BULK_VERSION_CHECK_URL.format(threads=csv),
202196
) as req:
203197
# Await together for efficiency
204198
res, cached_data = await asyncio.gather(
@@ -246,9 +240,7 @@ async def poll_versions():
246240
if len(invalidate_cache):
247241
result = await invalidate_cache.execute()
248242
invalidated = sum(ret != "0" for ret in result)
249-
logger.warning(
250-
f"Versions: Invalidated cache for {invalidated} threads"
251-
)
243+
logger.warning(f"Versions: Invalidated cache for {invalidated} threads")
252244

253245
logger.info("Poll versions done")
254246

@@ -269,9 +261,7 @@ async def poll_versions():
269261
# )
270262
# continue
271263
else:
272-
logger.error(
273-
f"Error polling versions: {error.text()}\n{error.traceback()}"
274-
)
264+
logger.error(f"Error polling versions: {error.text()}\n{error.traceback()}")
275265

276266

277267
async def watch_versions():

modules/api.py

Lines changed: 47 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -450,53 +450,61 @@ def cleanup_temp_files():
450450
pass
451451

452452

453-
latest_updates_search_redis_stopwords = [
454-
"a",
455-
"is",
456-
"the",
457-
"an",
458-
"and",
459-
"are",
460-
"as",
461-
"at",
462-
"be",
463-
"but",
464-
"by",
465-
"for",
466-
"if",
467-
"in",
468-
"into",
469-
"it",
470-
"no",
471-
"not",
472-
"of",
473-
"on",
474-
"or",
475-
"such",
476-
"that",
477-
"their",
478-
"then",
479-
"there",
480-
"these",
481-
"they",
482-
"this",
483-
"to",
484-
"was",
485-
"will",
486-
"with",
487-
]
488453
def latest_updates_search_sanitize_query(query: str) -> str:
    """Turn free text (e.g. a thread name) into a short latest updates search query.

    Steps, in order:
      1. Non-ASCII characters are replaced (they become "?").
      2. Punctuation in ``?&/':;-.+!~()`` is blanked out and whitespace runs
         are collapsed.
      3. Words found in ``redis_stopwords`` are dropped (case-insensitive).
      4. Remaining words are joined with single spaces until the result would
         exceed 30 characters. The word that crosses the limit is cut to fit
         and kept only if the cut piece (including its leading space) is
         longer than 3 characters and is not itself a stopword.

    Args:
        query: Raw text to sanitize.

    Returns:
        The sanitized query, at most 30 characters long; may be empty.
    """
    max_length = 30
    # NOTE(review): presumably these are words the search backend ignores,
    # going by the name — confirm against the backend's stopword list.
    redis_stopwords = frozenset(
        (
            "a",
            "is",
            "the",
            "an",
            "and",
            "are",
            "as",
            "at",
            "be",
            "but",
            "by",
            "for",
            "if",
            "in",
            "into",
            "it",
            "no",
            "not",
            "of",
            "on",
            "or",
            "such",
            "that",
            "their",
            "then",
            "there",
            "these",
            "they",
            "this",
            "to",
            "was",
            "will",
            "with",
        )
    )
    punctuation = "?&/':;-.+!~()"

    # Non-ASCII characters turn into "?" here, which is blanked out right below.
    query = query.encode("ascii", errors="replace").decode()
    # Single pass instead of one str.replace() per character. Dots need no
    # special handling: they become spaces and split() collapses the runs.
    query = query.translate(str.maketrans(punctuation, " " * len(punctuation)))
    words = [word for word in query.split() if word.lower() not in redis_stopwords]

    query = ""
    for word in words:
        append = f"{' ' if query else ''}{word}"
        if len(query) + len(append) > max_length:
            # This word crosses the limit: keep a cut-down piece only if it is
            # still meaningful, then stop regardless.
            append = append[: max_length - len(query)]
            if len(append) > 3 and append.strip().lower() not in redis_stopwords:
                query += append
            break
        query += append
    return query
return query
501509

502510

0 commit comments

Comments
 (0)