Skip to content

Commit a9029f6

Browse files
authored
Add scan cancellation and add application log viewer (#8)
* Add the ability to cancel a scan. * Improve logging and provide a view into the current logs in the application.
1 parent be06104 commit a9029f6

15 files changed

Lines changed: 1232 additions & 51 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ Tags are applied (or updated) every time the library is rescanned. Tags set via
264264
| `DATA_PATH` | `/data` | Path for the database, thumbnails, and search cache |
265265
| `WORKERS` | `2` | Number of uvicorn worker processes |
266266
| `VALKEY_URL` || Optional Redis-compatible cache URL for rendered page images (e.g. `redis://valkey:6379/0`) |
267+
| `LOG_LEVEL` | `info` | Optional console/Docker log verbosity: `debug`, `info`, `warning`, `error`, or `critical`. The in-app Logs tab (Settings → Logs) always captures entries at `debug` level and above, regardless of this setting. |
267268

268269
### Volumes
269270

backend/config.py

Lines changed: 117 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
"""Shared configuration, database, and cache setup for Grimoire."""
22
import os
33
import logging
4+
import collections
5+
import threading
6+
import datetime
47
from typing import Optional
58
from .models import init_db
69

@@ -14,8 +17,121 @@
1417
VALKEY_URL = os.environ.get("VALKEY_URL", "")
1518
_PAGE_CACHE_HEADERS = {"Cache-Control": "max-age=31536000, immutable"}
1619

17-
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s: %(message)s")
20+
# Console log level is controlled by the LOG_LEVEL env var (default: info).
# The root logger itself runs at DEBUG so that any handler attached to it can
# see debug records; only the handlers present after basicConfig() are
# narrowed to the configured console level.
_LOG_LEVEL_NAME = os.environ.get("LOG_LEVEL", "info").strip().upper()
_CONSOLE_LEVEL = getattr(logging, _LOG_LEVEL_NAME, None)
# getattr() can resolve to a non-level attribute of the logging module (for
# example LOG_LEVEL=basic_format yields the BASIC_FORMAT string), which would
# make setLevel() raise at import time. Accept integer levels only and fall
# back to INFO for anything else, including unknown or empty names.
if not isinstance(_CONSOLE_LEVEL, int) or isinstance(_CONSOLE_LEVEL, bool):
    _CONSOLE_LEVEL = logging.INFO

_LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
logging.basicConfig(level=logging.DEBUG, format=_LOG_FORMAT)

# Third-party loggers are capped at WARNING so their chatter is not recorded
# now that the root logger accepts DEBUG.
for _noisy in ("uvicorn", "uvicorn.access", "uvicorn.error", "fastapi", "sqlalchemy.engine"):
    logging.getLogger(_noisy).setLevel(logging.WARNING)

# Apply the console verbosity to every handler currently on the root logger.
for _h in logging.root.handlers:
    _h.setLevel(_CONSOLE_LEVEL)

logger = logging.getLogger("grimoire")
logger.setLevel(logging.DEBUG)

# Maximum number of entries retained by the in-memory log ring buffer.
_LOG_BUFFER_MAX = 20000

# Monotonically increasing sequence number assigned to buffered log entries.
_seq_counter = 0
41+
42+
43+
class _LogEntry:
    """One buffered log record: sequence number, timestamp, level, logger name, message."""
    __slots__ = ("seq", "timestamp", "level", "logger", "message")

    def __init__(self, seq: int, timestamp: str, level: str, logger_name: str, message: str):
        # The constructor argument is `logger_name`, but it is stored (and
        # later serialised) under the attribute/key `logger`.
        self.seq, self.timestamp, self.level = seq, timestamp, level
        self.logger, self.message = logger_name, message

    def to_dict(self) -> dict:
        """Return the entry as a plain dict keyed by slot name, in slot order."""
        return {field: getattr(self, field) for field in self.__slots__}
62+
63+
64+
class _MemoryLogHandler(logging.Handler):
    """Thread-safe ring-buffer log handler for in-app log viewing.

    Records are converted to `_LogEntry` objects and kept in a bounded deque;
    once `maxlen` entries are stored, appending a new one discards the oldest.
    """

    def __init__(self, maxlen: int = _LOG_BUFFER_MAX):
        super().__init__(level=logging.DEBUG)
        self._buf: collections.deque[_LogEntry] = collections.deque(maxlen=maxlen)
        self._lock = threading.Lock()

    @staticmethod
    def _level_no(level_name: str) -> int:
        """Map a stored level name back to its numeric level.

        `logging.getLevelName()` returns an int for registered names but a
        string for anything else. Records logged at an unregistered numeric
        level carry the name "Level <n>", so that form is parsed explicitly.
        Names that still cannot be resolved are treated as DEBUG so the entry
        stays visible at the most verbose filter instead of raising.
        """
        resolved = logging.getLevelName(level_name)
        if isinstance(resolved, int):
            return resolved
        prefix = "Level "
        if level_name.startswith(prefix):
            try:
                return int(level_name[len(prefix):])
            except ValueError:
                pass
        return logging.DEBUG

    def _matching(self, entries: list, min_level: int) -> list:
        """Return the entries whose level is at least `min_level`, order preserved."""
        verdicts: dict[str, bool] = {}  # per-call cache: level name -> passes filter
        kept = []
        for entry in entries:
            keep = verdicts.get(entry.level)
            if keep is None:
                keep = verdicts[entry.level] = self._level_no(entry.level) >= min_level
            if keep:
                kept.append(entry)
        return kept

    def emit(self, record: logging.LogRecord) -> None:
        """Append `record` to the ring buffer with the next sequence number."""
        global _seq_counter
        try:
            ts = datetime.datetime.fromtimestamp(
                record.created, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
            # Format before taking the lock: formatting can run arbitrary
            # __str__/__repr__ code, and if that code logs on this thread the
            # non-reentrant lock would deadlock.
            message = self.format(record)
            with self._lock:
                _seq_counter += 1
                self._buf.append(
                    _LogEntry(
                        seq=_seq_counter,
                        timestamp=ts,
                        level=record.levelname,
                        logger_name=record.name,
                        message=message,
                    )
                )
        except Exception:
            self.handleError(record)

    def get_entries(
        self,
        min_level: int = logging.DEBUG,
        limit: int = 500,
        offset: int = 0,
        after_seq: int = 0,
    ) -> tuple[list[dict], int]:
        """Return entries in oldest-to-newest order and the current max seq.

        When `after_seq` > 0, returns only entries with seq > after_seq,
        ignoring `offset`. If more than `limit` such entries exist, the newest
        `limit` of them are returned. This is the path used for live polling.

        When `after_seq` == 0 (initial / historical load), `offset` is counted
        from the newest end: offset=0 → most-recent `limit` entries,
        offset=limit → next-older page, etc. An `offset` at or beyond the
        number of matching entries yields an empty list; a negative `offset`
        is treated as 0.

        A `limit` <= 0 always yields an empty list.

        Returns (entries_list, max_seq_in_buffer); max seq is 0 when the
        buffer is empty.
        """
        # Copy under the lock, filter outside it, so emit() is blocked only
        # for the duration of the copy.
        with self._lock:
            snapshot = list(self._buf)
        max_seq = snapshot[-1].seq if snapshot else 0

        if limit <= 0:
            return [], max_seq

        matching = self._matching(snapshot, min_level)

        if after_seq > 0:
            fresh = [e for e in matching if e.seq > after_seq]
            return [e.to_dict() for e in fresh[-limit:]], max_seq

        end = len(matching) - max(0, offset)
        if end <= 0:
            # Paged past the oldest entry; a negative slice bound here would
            # otherwise select unrelated entries.
            return [], max_seq
        start = max(0, end - limit)
        return [e.to_dict() for e in matching[start:end]], max_seq

    def get_total(self, min_level: int = logging.DEBUG) -> int:
        """Return how many buffered entries have a level of at least `min_level`."""
        with self._lock:
            snapshot = list(self._buf)
        return len(self._matching(snapshot, min_level))

    def clear(self) -> None:
        """Drop every buffered entry. The sequence counter is left untouched."""
        with self._lock:
            self._buf.clear()
129+
130+
131+
# Install the in-memory handler on the root logger. Its formatter emits only
# the message text, because timestamp, level and logger name are stored as
# separate fields on each buffered entry.
_memory_handler = _MemoryLogHandler()
_memory_handler.setFormatter(logging.Formatter("%(message)s"))

logging.getLogger().addHandler(_memory_handler)
19135

20136
os.makedirs(DATA_PATH, exist_ok=True)
21137
os.makedirs(THUMB_DIR, exist_ok=True)
@@ -34,7 +150,6 @@ def get_db():
34150
db.close()
35151

36152

37-
# Optional Valkey page cache
38153
_valkey: Optional[object] = None
39154
if VALKEY_URL:
40155
try:

backend/indexer.py

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,12 @@
2020
_FITZ_TIMEOUT = 300 # seconds
2121

2222

23-
def _fitz_open_with_timeout(filepath: str, timeout: int = _FITZ_TIMEOUT):
24-
"""Open a PDF with fitz, raising TimeoutError if it hangs beyond `timeout` seconds."""
23+
def _fitz_open_with_timeout(filepath: str, timeout: int = _FITZ_TIMEOUT, should_stop=None):
24+
"""Open a PDF with fitz, raising TimeoutError if it hangs beyond `timeout` seconds.
25+
26+
If `should_stop` callable is provided, the wait is interrupted early when it
27+
returns True, raising TimeoutError so the caller can exit cleanly.
28+
"""
2529
result = [None]
2630
exc = [None]
2731

@@ -33,7 +37,14 @@ def _open():
3337

3438
t = threading.Thread(target=_open, daemon=True)
3539
t.start()
36-
t.join(timeout)
40+
deadline = timeout
41+
poll_interval = 0.5 # check stop flag every 500ms
42+
elapsed = 0.0
43+
while t.is_alive() and elapsed < deadline:
44+
t.join(poll_interval)
45+
elapsed += poll_interval
46+
if should_stop and should_stop():
47+
raise TimeoutError(f"fitz.open() aborted by stop request for {filepath}")
3748
if t.is_alive():
3849
raise TimeoutError(f"fitz.open() timed out after {timeout}s for {filepath}")
3950
if exc[0] is not None:
@@ -85,12 +96,12 @@ def guess_category(filepath: str) -> str:
8596
return "core"
8697

8798

88-
def generate_thumbnail(filepath: str, output_path: str, size: tuple = (300, 400)) -> bool:
99+
def generate_thumbnail(filepath: str, output_path: str, size: tuple = (300, 400), should_stop=None) -> bool:
89100
"""Generate a thumbnail from the first page of a PDF or from an image."""
90101
try:
91102
ext = Path(filepath).suffix.lower()
92103
if ext == ".pdf":
93-
doc = _fitz_open_with_timeout(filepath)
104+
doc = _fitz_open_with_timeout(filepath, should_stop=should_stop)
94105
if len(doc) == 0:
95106
return False
96107
page = doc[0]
@@ -114,11 +125,11 @@ def generate_thumbnail(filepath: str, output_path: str, size: tuple = (300, 400)
114125
return False
115126

116127

117-
def extract_text_from_pdf(filepath: str) -> list[dict]:
128+
def extract_text_from_pdf(filepath: str, should_stop=None) -> list[dict]:
118129
"""Extract text from all pages of a PDF. Returns list of {page, content}."""
119130
pages = []
120131
try:
121-
doc = _fitz_open_with_timeout(filepath)
132+
doc = _fitz_open_with_timeout(filepath, should_stop=should_stop)
122133
for i, page in enumerate(doc):
123134
page_text = page.get_text().strip()
124135
if page_text:
@@ -140,11 +151,13 @@ def _count_eligible_files(directory: Path, extensions: set) -> int:
140151
return count
141152

142153

143-
def scan_library(library_path: str, data_path: str, session: Session, on_progress=None):
154+
def scan_library(library_path: str, data_path: str, session: Session, on_progress=None, should_stop=None):
144155
"""Scan the library directory and register all files in the database.
145156
146157
on_progress(scanned_books, total_books, scanned_maps, total_maps, scanned_tokens, total_tokens)
147158
is called after each file is processed if provided.
159+
160+
should_stop() is an optional callable that returns True when the scan should abort early.
148161
"""
149162
library = Path(library_path)
150163
books_dir = library / "books"
@@ -214,13 +227,18 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
214227
scanned_tokens,
215228
total_tokens,
216229
)
230+
if should_stop and should_stop():
231+
logger.info("scan_library: stop requested during books scan.")
232+
return stats
217233

218234
relative_path = os.path.relpath(filepath, library_path)
219235

220236
existing = session.query(Book).filter_by(filepath=filepath).first()
221237
if existing:
238+
logger.debug(f"File scan: already registered, skipping: {filename}")
222239
continue
223240

241+
logger.debug(f"File scan: new book found: {filename}")
224242
category = guess_category(relative_path)
225243
title = Path(filename).stem.replace("_", " ").replace("-", " ").strip()
226244

@@ -246,12 +264,12 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
246264
"books",
247265
f"{slugify(title)}_{hashlib.md5(filepath.encode()).hexdigest()[:8]}.webp",
248266
)
249-
if generate_thumbnail(filepath, thumb_path):
267+
if generate_thumbnail(filepath, thumb_path, should_stop=should_stop):
250268
book.has_thumbnail = True
251269

252270
if ext == ".pdf":
253271
try:
254-
doc = _fitz_open_with_timeout(filepath)
272+
doc = _fitz_open_with_timeout(filepath, should_stop=should_stop)
255273
book.page_count = len(doc)
256274
doc.close()
257275
except Exception as e:
@@ -291,13 +309,18 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
291309
scanned_tokens,
292310
total_tokens,
293311
)
312+
if should_stop and should_stop():
313+
logger.info("scan_library: stop requested during maps scan.")
314+
return stats
294315

295316
relative_path = os.path.relpath(filepath, library_path)
296317

297318
existing = session.query(GenericMap).filter_by(filepath=filepath).first()
298319
if existing:
320+
logger.debug(f"File scan: already registered, skipping: {filename}")
299321
continue
300322

323+
logger.debug(f"File scan: new map found: {filename}")
301324
title = Path(filename).stem.replace("_", " ").replace("-", " ").strip()
302325

303326
try:
@@ -318,7 +341,7 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
318341
"maps",
319342
f"{slugify(title)}_{hashlib.md5(filepath.encode()).hexdigest()[:8]}.webp",
320343
)
321-
if generate_thumbnail(filepath, thumb_path):
344+
if generate_thumbnail(filepath, thumb_path, should_stop=should_stop):
322345
gmap.has_thumbnail = True
323346

324347
session.add(gmap)
@@ -353,13 +376,18 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
353376
scanned_tokens,
354377
total_tokens,
355378
)
379+
if should_stop and should_stop():
380+
logger.info("scan_library: stop requested during tokens scan.")
381+
return stats
356382

357383
relative_path = os.path.relpath(filepath, library_path)
358384

359385
existing = session.query(Token).filter_by(filepath=filepath).first()
360386
if existing:
387+
logger.debug(f"File scan: already registered, skipping: {filename}")
361388
continue
362389

390+
logger.debug(f"File scan: new token found: {filename}")
363391
title = Path(filename).stem.replace("_", " ").replace("-", " ").strip()
364392

365393
try:
@@ -380,7 +408,7 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
380408
"tokens",
381409
f"{slugify(title)}_{hashlib.md5(filepath.encode()).hexdigest()[:8]}.webp",
382410
)
383-
if generate_thumbnail(filepath, thumb_path, size=(200, 200)):
411+
if generate_thumbnail(filepath, thumb_path, size=(200, 200), should_stop=should_stop):
384412
token.has_thumbnail = True
385413

386414
session.add(token)
@@ -499,12 +527,12 @@ def _apply_tags_from_library(library_path: str, session: Session) -> None:
499527
session.commit()
500528

501529

502-
def index_book_text(book: Book, data_path: str, session: Session):
530+
def index_book_text(book: Book, data_path: str, session: Session, should_stop=None):
503531
"""Extract and index text from a PDF for full-text search."""
504532
if book.indexed or book.index_failed or book.mime_type != "application/pdf":
505533
return False
506534

507-
pages = extract_text_from_pdf(book.filepath)
535+
pages = extract_text_from_pdf(book.filepath, should_stop=should_stop)
508536
if not pages:
509537
book.index_error = "No text extracted"
510538
book.index_failed = True
@@ -523,5 +551,5 @@ def index_book_text(book: Book, data_path: str, session: Session):
523551
book.indexed = True
524552
book.index_error = ""
525553
session.commit()
526-
logger.info(f"Indexed {len(pages)} pages for: {book.title}")
554+
logger.info(f"Indexed {len(pages)} pages for: {book.filename} ('{book.title}')")
527555
return True

backend/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from .routers import maintenance as maintenance_router
2525
from .routers import settings as settings_router
2626
from .routers import campaigns as campaigns_router
27+
from .routers import logs as logs_router
2728
from .routers.library import run_rescan_sync
2829
from . import scheduler
2930
from . import session_creator
@@ -63,6 +64,7 @@
6364
},
6465
{"name": "settings", "description": "Application settings. **Admin only.**"},
6566
{"name": "maintenance", "description": "Admin housekeeping tasks."},
67+
{"name": "logs", "description": "Application log retrieval. **Admin only.**"},
6668
]
6769

6870

@@ -139,6 +141,7 @@ def do_scan():
139141
api.include_router(maintenance_router.router)
140142
api.include_router(settings_router.router)
141143
api.include_router(campaigns_router.router)
144+
api.include_router(logs_router.router)
142145
app.include_router(api)
143146

144147

0 commit comments

Comments
 (0)