Skip to content

Commit dde83f6

Browse files
committed
lastgenre: Genre ignorelist based on artist
Prevents wrong last.fm genres by ignoring them according to a per-artist (or global) list of regex patterns.
- Genre _ignoring_ happens mainly in two places:
  - right after fetching from last.fm
  - in `_resolve_genres` (via `_filter_valid` or directly)
- As a fallback, literal string matching is used when a pattern is not a valid regex.

New methods:
- `_artist_for_filter` determines which (album)artist attribute is the right one for a given stage, since the ignorelist is artist-based.
- `is_ignored` and `drop_ignored_genres`
- `_load_ignorelist` uses confuse mechanisms to load the patterns for each artist and provides them to the plugin as `self.ignore_patterns`.
1 parent 43e13dc commit dde83f6

3 files changed

Lines changed: 226 additions & 37 deletions

File tree

beetsplug/lastgenre/__init__.py

Lines changed: 142 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,19 @@
2525
from __future__ import annotations
2626

2727
import os
28+
import re
29+
from collections import defaultdict
2830
from functools import singledispatchmethod
2931
from pathlib import Path
3032
from typing import TYPE_CHECKING, Any
3133

34+
import confuse
3235
import yaml
3336

3437
from beets import config, library, plugins, ui
3538
from beets.library import Album, Item
3639
from beets.util import plurality, unique_list
40+
from beetsplug.lastgenre.utils import drop_ignored_genres, is_ignored
3741

3842
from .client import LastFmClient
3943

@@ -44,6 +48,8 @@
4448
from beets.importer import ImportSession, ImportTask
4549
from beets.library import LibModel
4650

51+
from .utils import GenreIgnorePatterns
52+
4753
Whitelist = set[str]
4854
"""Set of valid genre names (lowercase). Empty set means all genres allowed."""
4955

@@ -130,6 +136,7 @@ def __init__(self) -> None:
130136
"prefer_specific": False,
131137
"title_case": True,
132138
"pretend": False,
139+
"ignorelist": {},
133140
}
134141
)
135142
self.setup()
@@ -139,12 +146,13 @@ def setup(self) -> None:
139146
if self.config["auto"]:
140147
self.import_stages = [self.imported]
141148

142-
self.client = LastFmClient(
143-
self._log, self.config["min_weight"].get(int)
144-
)
145149
self.whitelist: Whitelist = self._load_whitelist()
146150
self.c14n_branches: CanonTree
147151
self.c14n_branches, self.canonicalize = self._load_c14n_tree()
152+
self.ignore_patterns: GenreIgnorePatterns = self._load_ignorelist()
153+
self.client = LastFmClient(
154+
self._log, self.config["min_weight"].get(int), self.ignore_patterns
155+
)
148156

149157
def _load_whitelist(self) -> Whitelist:
150158
"""Load the whitelist from a text file.
@@ -187,6 +195,57 @@ def _load_c14n_tree(self) -> tuple[CanonTree, bool]:
187195
flatten_tree(genres_tree, [], c14n_branches)
188196
return c14n_branches, canonicalize
189197

198+
def _load_ignorelist(self) -> GenreIgnorePatterns:
199+
r"""Load patterns from configuration and compile them.
200+
201+
Mapping of artist names to regex or literal patterns. Use the
202+
quoted ``'*'`` key to define globally ignored genres::
203+
204+
lastgenre:
205+
ignorelist:
206+
'*':
207+
- spoken word
208+
- comedy
209+
Artist Name:
210+
- .*rock.*
211+
- .*metal.*
212+
213+
Matching is case-insensitive and full-match. Because patterns are
214+
parsed as plain YAML scalars, backslashes (e.g. ``\w``) should
215+
not be double-escaped. Quotes are primarily needed for special
216+
YAML characters (e.g., ``*`` or ``[``); prefer single-quotes.
217+
218+
Raises:
219+
Several confuse.ConfigError's that tell the user about the expected
220+
format when the config is invalid.
221+
"""
222+
if not self.config["ignorelist"].get():
223+
return {}
224+
225+
raw_ignorelist = self.config["ignorelist"].get(
226+
confuse.MappingValues(confuse.Sequence(str))
227+
)
228+
229+
compiled_ignorelist: GenreIgnorePatterns = defaultdict(list)
230+
for artist, patterns in raw_ignorelist.items():
231+
artist_patterns = []
232+
for pattern in patterns:
233+
try:
234+
artist_patterns.append(re.compile(pattern, re.IGNORECASE))
235+
except re.error:
236+
artist_patterns.append(
237+
re.compile(re.escape(pattern), re.IGNORECASE)
238+
)
239+
self._log.extra_debug(
240+
"ignore for {}: {}",
241+
artist,
242+
[p.pattern for p in artist_patterns],
243+
)
244+
245+
compiled_ignorelist[artist] = artist_patterns
246+
247+
return compiled_ignorelist
248+
190249
@property
191250
def sources(self) -> tuple[str, ...]:
192251
"""A tuple of allowed genre sources. May contain 'track',
@@ -202,7 +261,9 @@ def sources(self) -> tuple[str, ...]:
202261

203262
# Genre list processing.
204263

205-
def _resolve_genres(self, tags: list[str]) -> list[str]:
264+
def _resolve_genres(
265+
self, tags: list[str], artist: str | None = None
266+
) -> list[str]:
206267
"""Canonicalize, sort and filter a list of genres.
207268
208269
- Returns an empty list if the input tags list is empty.
@@ -217,6 +278,9 @@ def _resolve_genres(self, tags: list[str]) -> list[str]:
217278
by the specificity (depth in the canonicalization tree) of the genres.
218279
- Finally applies whitelist filtering to ensure that only valid
219280
genres are kept. (This may result in no genres at all being retained).
281+
- Ignorelist is applied at each stage: ignored input tags skip ancestry
282+
entirely, ignored ancestor tags are dropped, and ignored tags are
283+
removed in the final filter.
220284
- Returns the filtered list of genres, limited to the configured count.
221285
"""
222286
if not tags:
@@ -229,14 +293,29 @@ def _resolve_genres(self, tags: list[str]) -> list[str]:
229293
# Extend the list to consider tags parents in the c14n tree
230294
tags_all = []
231295
for tag in tags:
232-
# Add parents that are in the whitelist, or add the oldest
233-
# ancestor if no whitelist
296+
# Skip ignored tags entirely — don't walk their ancestry.
297+
if is_ignored(self._log, self.ignore_patterns, tag, artist):
298+
continue
299+
300+
# Add parents that pass whitelist (and are not ignored, which
301+
# is checked in _filter_valid). With whitelist, we may include
302+
# multiple parents
234303
if self.whitelist:
235304
parents = self._filter_valid(
236-
find_parents(tag, self.c14n_branches)
305+
find_parents(tag, self.c14n_branches),
306+
artist=artist,
237307
)
238308
else:
239-
parents = [find_parents(tag, self.c14n_branches)[-1]]
309+
# No whitelist: take only the oldest ancestor, skipping it
310+
# if it is in the ignorelist
311+
oldest = find_parents(tag, self.c14n_branches)[-1]
312+
parents = (
313+
[]
314+
if is_ignored(
315+
self._log, self.ignore_patterns, oldest, artist
316+
)
317+
else [oldest]
318+
)
240319

241320
tags_all += parents
242321
# Stop if we have enough tags already, unless we need to find
@@ -254,24 +333,34 @@ def _resolve_genres(self, tags: list[str]) -> list[str]:
254333
if self.config["prefer_specific"]:
255334
tags = sort_by_depth(tags, self.c14n_branches)
256335

257-
# c14n only adds allowed genres but we may have had forbidden genres in
258-
# the original tags list
259-
valid_tags = self._filter_valid(tags)
336+
# Final filter: applies when c14n is disabled, or when c14n ran without
337+
# whitelist filtering in the loop (no-whitelist path).
338+
valid_tags = self._filter_valid(tags, artist=artist)
260339
return valid_tags[:count]
261340

262-
def _filter_valid(self, genres: Iterable[str]) -> list[str]:
263-
"""Filter genres based on whitelist.
341+
def _filter_valid(
342+
self, genres: Iterable[str], artist: str | None = None
343+
) -> list[str]:
344+
"""Filter genres through whitelist and ignorelist.
264345
265-
Returns all genres if no whitelist is configured, otherwise returns
266-
only genres that are in the whitelist.
346+
Drops empty/whitespace-only strings, then applies whitelist and
347+
ignorelist checks. Returns all genres if neither is configured.
348+
Whitelist is checked first for performance reasons (ignorelist regex
349+
matching is more expensive and for some call sites ignored genres were
350+
already filtered).
267351
"""
268-
# First, drop any falsy or whitespace-only genre strings to avoid
269-
# retaining empty tags from multi-valued fields.
270352
cleaned = [g for g in genres if g and g.strip()]
271-
if not self.whitelist:
353+
if not self.whitelist and not self.ignore_patterns:
272354
return cleaned
273355

274-
return [g for g in cleaned if g.lower() in self.whitelist]
356+
whitelisted = [
357+
g
358+
for g in cleaned
359+
if not self.whitelist or g.lower() in self.whitelist
360+
]
361+
return drop_ignored_genres(
362+
self._log, self.ignore_patterns, whitelisted, artist
363+
)
275364

276365
# Genre resolution pipeline.
277366

@@ -282,6 +371,14 @@ def _format_genres(self, tags: list[str]) -> list[str]:
282371
else:
283372
return tags
284373

374+
def _artist_for_filter(self, obj: LibModel) -> str | None:
    """Return the representative artist for genre resolution and filtering.

    For a ``library.Item`` this is the item's own ``artist``. For any other
    object the ``albumartist`` is used; only when that is empty does the
    lookup fall back to ``artist``.

    Returns:
        The chosen artist value, or ``None`` when ``albumartist`` is empty
        and the object exposes no ``artist`` attribute.
    """
    if isinstance(obj, library.Item):
        return obj.artist

    # NOTE(review): album models presumably do not define ``artist``; use
    # getattr so an empty albumartist yields None instead of raising
    # AttributeError. Confirm against the beets Album field list.
    return obj.albumartist or getattr(obj, "artist", None)
381+
285382
def _get_existing_genres(self, obj: LibModel) -> list[str]:
286383
"""Return a list of genres for this Item or Album."""
287384
if isinstance(obj, library.Item):
@@ -292,13 +389,13 @@ def _get_existing_genres(self, obj: LibModel) -> list[str]:
292389
return genres_list
293390

294391
def _combine_resolve_and_log(
295-
self, old: list[str], new: list[str]
392+
self, old: list[str], new: list[str], artist: str | None = None
296393
) -> list[str]:
297394
"""Combine old and new genres and process via _resolve_genres."""
298395
self._log.debug("raw last.fm tags: {}", new)
299396
self._log.debug("existing genres taken into account: {}", old)
300397
combined = old + new
301-
return self._resolve_genres(combined)
398+
return self._resolve_genres(combined, artist=artist)
302399

303400
def _get_genre(self, obj: LibModel) -> tuple[list[str], str]:
304401
"""Get the final genre list for an Album or Item object.
@@ -321,11 +418,14 @@ def _get_genre(self, obj: LibModel) -> tuple[list[str], str]:
321418
"""
322419

323420
def _try_resolve_stage(
324-
stage_label: str, keep_genres: list[str], new_genres: list[str]
421+
stage_label: str,
422+
keep_genres: list[str],
423+
new_genres: list[str],
424+
artist: str | None = None,
325425
) -> tuple[list[str], str] | None:
326426
"""Try to resolve genres for a given stage and log the result."""
327427
resolved_genres = self._combine_resolve_and_log(
328-
keep_genres, new_genres
428+
keep_genres, new_genres, artist=artist
329429
)
330430
if resolved_genres:
331431
suffix = "whitelist" if self.whitelist else "any"
@@ -345,7 +445,12 @@ def _try_resolve_stage(
345445
# If none are found, we use the fallback (if set).
346446
if self.config["cleanup_existing"]:
347447
keep_genres = [g.lower() for g in genres]
348-
if result := _try_resolve_stage("cleanup", keep_genres, []):
448+
if result := _try_resolve_stage(
449+
"cleanup",
450+
keep_genres,
451+
[],
452+
artist=self._artist_for_filter(obj),
453+
):
349454
return result
350455

351456
# Return fallback string (None if not set).
@@ -368,7 +473,7 @@ def _try_resolve_stage(
368473
obj.artist, obj.title
369474
):
370475
if result := _try_resolve_stage(
371-
"track", keep_genres, new_genres
476+
"track", keep_genres, new_genres, artist=obj.artist
372477
):
373478
return result
374479

@@ -377,18 +482,21 @@ def _try_resolve_stage(
377482
obj.albumartist, obj.album
378483
):
379484
if result := _try_resolve_stage(
380-
"album", keep_genres, new_genres
485+
"album", keep_genres, new_genres, artist=obj.albumartist
381486
):
382487
return result
383488

384489
if "artist" in self.sources:
385490
new_genres = []
491+
stage_artist: str | None = None
386492
if isinstance(obj, library.Item):
387493
new_genres = self.client.fetch_artist_genre(obj.artist)
388494
stage_label = "artist"
495+
stage_artist = obj.artist
389496
elif obj.albumartist != config["va_name"].as_str():
390497
new_genres = self.client.fetch_artist_genre(obj.albumartist)
391498
stage_label = "album artist"
499+
stage_artist = obj.albumartist
392500
if not new_genres:
393501
self._log.extra_debug(
394502
'No album artist genre found for "{}", '
@@ -405,6 +513,9 @@ def _try_resolve_stage(
405513
)
406514
if new_genres:
407515
stage_label = "multi-valued album artist"
516+
stage_artist = (
517+
None # Already filtered per-artist in client
518+
)
408519
else:
409520
# For "Various Artists", pick the most popular track genre.
410521
item_genres = []
@@ -431,18 +542,19 @@ def _try_resolve_stage(
431542

432543
if new_genres:
433544
if result := _try_resolve_stage(
434-
stage_label, keep_genres, new_genres
545+
stage_label, keep_genres, new_genres, artist=stage_artist
435546
):
436547
return result
437548

438549
# Nothing found, leave original if configured and valid.
439-
if genres and self.config["keep_existing"]:
440-
if valid_genres := self._filter_valid(genres):
550+
if genres and self.config["keep_existing"].get():
551+
artist = self._artist_for_filter(obj)
552+
if valid_genres := self._filter_valid(genres, artist=artist):
441553
return valid_genres, "original fallback"
442554
# If the original genre doesn't match a whitelisted genre, check
443555
# if we can canonicalize it to find a matching, whitelisted genre!
444556
if result := _try_resolve_stage(
445-
"original fallback", keep_genres, []
557+
"original fallback", keep_genres, [], artist=artist
446558
):
447559
return result
448560

0 commit comments

Comments
 (0)