diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index e6ab217c5b..10105d8c7e 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -16,10 +16,10 @@ from __future__ import annotations -import difflib import errno import itertools import json +import math import os.path import re import struct @@ -30,7 +30,7 @@ from functools import cached_property, partial, total_ordering from http import HTTPStatus from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator -from urllib.parse import quote, urlencode +from urllib.parse import quote, urlencode, urlparse import requests from typing_extensions import TypedDict @@ -38,6 +38,7 @@ import beets from beets import plugins, ui +from beets.autotag.hooks import string_dist if TYPE_CHECKING: from beets.importer import ImportTask @@ -485,15 +486,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None: return lyrics -class Genius(Backend): +class SearchBackend(Backend): + REQUIRES_BS = True + + @cached_property + def dist_thresh(self) -> float: + return self.config["dist_thresh"].get(float) + + def check_match( + self, target_artist: str, target_title: str, artist: str, title: str + ) -> bool: + """Check if the given artist and title are 'good enough' match.""" + max_dist = max( + string_dist(target_artist, artist), + string_dist(target_title, title), + ) + + if (max_dist := round(max_dist, 2)) <= self.dist_thresh: + return True + + if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4): + # log out the candidate that did not make it but was close. + # This may show a matching candidate with some noise in the name + self._log.debug( + "({}, {}) does not match ({}, {}) but dist was close: {:.2f}", + artist, + title, + target_artist, + target_title, + max_dist, + ) + + return False + + +class Genius(SearchBackend): """Fetch lyrics from Genius via genius-api. Simply adapted from bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/ """ - REQUIRES_BS = True - base_url = "https://api.genius.com" def __init__(self, config, log): @@ -516,19 +549,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None: self._log.debug("Genius API request returned invalid JSON") return None - # find a matching artist in the json + check = partial(self.check_match, artist, title) for hit in json["response"]["hits"]: - hit_artist = hit["result"]["primary_artist"]["name"] - - if slug(hit_artist) == slug(artist): - html = self.fetch_url(hit["result"]["url"]) + result = hit["result"] + if check(result["primary_artist"]["name"], result["title"]): + html = self.fetch_url(result["url"]) if not html: return None return self._scrape_lyrics_from_html(html) - self._log.debug( - "Genius failed to find a matching artist for '{0}'", artist - ) return None def _search(self, artist, title): @@ -724,10 +753,9 @@ def is_text_notcode(text): return None -class Google(Backend): +class Google(SearchBackend): """Fetch lyrics from Google search results.""" - REQUIRES_BS = True SEARCH_URL = "https://www.googleapis.com/customsearch/v1" def is_lyrics(self, text, artist=None): @@ -775,21 +803,20 @@ def slugify(self, text): BY_TRANS = ["by", "par", "de", "von"] LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] - def is_page_candidate(self, url_link, url_title, title, artist): + def is_page_candidate( + self, artist: str, title: str, url_link: str, url_title: str + ) -> bool: """Return True if the URL title makes it a good candidate to be a page that contains lyrics of title by artist. """ - title = self.slugify(title.lower()) - artist = self.slugify(artist.lower()) - sitename = re.search( - "//([^/]+)/.*", self.slugify(url_link.lower()) - ).group(1) - url_title = self.slugify(url_title.lower()) - - # Check if URL title contains song title (exact match) - if url_title.find(title) != -1: + title_slug = self.slugify(title.lower()) + url_title_slug = self.slugify(url_title.lower()) + if title_slug in url_title_slug: return True + artist = self.slugify(artist.lower()) + sitename = urlparse(url_link).netloc + # or try extracting song title from URL title and check if # they are close enough tokens = ( @@ -798,12 +825,9 @@ def is_page_candidate(self, url_link, url_title, title, artist): + self.LYRICS_TRANS ) tokens = [re.escape(t) for t in tokens] - song_title = re.sub("(%s)" % "|".join(tokens), "", url_title) + song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug) - song_title = song_title.strip("_|") - typo_ratio = 0.9 - ratio = difflib.SequenceMatcher(None, song_title, title).ratio() - return ratio >= typo_ratio + return self.check_match(artist, title_slug, artist, song_title) def fetch(self, artist: str, title: str, *_) -> str | None: params = { @@ -825,24 +849,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None: self._log.debug("google backend error: {0}", reason) return None - if "items" in data.keys(): - for item in data["items"]: - url_link = item["link"] - url_title = item.get("title", "") - if not self.is_page_candidate( - url_link, url_title, title, artist - ): - continue - html = self.fetch_url(url_link) - if not html: - continue - lyrics = scrape_lyrics_from_html(html) - if not lyrics: - continue - - if self.is_lyrics(lyrics, artist): - self._log.debug("got lyrics from {0}", item["displayLink"]) - return lyrics + check_candidate = partial(self.is_page_candidate, artist, title) + for item in data.get("items", []): + url_link = item["link"] + if not check_candidate(url_link, item.get("title", "")): + continue + html = self.fetch_url(url_link) + if not html: + continue + lyrics = scrape_lyrics_from_html(html) + if not lyrics: + continue + + if self.is_lyrics(lyrics, artist): + self._log.debug("got lyrics from {0}", item["displayLink"]) + return lyrics return None @@ -866,6 +887,7 @@ def __init__(self): "bing_client_secret": None, "bing_lang_from": [], "bing_lang_to": None, + "dist_thresh": 0.11, "google_API_key": None, "google_engine_ID": "009217259823014548361:lndtuqkycfu", "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W" @@ -877,7 +899,6 @@ def __init__(self): # Musixmatch is disabled by default as they are currently blocking # requests with the beets user agent. "sources": [s for s in self.SOURCES if s != "musixmatch"], - "dist_thresh": 0.1, } ) self.config["bing_client_secret"].redact = True diff --git a/docs/changelog.rst b/docs/changelog.rst index 48b91c44c8..737631971f 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -11,6 +11,10 @@ New features: * :doc:`/plugins/substitute`: Allow the replacement string to use capture groups from the match. It is thus possible to create more general rules, applying to many different artists at once. +* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to + control the maximum allowed distance between the lyrics search result and the + tagged item's artist and title. This is useful for preventing false positives + when fetching lyrics. Bug fixes: @@ -28,6 +32,9 @@ Bug fixes: ``lrclib`` over other sources since it returns reliable results quicker than others. :bug:`5102` +* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able + to match lyrics when there is a slight variation in the artist name. + :bug:`4791` For packagers: diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index d1f434d70f..d080b1f940 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -42,6 +42,12 @@ configuration file. The available options are: Default: ``[]`` - **bing_lang_to**: Language to translate lyrics into. Default: None. +- **dist_thresh**: The maximum distance between the artist and title + combination of the music file and lyrics candidate to consider them a match. + Lower values will make the plugin more strict, higher values will make it + more lenient. This does not apply to the ``lrclib`` backend as it matches + durations. + Default: ``0.11``. - **fallback**: By default, the file will be left unchanged when no lyrics are found. Use the empty string ``''`` to reset the lyrics in such a case. Default: None. diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 0dee427ec3..0d625f8b6d 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -161,6 +161,42 @@ def test_slug(self, text, expected): assert lyrics.slug(text) == expected +class TestSearchBackend: + @pytest.fixture + def backend(self, dist_thresh): + plugin = lyrics.LyricsPlugin() + plugin.config.set({"dist_thresh": dist_thresh}) + return lyrics.SearchBackend(plugin.config, plugin._log) + + @pytest.mark.parametrize( + "dist_thresh, target_artist, artist, should_match", + [ + (0.11, "Target Artist", "Target Artist", True), + (0.11, "Target Artist", "Target Artis", True), + (0.11, "Target Artist", "Target Arti", False), + (0.11, "Psychonaut", "Psychonaut (BEL)", True), + (0.11, "beets song", "beats song", True), + (0.10, "beets song", "beats song", False), + ( + 0.11, + "Lucid Dreams (Forget Me)", + "Lucid Dreams (Remix) ft. Lil Uzi Vert", + False, + ), + ( + 0.12, + "Lucid Dreams (Forget Me)", + "Lucid Dreams (Remix) ft. Lil Uzi Vert", + True, + ), + ], + ) + def test_check_match(self, backend, target_artist, artist, should_match): + assert ( + backend.check_match(target_artist, "", artist, "") == should_match + ) + + @pytest.fixture(scope="module") def lyrics_root_dir(pytestconfig: pytest.Config): return pytestconfig.rootpath / "test" / "rsrc" / "lyrics" @@ -275,10 +311,10 @@ def test_is_page_candidate( self, backend, lyrics_html, url_title, artist, should_be_candidate ): result = backend.is_page_candidate( + artist, + self.TITLE, "http://www.example.com/lyrics/beetssong", url_title, - self.TITLE, - artist, ) assert bool(result) == should_be_candidate