Skip to content

Commit 7fec310

Browse files
committed
Improve lyrics lookup by removing remix & remaster info in query.
1 parent d051d15 commit 7fec310

File tree

1 file changed

+56
-0
lines changed

1 file changed

+56
-0
lines changed

lyric_finder/src/lib.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ impl Client {
5858

5959
/// Search songs satisfying a given `query`.
6060
pub async fn search_songs(&self, query: &str) -> anyhow::Result<Vec<search::Result>> {
61+
let query = improve_query(query);
62+
6163
log::debug!("search songs: query={query}");
6264

6365
let body = self
@@ -136,6 +138,60 @@ impl Default for Client {
136138
}
137139
}
138140

141+
/// Returns a lowercased copy of `query` with `remaster` & `remix` information removed.
///
/// Searching Genius with such suffixes (e.g. `remastered 2011`) tends to return
/// wildly unrelated lyrics, so they are stripped before the lookup.
///
/// Two independent clean-ups are applied, in this order:
///
/// 1. The first word starting with `remaster` (`remaster`, `remastered`, ...) is removed,
///    together with a directly preceding all-digit word (the year) and any spaces/dashes
///    in front of it: `"song - 2011 remaster"` becomes `"song"`.
/// 2. If the text in front of the first `remix` contains a dash located past byte
///    offset 4, everything from that dash (and the filler before it) up to the end of
///    the `remix` word is removed: `"artist song - foo remix"` becomes `"artist song"`.
///
/// All offsets are kept on UTF-8 character boundaries, so the function never panics,
/// whatever the input.
fn improve_query(query: &str) -> String {
    /// Whether `c` is one of the dash variants used as a title separator.
    fn is_dash(c: char) -> bool {
        matches!(c, '-' | '—' | '–')
    }

    /// Acts like a right-trim of `s[..idx]`: returns the byte offset just past the last
    /// character before `idx` that is neither whitespace nor a dash, or `0` when there
    /// is no such character.
    fn r_no_filler(s: &str, idx: usize) -> usize {
        let Some(head) = s.get(..idx) else { return idx };
        head.char_indices()
            .rev()
            .find(|&(_, c)| !(is_dash(c) || c.is_whitespace()))
            .map_or(0, |(i, c)| i + c.len_utf8())
    }

    /// Returns the end of the alphanumeric run starting at `idx`. Used to swallow longer
    /// variants of a word (`remixed`, `remastered`, ...), including when the word is the
    /// last thing in the string.
    fn end_of_word(s: &str, idx: usize) -> usize {
        let Some(rest) = s.get(idx..) else { return idx };
        idx + rest
            .find(|c: char| !c.is_alphanumeric())
            .unwrap_or(rest.len())
    }

    let mut query = query.to_lowercase();

    // Cut `[year] remaster...` out of the query.
    if let Some(remaster_start) = query.find("remaster") {
        // Walk backwards character by character (not byte by byte) so that a multi-byte
        // separator such as an em dash never yields an offset inside a character.
        let mut boundaries = query[..remaster_start]
            .char_indices()
            .rev()
            .map(|(i, _)| i);
        // Start of the character right before "remaster" (normally a separator).
        let sep_start = boundaries.next().unwrap_or(0);
        // The word in front of that separator ends at the latest here.
        let prev_end = boundaries.next().unwrap_or(0);

        let prev_space = query[..prev_end].rfind(' ');
        let word_start = prev_space.map_or(0, |i| i + 1);
        // The preceding word counts as a year when it only holds digits (or whitespace).
        let is_year = query[word_start..sep_start]
            .chars()
            .all(|c| c.is_whitespace() || c.is_numeric());

        let cut_from = if is_year {
            prev_space.unwrap_or(0)
        } else {
            sep_start
        };
        let cut_from = r_no_filler(&query, cut_from);
        let cut_to = end_of_word(&query, remaster_start + "remaster".len());
        query.drain(cut_from..cut_to);
    }

    // Cut `- xxx remix...` out of the query.
    if let Some(remix_start) = query.find("remix") {
        let cut_to = end_of_word(&query, remix_start + "remix".len());
        // Only a dash located *before* "remix" can introduce the remix metadata; a dash
        // after it would produce an inverted range.
        if let Some(metadata_start) = query[..remix_start].rfind(is_dash) {
            // Ignore dashes at the very beginning of the query (offset <= 4).
            if metadata_start > 4 {
                let cut_from = r_no_filler(&query, metadata_start);
                query.drain(cut_from..cut_to);
            }
        }
    }

    query
}
194+
139195
mod parse {
140196
use html5ever::tendril::TendrilSink;
141197
use html5ever::*;

0 commit comments

Comments
 (0)