Skip to content

Commit bd1659a

Browse files
Icelk and aome510 authored
Improve lyrics lookup by removing remix & remaster info in query. (#266)
Co-authored-by: Thang Pham <[email protected]>
1 parent d051d15 commit bd1659a

File tree

2 files changed

+63
-0
lines changed

2 files changed

+63
-0
lines changed

lyric_finder/rustfmt.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
edition="2021"

lyric_finder/src/lib.rs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ impl Client {
5858

5959
/// Search songs satisfying a given `query`.
6060
pub async fn search_songs(&self, query: &str) -> anyhow::Result<Vec<search::Result>> {
61+
let query = improve_query(query);
62+
6163
log::debug!("search songs: query={query}");
6264

6365
let body = self
@@ -136,6 +138,66 @@ impl Default for Client {
136138
}
137139
}
138140

141+
/// Returns `query` without `remaster` & `remix` information from track/artist query.
///
/// The returned value is lowercase.
///
/// Such metadata caused wildly invalid lyrics to be found
/// (try yourself adding "remastered 2011" to a song's name when searching in Genius!).
///
/// Two clean-up passes are applied, in order:
///
/// 1. `{song} - xxxx Remastered {artists}` becomes `{song} {artists}`: the first
///    `remaster*` word, the single character in front of it, an optional
///    all-digit year before that, and any spaces/dashes leading up to it are removed.
/// 2. `{song} - xxxx yyy remix {artists}` becomes `{song} {artists}`: everything from
///    the last dash *preceding* the first `remix*` word (plus the spaces/dashes in
///    front of that dash) up to the end of the `remix*` word is removed. Nothing is
///    removed when there is no such dash or when it sits too close to the start of
///    the query.
///
/// This function never panics: every offset handed to `String::drain` lies on a
/// `char` boundary and every drained range satisfies `start <= end`.
fn improve_query(query: &str) -> String {
    // Minimum byte offset of the dash that introduces the remix metadata. A dash
    // closer to the start than this would leave (almost) no song name behind, so
    // the query is left untouched in that case.
    const SONG_MIN_LENGTH_WO_REMIX_METADATA: usize = 3;

    let is_dash = |c: char| c == '-';

    // Reverse finder for non-filler (not space, not dash) chars before `idx`.
    // Acts like a right-trim of `s[..idx]`: returns the byte offset just past the
    // last non-filler char, or `idx` itself when there is none (or when `idx` is
    // not a valid slice end for `s`).
    let rfind_non_filler = |s: &str, idx: usize| {
        let Some(head) = s.get(..idx) else { return idx };
        head.char_indices()
            .rfind(|(_, c)| !(is_dash(*c) || c.is_whitespace()))
            .map_or(idx, |(pos, c)| pos + c.len_utf8())
    };
    // Used to handle longer variants of words: `remixed`, `remastered`, etc.
    // Returns the byte offset of the first non-alphanumeric char at or after `idx`.
    // When the word runs to the end of `s`, the end of `s` is returned so that the
    // word's suffix (e.g. the `ed` of `remastered`) is not left behind.
    let end_of_word = |s: &str, idx: usize| {
        let Some(rest) = s.get(idx..) else { return idx };
        rest.find(|c: char| !c.is_alphanumeric())
            .map_or(s.len(), |found| found + idx)
    };

    let mut query = query.to_lowercase();

    // Remove "xxxx Remaster" from the query.
    // For example, `{song} xxxx Remastered {artists}` becomes `{song} {artists}`.
    if let Some(remaster_start) = query.find("remaster") {
        let end = end_of_word(&query, remaster_start + "remaster".len());

        // Start of the char directly in front of "remaster" (usually a space).
        // Stepping back by whole chars instead of by one byte keeps `start` on a
        // char boundary even if that char is multi-byte.
        // `remaster_start` comes from `find`, so slicing up to it cannot panic.
        let mut start = query[..remaster_start]
            .char_indices()
            .next_back()
            .map_or(remaster_start, |(pos, _)| pos);

        // Look for a year ("2011") in front of the separator. `get` yields `None`
        // for out-of-range or non-boundary offsets, in which case the year
        // detection is simply skipped.
        let prev = query
            .get(..remaster_start.saturating_sub(2))
            .unwrap_or("");
        let end_of_prev_word = prev.rfind(' ').unwrap_or(0);
        if let Some(year) = query.get(end_of_prev_word + 1..start) {
            if year.chars().all(|c| c.is_whitespace() || c.is_numeric()) {
                start = end_of_prev_word;
            }
        }

        // Also swallow the spaces/dashes that led up to the metadata.
        start = rfind_non_filler(&query, start);
        query.drain(start..end);
    }

    // Remove "- xxxx yyy remix" from the query.
    // For example, `{song} - xxxx yyy remix {artists}` becomes `{song} {artists}`.
    if let Some(remix_start) = query.find("remix") {
        let end = end_of_word(&query, remix_start + "remix".len());

        // Only a dash *before* "remix" can introduce the remix metadata. A dash
        // further right (e.g. in an artist name such as `jay-z`) must be ignored,
        // otherwise the drained range would start after its end.
        if let Some(metadata_start) = query[..remix_start].rfind(is_dash) {
            if metadata_start >= SONG_MIN_LENGTH_WO_REMIX_METADATA {
                let start = rfind_non_filler(&query, metadata_start);
                query.drain(start..end);
            }
        }
    }

    query
}
200+
139201
mod parse {
140202
use html5ever::tendril::TendrilSink;
141203
use html5ever::*;

0 commit comments

Comments
 (0)