Skip to content

Commit b4b1783

Browse files
stuhood authored and mdashti committed
perf: Implement a TermSet variant which uses fast fields (#69)
The `TermSet` `Query` currently produces one `Scorer`/`DocSet` per matched term by scanning the term dictionary and then consuming posting lists. For very large sets of terms and a fast field, it is faster to scan the fast field column while intersecting with a `HashSet` of (encoded) term values. Following the pattern set by the two execution modes of `RangeQuery`, this PR introduces a variant of `TermSet` which uses fast fields, and then uses it when there are more than 1024 input terms (an arbitrary threshold!). Performance is significantly improved for large `TermSet`s of primitives.
1 parent 11d6671 commit b4b1783

File tree

4 files changed, with 512 additions and 17 deletions

src/query/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,8 @@ mod range_query;
2222
mod regex_query;
2323
mod reqopt_scorer;
2424
mod scorer;
25-
mod set_query;
2625
mod term_query;
26+
mod term_set_query;
2727
mod union;
2828
mod weight;
2929

@@ -61,8 +61,8 @@ pub use self::regex_query::RegexQuery;
6161
pub use self::reqopt_scorer::RequiredOptionalScorer;
6262
pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner};
6363
pub use self::scorer::Scorer;
64-
pub use self::set_query::TermSetQuery;
6564
pub use self::term_query::TermQuery;
65+
pub use self::term_set_query::*;
6666
pub use self::union::{BufferedUnionScorer, SimpleUnion};
6767
#[cfg(test)]
6868
pub use self::vec_docset::VecDocSet;

src/query/term_set_query/mod.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
mod term_set_query;
2+
mod term_set_query_fastfield;
3+
4+
pub use self::term_set_query::{InvertedIndexTermSetQuery, TermSetQuery};
5+
pub use self::term_set_query_fastfield::FastFieldTermSetQuery;

src/query/set_query.rs renamed to src/query/term_set_query/term_set_query.rs

Lines changed: 90 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,10 @@ use std::collections::HashMap;
33
use tantivy_fst::raw::CompiledAddr;
44
use tantivy_fst::{Automaton, Map};
55

6+
use super::term_set_query_fastfield::FastFieldTermSetWeight;
67
use crate::query::score_combiner::DoNothingCombiner;
78
use crate::query::{AutomatonWeight, BooleanWeight, EnableScoring, Occur, Query, Weight};
8-
use crate::schema::{Field, Schema};
9+
use crate::schema::{Field, Schema, Type};
910
use crate::{SegmentReader, Term};
1011

1112
/// A Term Set Query matches all of the documents containing any of the Term provided
@@ -44,20 +45,41 @@ impl TermSetQuery {
4445
return Err(crate::TantivyError::SchemaError(error_msg));
4546
}
4647

47-
// In practice this won't fail because:
48-
// - we are writing to memory, so no IoError
49-
// - Terms are ordered
50-
let map = Map::from_iter(
51-
sorted_terms
52-
.iter()
53-
.map(|key| (key.serialized_value_bytes(), 0)),
54-
)
55-
.map_err(std::io::Error::other)?;
56-
57-
sub_queries.push((
58-
Occur::Should,
59-
Box::new(AutomatonWeight::new(field, SetDfaWrapper(map))),
60-
));
48+
let supported_for_ff = sorted_terms
49+
.get(0)
50+
.map(|term| match term.typ() {
51+
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date | Type::IpAddr => {
52+
true
53+
}
54+
Type::Json | Type::Str => {
55+
// Explicitly not supported yet: see `term_set_query_fastfield.rs`.
56+
false
57+
}
58+
_ => false,
59+
})
60+
.unwrap_or(false);
61+
62+
if field_type.is_fast() && supported_for_ff {
63+
sub_queries.push((
64+
Occur::Should,
65+
Box::new(FastFieldTermSetWeight::new(field, sorted_terms.to_vec())),
66+
));
67+
} else {
68+
// In practice this won't fail because:
69+
// - we are writing to memory, so no IoError
70+
// - Terms are ordered
71+
let map = Map::from_iter(
72+
sorted_terms
73+
.iter()
74+
.map(|key| (key.serialized_value_bytes(), 0)),
75+
)
76+
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
77+
78+
sub_queries.push((
79+
Occur::Should,
80+
Box::new(AutomatonWeight::new(field, SetDfaWrapper(map))),
81+
));
82+
}
6183
}
6284

6385
Ok(BooleanWeight::new(
@@ -87,6 +109,59 @@ impl Query for TermSetQuery {
87109
}
88110
}
89111

112+
/// `InvertedIndexTermSetQuery` is the same as [TermSetQuery] but only uses the inverted index.
113+
#[derive(Debug, Clone)]
114+
pub struct InvertedIndexTermSetQuery {
115+
terms_map: HashMap<Field, Vec<Term>>,
116+
}
117+
118+
impl InvertedIndexTermSetQuery {
119+
/// Create a new `InvertedIndexTermSetQuery`.
120+
pub fn new<T: IntoIterator<Item = Term>>(terms: T) -> Self {
121+
let mut terms_map: HashMap<_, Vec<_>> = HashMap::new();
122+
for term in terms {
123+
terms_map.entry(term.field()).or_default().push(term);
124+
}
125+
126+
for terms in terms_map.values_mut() {
127+
terms.sort_unstable();
128+
terms.dedup();
129+
}
130+
131+
InvertedIndexTermSetQuery { terms_map }
132+
}
133+
}
134+
135+
impl Query for InvertedIndexTermSetQuery {
136+
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
137+
let mut sub_queries: Vec<(_, Box<dyn Weight>)> = Vec::with_capacity(self.terms_map.len());
138+
for (&field, sorted_terms) in &self.terms_map {
139+
let schema = enable_scoring.schema();
140+
let field_entry = schema.get_field_entry(field);
141+
if !field_entry.field_type().is_indexed() {
142+
let error_msg = format!("Field {:?} is not indexed.", field_entry.name());
143+
return Err(crate::TantivyError::SchemaError(error_msg));
144+
}
145+
let map = Map::from_iter(
146+
sorted_terms
147+
.iter()
148+
.map(|key| (key.serialized_value_bytes(), 0)),
149+
)
150+
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
151+
152+
sub_queries.push((
153+
Occur::Should,
154+
Box::new(AutomatonWeight::new(field, SetDfaWrapper(map))),
155+
));
156+
}
157+
Ok(Box::new(BooleanWeight::new(
158+
sub_queries,
159+
false,
160+
Box::new(DoNothingCombiner::default),
161+
)))
162+
}
163+
}
164+
90165
struct SetDfaWrapper(Map<Vec<u8>>);
91166

92167
impl Automaton for SetDfaWrapper {

0 commit comments

Comments
 (0)