Skip to content

Commit 839aa2b

Browse files
authored
Optimize sparse vector stats collection (#128740) (#128771)
This change improves the performance of sparse vector statistics gathering by reading each sparse vector field's document count directly from that field's terms, rather than looking each field name up in the field-names field (`FieldNamesFieldMapper.NAME`) to compute the stats. By avoiding per-term disk/network reads and instead leveraging statistics already loaded into leaf readers when the index is opened, we expect to significantly reduce overhead. Relates to #128583
1 parent c430b02 commit 839aa2b

File tree

2 files changed

+14
-10
lines changed

2 files changed

+14
-10
lines changed

docs/changelog/128740.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 128740
2+
summary: Optimize sparse vector stats collection
3+
area: Stats
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/index/engine/Engine.java

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
import org.apache.lucene.index.SegmentInfos;
2626
import org.apache.lucene.index.SegmentReader;
2727
import org.apache.lucene.index.Terms;
28-
import org.apache.lucene.index.TermsEnum;
2928
import org.apache.lucene.search.IndexSearcher;
3029
import org.apache.lucene.search.QueryCache;
3130
import org.apache.lucene.search.QueryCachingPolicy;
@@ -61,7 +60,6 @@
6160
import org.elasticsearch.index.IndexVersion;
6261
import org.elasticsearch.index.VersionType;
6362
import org.elasticsearch.index.mapper.DocumentParser;
64-
import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
6563
import org.elasticsearch.index.mapper.LuceneDocument;
6664
import org.elasticsearch.index.mapper.Mapper;
6765
import org.elasticsearch.index.mapper.Mapping;
@@ -337,14 +335,15 @@ protected final SparseVectorStats sparseVectorStats(IndexReader indexReader, Lis
337335

338336
private long getSparseVectorValueCount(final LeafReader atomicReader, List<BytesRef> fields) throws IOException {
339337
long count = 0;
340-
Terms terms = atomicReader.terms(FieldNamesFieldMapper.NAME);
341-
if (terms == null) {
342-
return count;
343-
}
344-
TermsEnum termsEnum = terms.iterator();
345-
for (var fieldName : fields) {
346-
if (termsEnum.seekExact(fieldName)) {
347-
count += termsEnum.docFreq();
338+
for (var fieldNameBR : fields) {
339+
var fieldName = fieldNameBR.utf8ToString();
340+
var fi = atomicReader.getFieldInfos().fieldInfo(fieldName);
341+
if (fi == null) {
342+
continue;
343+
}
344+
Terms terms = atomicReader.terms(fieldName);
345+
if (terms != null) {
346+
count += terms.getDocCount();
348347
}
349348
}
350349
return count;

0 commit comments

Comments
 (0)