Skip to content

Commit e7565b1

Browse files
authored
Optimize sparse vector stats collection (#128740)
This change improves the performance of sparse vector statistics gathering by reading each sparse vector field's document count directly from its own terms (`Terms.getDocCount()`), rather than seeking each field name in the field-names field (`FieldNamesFieldMapper.NAME`) and summing the per-term document frequencies (`TermsEnum.docFreq()`). By avoiding per-term disk/network reads and instead leveraging statistics already loaded into leaf readers at index opening, we expect to significantly reduce overhead. Relates to #128583.
1 parent 041c42a commit e7565b1

File tree

2 files changed

+14
-10
lines changed

2 files changed

+14
-10
lines changed

docs/changelog/128740.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 128740
2+
summary: Optimize sparse vector stats collection
3+
area: Stats
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/index/engine/Engine.java

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import org.apache.lucene.index.SegmentInfos;
2727
import org.apache.lucene.index.SegmentReader;
2828
import org.apache.lucene.index.Terms;
29-
import org.apache.lucene.index.TermsEnum;
3029
import org.apache.lucene.search.IndexSearcher;
3130
import org.apache.lucene.search.QueryCache;
3231
import org.apache.lucene.search.QueryCachingPolicy;
@@ -65,7 +64,6 @@
6564
import org.elasticsearch.index.codec.FieldInfosWithUsages;
6665
import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils;
6766
import org.elasticsearch.index.mapper.DocumentParser;
68-
import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
6967
import org.elasticsearch.index.mapper.LuceneDocument;
7068
import org.elasticsearch.index.mapper.Mapper;
7169
import org.elasticsearch.index.mapper.Mapping;
@@ -385,14 +383,15 @@ protected final SparseVectorStats sparseVectorStats(IndexReader indexReader, Lis
385383

386384
private long getSparseVectorValueCount(final LeafReader atomicReader, List<BytesRef> fields) throws IOException {
387385
long count = 0;
388-
Terms terms = atomicReader.terms(FieldNamesFieldMapper.NAME);
389-
if (terms == null) {
390-
return count;
391-
}
392-
TermsEnum termsEnum = terms.iterator();
393-
for (var fieldName : fields) {
394-
if (termsEnum.seekExact(fieldName)) {
395-
count += termsEnum.docFreq();
386+
for (var fieldNameBR : fields) {
387+
var fieldName = fieldNameBR.utf8ToString();
388+
var fi = atomicReader.getFieldInfos().fieldInfo(fieldName);
389+
if (fi == null) {
390+
continue;
391+
}
392+
Terms terms = atomicReader.terms(fieldName);
393+
if (terms != null) {
394+
count += terms.getDocCount();
396395
}
397396
}
398397
return count;

0 commit comments

Comments
 (0)