Skip to content

Commit 3865567

Browse files
authored
expose flag for max store gateway consistency check attempts (#6276)
1 parent 1e5b01f commit 3865567

File tree

6 files changed

+62
-26
lines changed

6 files changed

+62
-26
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* [ENHANCEMENT] Ingester: Add matchers to ingester LabelNames() and LabelNamesStream() RPC. #6209
2222
* [ENHANCEMENT] Ingester/Store Gateway Clients: Introduce an experimental HealthCheck handler to quickly fail requests directed to unhealthy targets. #6225 #6257
2323
* [ENHANCEMENT] Upgrade build image and Go version to 1.23.2. #6261 #6262
24+
* [ENHANCEMENT] Querier/Ruler: Expose `store_gateway_consistency_check_max_attempts` for max retries when querying store gateway in consistency check. #6276
2425
* [BUGFIX] Runtime-config: Handle absolute file paths when working directory is not / #6224
2526

2627
## 1.18.1 2024-10-14

docs/blocks-storage/querier.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,12 @@ querier:
226226
# CLI flag: -querier.store-gateway-query-stats-enabled
227227
[store_gateway_query_stats: <boolean> | default = true]
228228

229+
# The maximum number of times we attempt fetching missing blocks from
230+
# different store-gateways. If no more store-gateways are left (i.e. due to
231+
# lower replication factor) then we'll end the retries earlier
232+
# CLI flag: -querier.store-gateway-consistency-check-max-attempts
233+
[store_gateway_consistency_check_max_attempts: <int> | default = 3]
234+
229235
# When distributor's sharding strategy is shuffle-sharding and this setting is
230236
# > 0, queriers fetch in-memory series from the minimum set of required
231237
# ingesters, selecting only ingesters which may have received series since

docs/configuration/config-file-reference.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3872,6 +3872,12 @@ store_gateway_client:
38723872
# CLI flag: -querier.store-gateway-query-stats-enabled
38733873
[store_gateway_query_stats: <boolean> | default = true]
38743874

3875+
# The maximum number of times we attempt fetching missing blocks from different
3876+
# store-gateways. If no more store-gateways are left (i.e. due to lower
3877+
# replication factor) then we'll end the retries earlier
3878+
# CLI flag: -querier.store-gateway-consistency-check-max-attempts
3879+
[store_gateway_consistency_check_max_attempts: <int> | default = 3]
3880+
38753881
# When distributor's sharding strategy is shuffle-sharding and this setting is >
38763882
# 0, queriers fetch in-memory series from the minimum set of required ingesters,
38773883
# selecting only ingesters which may have received series since 'now - lookback

pkg/querier/blocks_store_queryable.go

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,8 @@ type BlocksStoreQueryable struct {
141141
metrics *blocksStoreQueryableMetrics
142142
limits BlocksStoreLimits
143143

144-
storeGatewayQueryStatsEnabled bool
144+
storeGatewayQueryStatsEnabled bool
145+
storeGatewayConsistencyCheckMaxAttempts int
145146

146147
// Subservices manager.
147148
subservices *services.Manager
@@ -153,8 +154,7 @@ func NewBlocksStoreQueryable(
153154
finder BlocksFinder,
154155
consistency *BlocksConsistencyChecker,
155156
limits BlocksStoreLimits,
156-
queryStoreAfter time.Duration,
157-
storeGatewayQueryStatsEnabled bool,
157+
config Config,
158158
logger log.Logger,
159159
reg prometheus.Registerer,
160160
) (*BlocksStoreQueryable, error) {
@@ -164,16 +164,17 @@ func NewBlocksStoreQueryable(
164164
}
165165

166166
q := &BlocksStoreQueryable{
167-
stores: stores,
168-
finder: finder,
169-
consistency: consistency,
170-
queryStoreAfter: queryStoreAfter,
171-
logger: logger,
172-
subservices: manager,
173-
subservicesWatcher: services.NewFailureWatcher(),
174-
metrics: newBlocksStoreQueryableMetrics(reg),
175-
limits: limits,
176-
storeGatewayQueryStatsEnabled: storeGatewayQueryStatsEnabled,
167+
stores: stores,
168+
finder: finder,
169+
consistency: consistency,
170+
queryStoreAfter: config.QueryStoreAfter,
171+
logger: logger,
172+
subservices: manager,
173+
subservicesWatcher: services.NewFailureWatcher(),
174+
metrics: newBlocksStoreQueryableMetrics(reg),
175+
limits: limits,
176+
storeGatewayQueryStatsEnabled: config.StoreGatewayQueryStatsEnabled,
177+
storeGatewayConsistencyCheckMaxAttempts: config.StoreGatewayConsistencyCheckMaxAttempts,
177178
}
178179

179180
q.Service = services.NewBasicService(q.starting, q.running, q.stopping)
@@ -264,7 +265,7 @@ func NewBlocksStoreQueryableFromConfig(querierCfg Config, gatewayCfg storegatewa
264265
reg,
265266
)
266267

267-
return NewBlocksStoreQueryable(stores, finder, consistency, limits, querierCfg.QueryStoreAfter, querierCfg.StoreGatewayQueryStatsEnabled, logger, reg)
268+
return NewBlocksStoreQueryable(stores, finder, consistency, limits, querierCfg, logger, reg)
268269
}
269270

270271
func (q *BlocksStoreQueryable) starting(ctx context.Context) error {
@@ -299,16 +300,17 @@ func (q *BlocksStoreQueryable) Querier(mint, maxt int64) (storage.Querier, error
299300
}
300301

301302
return &blocksStoreQuerier{
302-
minT: mint,
303-
maxT: maxt,
304-
finder: q.finder,
305-
stores: q.stores,
306-
metrics: q.metrics,
307-
limits: q.limits,
308-
consistency: q.consistency,
309-
logger: q.logger,
310-
queryStoreAfter: q.queryStoreAfter,
311-
storeGatewayQueryStatsEnabled: q.storeGatewayQueryStatsEnabled,
303+
minT: mint,
304+
maxT: maxt,
305+
finder: q.finder,
306+
stores: q.stores,
307+
metrics: q.metrics,
308+
limits: q.limits,
309+
consistency: q.consistency,
310+
logger: q.logger,
311+
queryStoreAfter: q.queryStoreAfter,
312+
storeGatewayQueryStatsEnabled: q.storeGatewayQueryStatsEnabled,
313+
storeGatewayConsistencyCheckMaxAttempts: q.storeGatewayConsistencyCheckMaxAttempts,
312314
}, nil
313315
}
314316

@@ -328,6 +330,9 @@ type blocksStoreQuerier struct {
328330
// If enabled, query stats of store gateway requests will be logged
329331
// using `info` level.
330332
storeGatewayQueryStatsEnabled bool
333+
334+
// The maximum number of times we attempt fetching missing blocks from different Store Gateways.
335+
storeGatewayConsistencyCheckMaxAttempts int
331336
}
332337

333338
// Select implements storage.Querier interface.
@@ -534,7 +539,7 @@ func (q *blocksStoreQuerier) queryWithConsistencyCheck(ctx context.Context, logg
534539
retryableError error
535540
)
536541

537-
for attempt := 1; attempt <= maxFetchSeriesAttempts; attempt++ {
542+
for attempt := 1; attempt <= q.storeGatewayConsistencyCheckMaxAttempts; attempt++ {
538543
// Find the set of store-gateway instances having the blocks. The exclude parameter is the
539544
// map of blocks queried so far, with the list of store-gateway addresses for each block.
540545
clients, err := q.stores.GetClientsFor(userID, remainingBlocks, attemptedBlocks, attemptedBlocksZones)

pkg/querier/blocks_store_queryable_test.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1552,6 +1552,8 @@ func TestBlocksStoreQuerier_Select(t *testing.T) {
15521552
logger: log.NewNopLogger(),
15531553
metrics: newBlocksStoreQueryableMetrics(reg),
15541554
limits: testData.limits,
1555+
1556+
storeGatewayConsistencyCheckMaxAttempts: 3,
15551557
}
15561558

15571559
matchers := []*labels.Matcher{
@@ -2148,6 +2150,8 @@ func TestBlocksStoreQuerier_Labels(t *testing.T) {
21482150
logger: log.NewNopLogger(),
21492151
metrics: newBlocksStoreQueryableMetrics(reg),
21502152
limits: &blocksStoreLimitsMock{},
2153+
2154+
storeGatewayConsistencyCheckMaxAttempts: 3,
21512155
}
21522156

21532157
if testFunc == "LabelNames" {
@@ -2371,7 +2375,12 @@ func TestBlocksStoreQuerier_PromQLExecution(t *testing.T) {
23712375
}
23722376

23732377
// Instance the querier that will be executed to run the query.
2374-
queryable, err := NewBlocksStoreQueryable(stores, finder, NewBlocksConsistencyChecker(0, 0, logger, nil), &blocksStoreLimitsMock{}, 0, false, logger, nil)
2378+
cfg := Config{
2379+
QueryStoreAfter: 0,
2380+
StoreGatewayQueryStatsEnabled: false,
2381+
StoreGatewayConsistencyCheckMaxAttempts: 3,
2382+
}
2383+
queryable, err := NewBlocksStoreQueryable(stores, finder, NewBlocksConsistencyChecker(0, 0, logger, nil), &blocksStoreLimitsMock{}, cfg, logger, nil)
23752384
require.NoError(t, err)
23762385
require.NoError(t, services.StartAndAwaitRunning(context.Background(), queryable))
23772386
defer services.StopAndAwaitTerminated(context.Background(), queryable) // nolint:errcheck

pkg/querier/querier.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ type Config struct {
7979
StoreGatewayClient ClientConfig `yaml:"store_gateway_client"`
8080
StoreGatewayQueryStatsEnabled bool `yaml:"store_gateway_query_stats"`
8181

82+
// The maximum number of times we attempt fetching missing blocks from different Store Gateways.
83+
StoreGatewayConsistencyCheckMaxAttempts int `yaml:"store_gateway_consistency_check_max_attempts"`
84+
8285
ShuffleShardingIngestersLookbackPeriod time.Duration `yaml:"shuffle_sharding_ingesters_lookback_period"`
8386

8487
// Experimental. Use https://github.com/thanos-io/promql-engine rather than
@@ -94,6 +97,7 @@ var (
9497
errShuffleShardingLookbackLessThanQueryStoreAfter = errors.New("the shuffle-sharding lookback period should be greater or equal than the configured 'query store after'")
9598
errEmptyTimeRange = errors.New("empty time range")
9699
errUnsupportedResponseCompression = errors.New("unsupported response compression. Supported compression 'gzip' and '' (disable compression)")
100+
errInvalidConsistencyCheckAttempts = errors.New("store gateway consistency check max attempts should be greater or equal than 1")
97101
)
98102

99103
// RegisterFlags adds the flags required to config this to the given FlagSet.
@@ -122,6 +126,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
122126
f.StringVar(&cfg.ActiveQueryTrackerDir, "querier.active-query-tracker-dir", "./active-query-tracker", "Active query tracker monitors active queries, and writes them to the file in given directory. If Cortex discovers any queries in this log during startup, it will log them to the log file. Setting to empty value disables active query tracker, which also disables -querier.max-concurrent option.")
123127
f.StringVar(&cfg.StoreGatewayAddresses, "querier.store-gateway-addresses", "", "Comma separated list of store-gateway addresses in DNS Service Discovery format. This option should be set when using the blocks storage and the store-gateway sharding is disabled (when enabled, the store-gateway instances form a ring and addresses are picked from the ring).")
124128
f.BoolVar(&cfg.StoreGatewayQueryStatsEnabled, "querier.store-gateway-query-stats-enabled", true, "If enabled, store gateway query stats will be logged using `info` log level.")
129+
f.IntVar(&cfg.StoreGatewayConsistencyCheckMaxAttempts, "querier.store-gateway-consistency-check-max-attempts", maxFetchSeriesAttempts, "The maximum number of times we attempt fetching missing blocks from different store-gateways. If no more store-gateways are left (ie. due to lower replication factor) than we'll end the retries earlier")
125130
f.DurationVar(&cfg.LookbackDelta, "querier.lookback-delta", 5*time.Minute, "Time since the last sample after which a time series is considered stale and ignored by expression evaluations.")
126131
f.DurationVar(&cfg.ShuffleShardingIngestersLookbackPeriod, "querier.shuffle-sharding-ingesters-lookback-period", 0, "When distributor's sharding strategy is shuffle-sharding and this setting is > 0, queriers fetch in-memory series from the minimum set of required ingesters, selecting only ingesters which may have received series since 'now - lookback period'. The lookback period should be greater or equal than the configured 'query store after' and 'query ingesters within'. If this setting is 0, queriers always query all ingesters (ingesters shuffle sharding on read path is disabled).")
127132
f.BoolVar(&cfg.ThanosEngine, "querier.thanos-engine", false, "Experimental. Use Thanos promql engine https://github.com/thanos-io/promql-engine rather than the Prometheus promql engine.")
@@ -148,6 +153,10 @@ func (cfg *Config) Validate() error {
148153
}
149154
}
150155

156+
if cfg.StoreGatewayConsistencyCheckMaxAttempts < 1 {
157+
return errInvalidConsistencyCheckAttempts
158+
}
159+
151160
return nil
152161
}
153162

0 commit comments

Comments
 (0)