1
1
// Copyright (c) Zefchain Labs, Inc.
2
2
// SPDX-License-Identifier: Apache-2.0
3
3
4
- use std:: { fmt, future:: Future , iter} ;
4
+ use std:: { fmt, future:: Future , iter, sync :: Arc } ;
5
5
6
6
use futures:: { future, stream, StreamExt } ;
7
7
use linera_base:: {
@@ -29,6 +29,7 @@ use tracing::{debug, info, instrument, warn, Level};
29
29
30
30
use super :: {
31
31
api:: { self , validator_node_client:: ValidatorNodeClient , SubscriptionRequest } ,
32
+ pool:: GrpcConnectionPool ,
32
33
transport, GRPC_MAX_MESSAGE_SIZE ,
33
34
} ;
34
35
use crate :: {
@@ -39,57 +40,74 @@ use crate::{
39
40
#[ derive( Clone ) ]
40
41
pub struct GrpcClient {
41
42
address : String ,
42
- client : ValidatorNodeClient < transport :: Channel > ,
43
+ pool : Arc < GrpcConnectionPool > ,
43
44
retry_delay : Duration ,
44
45
max_retries : u32 ,
45
46
}
46
47
47
48
impl GrpcClient {
48
49
pub fn new (
49
50
address : String ,
50
- channel : transport :: Channel ,
51
+ pool : Arc < GrpcConnectionPool > ,
51
52
retry_delay : Duration ,
52
53
max_retries : u32 ,
53
- ) -> Self {
54
- let client = ValidatorNodeClient :: new ( channel)
55
- . max_encoding_message_size ( GRPC_MAX_MESSAGE_SIZE )
56
- . max_decoding_message_size ( GRPC_MAX_MESSAGE_SIZE ) ;
57
- Self {
54
+ ) -> Result < Self , super :: GrpcError > {
55
+ // Just verify we can get a channel to this address
56
+ let _ = pool. channel ( address. clone ( ) ) ?;
57
+ Ok ( Self {
58
58
address,
59
- client ,
59
+ pool ,
60
60
retry_delay,
61
61
max_retries,
62
- }
62
+ } )
63
63
}
64
64
65
65
pub fn address ( & self ) -> & str {
66
66
& self . address
67
67
}
68
68
69
+ fn make_client ( & self ) -> Result < ValidatorNodeClient < transport:: Channel > , super :: GrpcError > {
70
+ let channel = self . pool . channel ( self . address . clone ( ) ) ?;
71
+ Ok ( ValidatorNodeClient :: new ( channel)
72
+ . max_encoding_message_size ( GRPC_MAX_MESSAGE_SIZE )
73
+ . max_decoding_message_size ( GRPC_MAX_MESSAGE_SIZE ) )
74
+ }
75
+
69
76
/// Returns whether this gRPC status means the server stream should be reconnected to, or not.
70
77
/// Logs a warning on unexpected status codes.
71
- fn is_retryable ( status : & Status ) -> bool {
78
+ fn is_retryable_needs_reconnect ( status : & Status ) -> ( bool , bool ) {
72
79
match status. code ( ) {
73
80
Code :: DeadlineExceeded | Code :: Aborted | Code :: Unavailable | Code :: Unknown => {
74
81
info ! ( "gRPC request interrupted: {}; retrying" , status) ;
75
- true
82
+ ( true , false )
76
83
}
77
84
Code :: Ok | Code :: Cancelled | Code :: ResourceExhausted => {
78
85
info ! ( "Unexpected gRPC status: {}; retrying" , status) ;
79
- true
86
+ ( true , false )
87
+ }
88
+ Code :: NotFound => ( false , false ) , // This code is used if e.g. the validator is missing blobs.
89
+ Code :: Internal => {
90
+ let error_string = status. to_string ( ) ;
91
+ if error_string. contains ( "GoAway" ) && error_string. contains ( "max_age" ) {
92
+ info ! (
93
+ "gRPC connection hit max_age and got a GoAway: {}; reconnecting then retrying" ,
94
+ status
95
+ ) ;
96
+ return ( true , true ) ;
97
+ }
98
+ info ! ( "Unexpected gRPC status: {}" , status) ;
99
+ ( false , false )
80
100
}
81
- Code :: NotFound => false , // This code is used if e.g. the validator is missing blobs.
82
101
Code :: InvalidArgument
83
102
| Code :: AlreadyExists
84
103
| Code :: PermissionDenied
85
104
| Code :: FailedPrecondition
86
105
| Code :: OutOfRange
87
106
| Code :: Unimplemented
88
- | Code :: Internal
89
107
| Code :: DataLoss
90
108
| Code :: Unauthenticated => {
91
109
info ! ( "Unexpected gRPC status: {}" , status) ;
92
- false
110
+ ( false , false )
93
111
}
94
112
}
95
113
}
@@ -109,15 +127,36 @@ impl GrpcClient {
109
127
let request_inner = request. try_into ( ) . map_err ( |_| NodeError :: GrpcError {
110
128
error : "could not convert request to proto" . to_string ( ) ,
111
129
} ) ?;
130
+
131
+ let mut reconnected = false ;
112
132
loop {
113
- match f ( self . client . clone ( ) , Request :: new ( request_inner. clone ( ) ) ) . await {
114
- Err ( s) if Self :: is_retryable ( & s) && retry_count < self . max_retries => {
115
- let delay = self . retry_delay . saturating_mul ( retry_count) ;
116
- retry_count += 1 ;
117
- linera_base:: time:: timer:: sleep ( delay) . await ;
118
- continue ;
133
+ // Create client on-demand for each attempt
134
+ let client = match self . make_client ( ) {
135
+ Ok ( client) => client,
136
+ Err ( e) => {
137
+ return Err ( NodeError :: GrpcError {
138
+ error : format ! ( "Failed to create client: {}" , e) ,
139
+ } ) ;
119
140
}
141
+ } ;
142
+
143
+ match f ( client, Request :: new ( request_inner. clone ( ) ) ) . await {
120
144
Err ( s) => {
145
+ let ( is_retryable, needs_reconnect) = Self :: is_retryable_needs_reconnect ( & s) ;
146
+ if is_retryable && retry_count < self . max_retries {
147
+ // If this error indicates we need a connection refresh and we haven't already tried, do it
148
+ if needs_reconnect && !reconnected {
149
+ info ! ( "Connection error detected, invalidating channel: {}" , s) ;
150
+ self . pool . invalidate_channel ( & self . address ) ;
151
+ reconnected = true ;
152
+ }
153
+
154
+ let delay = self . retry_delay . saturating_mul ( retry_count) ;
155
+ retry_count += 1 ;
156
+ linera_base:: time:: timer:: sleep ( delay) . await ;
157
+ continue ;
158
+ }
159
+
121
160
return Err ( NodeError :: GrpcError {
122
161
error : format ! ( "remote request [{handler}] failed with status: {s:?}" ) ,
123
162
} ) ;
@@ -270,32 +309,56 @@ impl ValidatorNode for GrpcClient {
270
309
let subscription_request = SubscriptionRequest {
271
310
chain_ids : chains. into_iter ( ) . map ( |chain| chain. into ( ) ) . collect ( ) ,
272
311
} ;
273
- let mut client = self . client . clone ( ) ;
312
+ let pool = self . pool . clone ( ) ;
313
+ let address = self . address . clone ( ) ;
274
314
275
315
// Make the first connection attempt before returning from this method.
276
- let mut stream = Some (
316
+ let mut stream = Some ( {
317
+ let mut client = self
318
+ . make_client ( )
319
+ . map_err ( |e| NodeError :: SubscriptionFailed {
320
+ status : format ! ( "Failed to create client: {}" , e) ,
321
+ } ) ?;
277
322
client
278
323
. subscribe ( subscription_request. clone ( ) )
279
324
. await
280
325
. map_err ( |status| NodeError :: SubscriptionFailed {
281
326
status : status. to_string ( ) ,
282
327
} ) ?
283
- . into_inner ( ) ,
284
- ) ;
328
+ . into_inner ( )
329
+ } ) ;
285
330
286
331
// A stream of `Result<grpc::Notification, tonic::Status>` that keeps calling
287
332
// `client.subscribe(request)` endlessly and without delay.
288
333
let endlessly_retrying_notification_stream = stream:: unfold ( ( ) , move |( ) | {
289
- let mut client = client. clone ( ) ;
334
+ let pool = pool. clone ( ) ;
335
+ let address = address. clone ( ) ;
290
336
let subscription_request = subscription_request. clone ( ) ;
291
337
let mut stream = stream. take ( ) ;
292
338
async move {
293
339
let stream = if let Some ( stream) = stream. take ( ) {
294
340
future:: Either :: Right ( stream)
295
341
} else {
296
- match client. subscribe ( subscription_request. clone ( ) ) . await {
297
- Err ( err) => future:: Either :: Left ( stream:: iter ( iter:: once ( Err ( err) ) ) ) ,
298
- Ok ( response) => future:: Either :: Right ( response. into_inner ( ) ) ,
342
+ // Create a new client for each reconnection attempt
343
+ match pool. channel ( address. clone ( ) ) {
344
+ Ok ( channel) => {
345
+ let mut client = ValidatorNodeClient :: new ( channel)
346
+ . max_encoding_message_size ( GRPC_MAX_MESSAGE_SIZE )
347
+ . max_decoding_message_size ( GRPC_MAX_MESSAGE_SIZE ) ;
348
+ match client. subscribe ( subscription_request. clone ( ) ) . await {
349
+ Err ( err) => {
350
+ future:: Either :: Left ( stream:: iter ( iter:: once ( Err ( err) ) ) )
351
+ }
352
+ Ok ( response) => future:: Either :: Right ( response. into_inner ( ) ) ,
353
+ }
354
+ }
355
+ Err ( e) => {
356
+ let status = tonic:: Status :: unavailable ( format ! (
357
+ "Failed to create channel: {}" ,
358
+ e
359
+ ) ) ;
360
+ future:: Either :: Left ( stream:: iter ( iter:: once ( Err ( status) ) ) )
361
+ }
299
362
}
300
363
} ;
301
364
Some ( ( stream, ( ) ) )
@@ -319,7 +382,9 @@ impl ValidatorNode for GrpcClient {
319
382
return future:: Either :: Left ( future:: ready ( true ) ) ;
320
383
} ;
321
384
322
- if !span. in_scope ( || Self :: is_retryable ( status) ) || retry_count >= max_retries {
385
+ let ( is_retryable, _) =
386
+ span. in_scope ( || Self :: is_retryable_needs_reconnect ( status) ) ;
387
+ if !is_retryable || retry_count >= max_retries {
323
388
return future:: Either :: Left ( future:: ready ( false ) ) ;
324
389
}
325
390
let delay = retry_delay. saturating_mul ( retry_count) ;
0 commit comments