Skip to content
This repository was archived by the owner on Mar 9, 2022. It is now read-only.

Commit 8845f14

Browse files
authored
Merge pull request #797 from Random-Liu/cherrypick-#794
Generate fatal error when cri plugin fail to start.
2 parents 4b0acec + 5b39aca commit 8845f14

File tree

2 files changed

+38
-22
lines changed

2 files changed

+38
-22
lines changed

pkg/server/events.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -116,15 +116,16 @@ func convertEvent(e *gogotypes.Any) (string, interface{}, error) {
116116
}
117117

118118
// start starts the event monitor which monitors and handles all container events. It returns
119-
// a channel for the caller to wait for the event monitor to stop. start must be called after
120-
// subscribe.
121-
func (em *eventMonitor) start() (<-chan struct{}, error) {
119+
// an error channel for the caller to wait for stop errors from the event monitor.
120+
// start must be called after subscribe.
121+
func (em *eventMonitor) start() <-chan error {
122+
errCh := make(chan error)
122123
if em.ch == nil || em.errCh == nil {
123-
return nil, errors.New("event channel is nil")
124+
panic("event channel is nil")
124125
}
125-
closeCh := make(chan struct{})
126126
backOffCheckCh := em.backOff.start()
127127
go func() {
128+
defer close(errCh)
128129
for {
129130
select {
130131
case e := <-em.ch:
@@ -144,8 +145,11 @@ func (em *eventMonitor) start() (<-chan struct{}, error) {
144145
em.backOff.enBackOff(cID, evt)
145146
}
146147
case err := <-em.errCh:
147-
logrus.WithError(err).Error("Failed to handle event stream")
148-
close(closeCh)
148+
// Close errCh in defer directly if there is no error.
149+
if err != nil {
150+
logrus.WithError(err).Errorf("Failed to handle event stream")
151+
errCh <- err
152+
}
149153
return
150154
case <-backOffCheckCh:
151155
cIDs := em.backOff.getExpiredContainers()
@@ -162,7 +166,7 @@ func (em *eventMonitor) start() (<-chan struct{}, error) {
162166
}
163167
}
164168
}()
165-
return closeCh, nil
169+
return errCh
166170
}
167171

168172
// stop stops the event monitor. It will close the event channel.

pkg/server/service.go

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package server
1919
import (
2020
"fmt"
2121
"io"
22+
"net/http"
2223
"path/filepath"
2324
"time"
2425

@@ -179,10 +180,7 @@ func (c *criService) Run() error {
179180

180181
// Start event handler.
181182
logrus.Info("Start event monitor")
182-
eventMonitorCloseCh, err := c.eventMonitor.start()
183-
if err != nil {
184-
return errors.Wrap(err, "failed to start event monitor")
185-
}
183+
eventMonitorErrCh := c.eventMonitor.start()
186184

187185
// Start snapshot stats syncer, it doesn't need to be stopped.
188186
logrus.Info("Start snapshots syncer")
@@ -195,27 +193,32 @@ func (c *criService) Run() error {
195193

196194
// Start streaming server.
197195
logrus.Info("Start streaming server")
198-
streamServerCloseCh := make(chan struct{})
196+
streamServerErrCh := make(chan error)
199197
go func() {
200-
if err := c.streamServer.Start(true); err != nil {
198+
defer close(streamServerErrCh)
199+
if err := c.streamServer.Start(true); err != nil && err != http.ErrServerClosed {
201200
logrus.WithError(err).Error("Failed to start streaming server")
201+
streamServerErrCh <- err
202202
}
203-
close(streamServerCloseCh)
204203
}()
205204

206205
// Set the server as initialized. GRPC services could start serving traffic.
207206
c.initialized.Set()
208207

208+
var eventMonitorErr, streamServerErr error
209209
// Stop the whole CRI service if any of the critical service exits.
210210
select {
211-
case <-eventMonitorCloseCh:
212-
case <-streamServerCloseCh:
211+
case eventMonitorErr = <-eventMonitorErrCh:
212+
case streamServerErr = <-streamServerErrCh:
213213
}
214214
if err := c.Close(); err != nil {
215215
return errors.Wrap(err, "failed to stop cri service")
216216
}
217-
218-
<-eventMonitorCloseCh
217+
// If the error is set above, err from channel must be nil here, because
218+
// the channel is supposed to be closed. Or else, we wait and set it.
219+
if err := <-eventMonitorErrCh; err != nil {
220+
eventMonitorErr = err
221+
}
219222
logrus.Info("Event monitor stopped")
220223
// There is a race condition with http.Server.Serve.
221224
// When `Close` is called at the same time with `Serve`, `Close`
@@ -227,18 +230,27 @@ func (c *criService) Run() error {
227230
// is fixed.
228231
const streamServerStopTimeout = 2 * time.Second
229232
select {
230-
case <-streamServerCloseCh:
233+
case err := <-streamServerErrCh:
234+
if err != nil {
235+
streamServerErr = err
236+
}
231237
logrus.Info("Stream server stopped")
232238
case <-time.After(streamServerStopTimeout):
233239
logrus.Errorf("Stream server is not stopped in %q", streamServerStopTimeout)
234240
}
241+
if eventMonitorErr != nil {
242+
return errors.Wrap(eventMonitorErr, "event monitor error")
243+
}
244+
if streamServerErr != nil {
245+
return errors.Wrap(streamServerErr, "stream server error")
246+
}
235247
return nil
236248
}
237249

238-
// Stop stops the CRI service.
250+
// Close stops the CRI service.
251+
// TODO(random-liu): Make close synchronous.
239252
func (c *criService) Close() error {
240253
logrus.Info("Stop CRI service")
241-
// TODO(random-liu): Make event monitor stop synchronous.
242254
c.eventMonitor.stop()
243255
if err := c.streamServer.Stop(); err != nil {
244256
return errors.Wrap(err, "failed to stop stream server")

0 commit comments

Comments
 (0)