Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions hcloud/cloud_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/syself/hetzner-cloud-controller-manager/internal/annotation"
"github.com/syself/hetzner-cloud-controller-manager/internal/credentials"
"github.com/syself/hetzner-cloud-controller-manager/internal/hcops"
robotclient "github.com/syself/hetzner-cloud-controller-manager/internal/robot/client"
hrobot "github.com/syself/hrobot-go"
"github.com/syself/hrobot-go/models"
corev1 "k8s.io/api/core/v1"
Expand All @@ -44,7 +45,7 @@ type testEnv struct {
Server *httptest.Server
Mux *http.ServeMux
Client *hcloud.Client
RobotClient hrobot.RobotClient
RobotClient robotclient.Client
}

func (env *testEnv) Teardown() {
Expand All @@ -70,7 +71,7 @@ func newTestEnv() testEnv {
Server: server,
Mux: mux,
Client: client,
RobotClient: robotClient,
RobotClient: robotclient.New(robotClient),
}
}

Expand Down
142 changes: 142 additions & 0 deletions hcloud/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/hetznercloud/hcloud-go/v2/hcloud"
"github.com/hetznercloud/hcloud-go/v2/hcloud/schema"
"github.com/syself/hetzner-cloud-controller-manager/internal/robot/client/cache"
"github.com/syself/hrobot-go/models"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -173,6 +174,147 @@ func TestInstances_InstanceExists(t *testing.T) {
}
}

func TestInstances_InstanceExistsRobotServerCreatedAfterCacheFill(t *testing.T) {
env := newTestEnv()
defer env.Teardown()

resetEnv := Setenv(t,
"ROBOT_USER_NAME", "user",
"ROBOT_PASSWORD", "pass",
"CACHE_TIMEOUT", "1h",
)
defer resetEnv()

// servers backs the Robot list response and is mutated during the test.
servers := make([]models.Server, 0, 2)
servers = append(servers, models.Server{
ServerIP: "123.123.123.123",
ServerIPv6Net: "2a01:f48:111:4221::",
ServerNumber: 321,
Name: "bm-existing",
})
env.Mux.HandleFunc("/robot/server", func(w http.ResponseWriter, _ *http.Request) {
responses := make([]models.ServerResponse, 0, len(servers))
for _, server := range servers {
responses = append(responses, models.ServerResponse{Server: server})
}
json.NewEncoder(w).Encode(responses)
})

robotClient, err := cache.NewCachedRobotClient(t.TempDir(), env.Server.Client(), env.Server.URL+"/robot")
if err != nil {
t.Fatalf("Unexpected error creating cached robot client: %v", err)
}

instances := newInstances(env.Client, robotClient, AddressFamilyIPv4, 0)

// Warm the cache while bm-new does not exist yet.
exists, err := instances.InstanceExists(context.TODO(), &corev1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "bm-existing"},
})
if err != nil {
t.Fatalf("Unexpected error warming cache: %v", err)
}
if !exists {
t.Fatal("Expected bm-existing to exist")
}

servers = append(servers, models.Server{
ServerIP: "123.123.123.124",
ServerIPv6Net: "2a01:f48:111:4222::",
ServerNumber: 322,
Name: "bm-new",
})

exists, err = instances.InstanceExists(context.TODO(), &corev1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "bm-new"},
})
if err != nil {
t.Fatalf("Unexpected error for bm-new: %v", err)
}
if !exists {
t.Fatal("Expected bm-new to exist after it was created")
}
}

func TestInstances_InstanceExistsRobotServerRepeatedMissingNameSkipsSecondForceRefresh(t *testing.T) {
// This test exercises the name-based Robot lookup path in getRobotServerByName:
//
// 1. ServerGetList() checks the cached Robot server list.
// 2. If the name is missing there, ServerGetListForceRefresh(node.Name) does one uncached reload.
// 3. A second lookup for the same still-missing name within CACHE_TIMEOUT must not trigger
// another uncached reload.
//
// The behavior is important because CAPH can rename Robot servers during provisioning, so the
// first miss should recover from a stale cache, but repeated misses for the same name should not
// hammer the Robot API.
env := newTestEnv()
defer env.Teardown()

resetEnv := Setenv(t,
"ROBOT_USER_NAME", "user",
"ROBOT_PASSWORD", "pass",
"CACHE_TIMEOUT", "1h",
)
defer resetEnv()

robotListHTTPCalls := 0
env.Mux.HandleFunc("/robot/server", func(w http.ResponseWriter, _ *http.Request) {
robotListHTTPCalls++
json.NewEncoder(w).Encode([]models.ServerResponse{
{
Server: models.Server{
ServerIP: "123.123.123.123",
ServerIPv6Net: "2a01:f48:111:4221::",
ServerNumber: 321,
Name: "bm-existing",
},
},
})
})

robotClient, err := cache.NewCachedRobotClient(t.TempDir(), env.Server.Client(), env.Server.URL+"/robot")
if err != nil {
t.Fatalf("Unexpected error creating cached robot client: %v", err)
}

instances := newInstances(env.Client, robotClient, AddressFamilyIPv4, 0)
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "bm-missing"},
}

// First lookup for bm-missing:
// - ServerGetList() loads the current list from Robot. That is HTTP call 1.
// - bm-missing is not present, so getRobotServerByName forces one reload. That is HTTP call 2.
exists, err := instances.InstanceExists(context.TODO(), node)
if err != nil {
t.Fatalf("Unexpected error on first miss: %v", err)
}
if exists {
t.Fatal("Expected bm-missing to be absent on first lookup")
}
if robotListHTTPCalls != 2 {
t.Fatalf("Expected 2 Robot list calls after first miss, got %d", robotListHTTPCalls)
}
callsAfterFirstMiss := robotListHTTPCalls

// Second lookup for the same missing name within CACHE_TIMEOUT:
// - ServerGetList() is served from cache, so there is no extra HTTP call.
// - ServerGetListForceRefresh(node.Name) notices that bm-missing already triggered a forced
// refresh in this cache window, so it reuses the cached list instead of issuing another HTTP
// request.
exists, err = instances.InstanceExists(context.TODO(), node)
if err != nil {
t.Fatalf("Unexpected error on second miss: %v", err)
}
if exists {
t.Fatal("Expected bm-missing to be absent on second lookup")
}
if robotListHTTPCalls != callsAfterFirstMiss {
t.Fatalf("Expected repeated miss to skip force refresh, got %d Robot list calls", robotListHTTPCalls)
}
}

func TestInstances_InstanceShutdown(t *testing.T) {
env := newTestEnv()
defer env.Teardown()
Expand Down
22 changes: 20 additions & 2 deletions hcloud/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,29 @@ func getRobotServerByName(c robotclient.Client, node *corev1.Node) (server *mode

for i, s := range serverList {
if s.Name == node.Name {
server = &serverList[i]
return &serverList[i], nil
}
}

return server, nil
// CAPH changes the Robot server name during provisioning. This means the cache of the
// server-name-to-server-ID mapping could be outdated. Force one uncached Robot list reload for
// that name to handle the rename. The Robot client suppresses repeated forced refreshes for the
// same missing name until the normal Robot list cache timeout has elapsed.
serverList, err = c.ServerGetListForceRefresh(string(node.Name))
if err != nil {
hcops.HandleRateLimitExceededError(err, node)
return nil, fmt.Errorf("%s: force refresh after cache miss: %w", op, err)
}

for i, s := range serverList {
if s.Name == node.Name {
// Server was found after cache refresh
return &serverList[i], nil
}
}

// No server found.
return nil, nil
}

func getRobotServerByID(c robotclient.Client, id int, node *corev1.Node) (s *models.Server, e error) {
Expand Down
4 changes: 4 additions & 0 deletions internal/mocks/robot.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ func (m *RobotClient) ServerGetList() ([]models.Server, error) {
return getRobotServers(args, 0), args.Error(1)
}

func (m *RobotClient) ServerGetListForceRefresh(_ string) ([]models.Server, error) {
return m.ServerGetList()
}

func (m *RobotClient) SetCredentials(_, _ string) error {
args := m.Called()
return args.Error(3)
Expand Down
26 changes: 26 additions & 0 deletions internal/robot/client/adapter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package client

import (
hrobot "github.com/syself/hrobot-go"
"github.com/syself/hrobot-go/models"
)

var _ Client = &adapter{}

type adapter struct {
hrobot.RobotClient
}

// New wraps a plain Robot client so it satisfies the local Client interface.
func New(robotClient hrobot.RobotClient) Client {
if robotClient == nil {
return nil
}
return &adapter{RobotClient: robotClient}
}

// ServerGetListForceRefresh falls back to the plain list call because the
// uncached behavior only exists in the cache-backed client implementation.
func (a *adapter) ServerGetListForceRefresh(_ string) ([]models.Server, error) {
return a.ServerGetList()
}
75 changes: 71 additions & 4 deletions internal/robot/client/cache/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ const (
robotUserNameENVVar = "ROBOT_USER_NAME"
robotPasswordENVVar = "ROBOT_PASSWORD"
cacheTimeoutENVVar = "CACHE_TIMEOUT"
defaultCacheTimeout = 5 * time.Minute
)

var _ robotclient.Client = &cacheRobotClient{}
Expand All @@ -27,10 +28,16 @@ type cacheRobotClient struct {
timeout time.Duration

lastUpdate time.Time
now func() time.Time

// cache
l []models.Server
m map[int]*models.Server

// forcedRefreshServerNames stores when a node name last triggered a forced Robot server list
// refresh. While that timestamp is still within the cache timeout window, repeated lookups for
// the same missing server name skip the extra uncached Robot API call.
forcedRefreshServerNames map[string]time.Time
}

// NewCachedRobotClient creates a new robot client with caching enabled.
Expand All @@ -51,7 +58,7 @@ func NewCachedRobotClient(rootDir string, httpClient *http.Client, baseURL strin
}

if cacheTimeout == 0 {
cacheTimeout = 5 * time.Minute
cacheTimeout = defaultCacheTimeout
}

credentialsDir := credentials.GetDirectory(rootDir)
Expand Down Expand Up @@ -85,6 +92,8 @@ func NewCachedRobotClient(rootDir string, httpClient *http.Client, baseURL strin
handler := &cacheRobotClient{}
handler.timeout = cacheTimeout
handler.robotClient = c
handler.now = time.Now
handler.forcedRefreshServerNames = make(map[string]time.Time)
return handler, nil
}

Expand All @@ -105,7 +114,7 @@ func (c *cacheRobotClient) ServerGet(id int) (*models.Server, error) {
}

// set time of last update
c.lastUpdate = time.Now()
c.lastUpdate = c.currentTime()
}

server, found := c.m[id]
Expand Down Expand Up @@ -134,30 +143,88 @@ func (c *cacheRobotClient) ServerGetList() ([]models.Server, error) {
}

// set time of last update
c.lastUpdate = time.Now()
c.lastUpdate = c.currentTime()
}

return c.l, nil
}

// ServerGetListForceRefresh invalidates the current cache and reloads the list
// from Robot unless nodeName already triggered a forced refresh within the
// current timeout window.
func (c *cacheRobotClient) ServerGetListForceRefresh(nodeName string) ([]models.Server, error) {
if nodeName != "" && c.nodeHasAlreadyForcedRefresh(nodeName) {
return c.ServerGetList()
}

// setting cache of serverId to serverName mapping to nil, so that the next ServerGetList() will
// call the robot API to get the new data.
c.m = nil

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment that you do this in order to delete the cache and to call the actual API?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

part of 57538d0


list, err := c.ServerGetList()
if err != nil {
return nil, err
}

if nodeName != "" {
c.forcedRefreshServerNames[nodeName] = c.currentTime()
}

return list, nil
}

// nodeHasAlreadyForcedRefresh reports whether nodeName already triggered a
// forced refresh within the current cache timeout window.
func (c *cacheRobotClient) nodeHasAlreadyForcedRefresh(nodeName string) bool {
if c.forcedRefreshServerNames == nil {
return false
}

forcedAt, found := c.forcedRefreshServerNames[nodeName]
if !found {
return false
}

if c.currentTime().After(forcedAt.Add(c.forceRefreshTimeout())) {
delete(c.forcedRefreshServerNames, nodeName)
return false
}

return true
}
func (c *cacheRobotClient) shouldSync() bool {
// map is nil means we have no cached value yet
if c.m == nil {
c.m = make(map[int]*models.Server)
return true
}
if time.Now().After(c.lastUpdate.Add(c.timeout)) {
if c.currentTime().After(c.lastUpdate.Add(c.timeout)) {
return true
}
return false
}

func (c *cacheRobotClient) currentTime() time.Time {
if c.now == nil {
return time.Now()
}
return c.now()
}

func (c *cacheRobotClient) forceRefreshTimeout() time.Duration {
if c.timeout == 0 {
return defaultCacheTimeout
}
return c.timeout
}

func (c *cacheRobotClient) SetCredentials(username, password string) error {
err := c.robotClient.SetCredentials(username, password)
if err != nil {
return err
}
// The credentials have been updated, so we need to invalidate the cache.
c.m = nil
c.forcedRefreshServerNames = nil
return nil
}
Loading
Loading