diff --git a/.changelog/27161.txt b/.changelog/27161.txt new file mode 100644 index 00000000000..400f8649210 --- /dev/null +++ b/.changelog/27161.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Added a new `fingerprint` configuration block which allows users to specify retry behavior for the `env_aws`, `env_azure`, `env_digitalocean` and `env_gcp` fingerprinters. +``` diff --git a/client/config/config.go b/client/config/config.go index 5b6525d1803..1d8083f1c86 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -395,6 +395,10 @@ type Config struct { // LogFile is used by MonitorExport to stream a server's log file LogFile string `hcl:"log_file"` + + // Fingerprinters is a map of fingerprinter configurations by name. This + // currently only applies to env fingerprinters such as "env_aws". + Fingerprinters map[string]*Fingerprint } type APIListenerRegistrar interface { @@ -931,6 +935,7 @@ func DefaultConfig() *Config { MinDynamicUser: 80_000, MaxDynamicUser: 89_999, }, + Fingerprinters: map[string]*Fingerprint{}, } return cfg diff --git a/client/config/fingerprint.go b/client/config/fingerprint.go new file mode 100644 index 00000000000..8cec29b5240 --- /dev/null +++ b/client/config/fingerprint.go @@ -0,0 +1,120 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package config + +import ( + "errors" + "fmt" + "slices" + "strings" + "time" +) + +// validEnvFingerprinters contains the fingerprinters that are valid +// environment fingerprinters and is used for input validation. +var validEnvFingerprinters = []string{ + "env_aws", + "env_azure", + "env_gce", + "env_digitalocean", +} + +// Fingerprint is an optional configuration block for environment fingerprinters +// can control retry behavior and failure handling. +type Fingerprint struct { + + // Name is the fingerprinter identifier that this configuration block + // relates to. It is gathered from the HCL block label. + Name string `hcl:",key"` + + // RetryInterval specifies the time to wait between fingerprint + // attempts. + RetryInterval time.Duration + RetryIntervalHCL string `hcl:"retry_interval,optional"` + + // RetryAttempts specifies the maximum number of fingerprint attempts to be + // made before the failure is considered terminal. + RetryAttempts int `hcl:"retry_attempts,optional"` + + // ExitOnFailure indicates whether the fingerprinter should cause the agent + // to exit if it fails to correctly perform its fingerprint run. This is + // useful if the fingerprinter provides critical information used by Nomad + // workloads. + ExitOnFailure *bool `hcl:"exit_on_failure,optional"` + + // ExtraKeysHCL is used by hcl to surface unexpected keys + ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"` +} + +// Copy is used to satisfy to helper.Copyable interface, so we can perform +// copies of the fingerprint config slice. +func (f *Fingerprint) Copy() *Fingerprint { + if f == nil { + return nil + } + + c := new(Fingerprint) + *c = *f + return c +} + +// Merge is used to combine two fingerprint blocks with the block passed into +// the function taking precedence. The name is not overwritten as this is +// expected to match as it's the block label. It is the callers responsibility +// to ensure the two fingerprint blocks are for the same fingerprinter +// implementation. +func (f *Fingerprint) Merge(z *Fingerprint) *Fingerprint { + if f == nil { + return z + } + + result := *f + + if z == nil { + return &result + } + + if z.RetryInterval != 0 { + result.RetryInterval = z.RetryInterval + } + if z.RetryIntervalHCL != "" { + result.RetryIntervalHCL = z.RetryIntervalHCL + } + if z.RetryAttempts != 0 { + result.RetryAttempts = z.RetryAttempts + } + if z.ExitOnFailure != nil { + result.ExitOnFailure = z.ExitOnFailure + } + + return &result +} + +// Validate the fingerprint block to ensure we do not have any values that +// cannot be handled. +func (f *Fingerprint) Validate() error { + + if f == nil { + return nil + } + + if f.Name == "" { + return errors.New("fingerprint name cannot be empty") + } + if !slices.Contains(validEnvFingerprinters, f.Name) { + return fmt.Errorf("fingerprint %q does not support configuration", f.Name) + } + if f.RetryInterval < 0 { + return fmt.Errorf("fingerprint %q retry interval cannot be negative", f.Name) + } + if f.RetryAttempts < -1 { + return fmt.Errorf("fingerprint %q retry attempts cannot be less than -1", f.Name) + } + if len(f.ExtraKeysHCL) > 0 { + return fmt.Errorf("fingerprint %q contains unknown configuration options: %s", + f.Name, strings.Join(f.ExtraKeysHCL, ",")) + } + + return nil +} diff --git a/client/config/fingerprint_test.go b/client/config/fingerprint_test.go new file mode 100644 index 00000000000..e9759f0403d --- /dev/null +++ b/client/config/fingerprint_test.go @@ -0,0 +1,292 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package config + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/ci" + "github.com/shoenig/test/must" +) + +func TestFingerprint_Copy(t *testing.T) { + ci.Parallel(t) + + t.Run("nil", func(t *testing.T) { + var f *Fingerprint + result := f.Copy() + must.Nil(t, result) + }) + + t.Run("full", func(t *testing.T) { + exitOnFailure := true + original := &Fingerprint{ + Name: "env_aws", + RetryInterval: 5 * time.Minute, + RetryIntervalHCL: "5m", + RetryAttempts: 3, + ExitOnFailure: &exitOnFailure, + } + + copied := original.Copy() + + must.Eq(t, original.Name, copied.Name) + must.Eq(t, original.RetryInterval, copied.RetryInterval) + must.Eq(t, original.RetryIntervalHCL, copied.RetryIntervalHCL) + must.Eq(t, original.RetryAttempts, copied.RetryAttempts) + must.Eq(t, *original.ExitOnFailure, *copied.ExitOnFailure) + }) + + t.Run("empty", func(t *testing.T) { + original := &Fingerprint{} + copied := original.Copy() + + must.NotNil(t, copied) + must.Eq(t, "", copied.Name) + must.Eq(t, time.Duration(0), copied.RetryInterval) + must.Eq(t, "", copied.RetryIntervalHCL) + must.Eq(t, 0, copied.RetryAttempts) + must.Nil(t, copied.ExitOnFailure) + }) +} + +func TestFingerprint_Merge(t *testing.T) { + ci.Parallel(t) + + t.Run("nil receiver", func(t *testing.T) { + var f *Fingerprint + other := &Fingerprint{ + Name: "env_aws", + RetryInterval: 5 * time.Minute, + } + + result := f.Merge(other) + must.Eq(t, other, result) + }) + + t.Run("nil argument", func(t *testing.T) { + exitOnFailure := true + f := &Fingerprint{ + Name: "env_aws", + RetryInterval: 5 * time.Minute, + RetryAttempts: 3, + ExitOnFailure: &exitOnFailure, + } + + result := f.Merge(nil) + must.Eq(t, f.Name, result.Name) + must.Eq(t, f.RetryInterval, result.RetryInterval) + must.Eq(t, f.RetryAttempts, result.RetryAttempts) + must.Eq(t, *f.ExitOnFailure, *result.ExitOnFailure) + }) + + t.Run("merge overwrites non-zero values", func(t *testing.T) { + exitOnFailure1 := false + exitOnFailure2 := true + + base := &Fingerprint{ + Name: "env_aws", + RetryInterval: 5 * time.Minute, + RetryIntervalHCL: "5m", + RetryAttempts: 3, + ExitOnFailure: &exitOnFailure1, + } + + override := &Fingerprint{ + Name: "env_aws", + RetryInterval: 10 * time.Minute, + RetryIntervalHCL: "10m", + RetryAttempts: 5, + ExitOnFailure: &exitOnFailure2, + } + + result := base.Merge(override) + + must.Eq(t, "env_aws", result.Name) + must.Eq(t, 10*time.Minute, result.RetryInterval) + must.Eq(t, "10m", result.RetryIntervalHCL) + must.Eq(t, 5, result.RetryAttempts) + must.True(t, *result.ExitOnFailure) + }) + + t.Run("merge preserves base values for zero values in override", func(t *testing.T) { + exitOnFailure := true + base := &Fingerprint{ + Name: "env_aws", + RetryInterval: 5 * time.Minute, + RetryIntervalHCL: "5m", + RetryAttempts: 3, + ExitOnFailure: &exitOnFailure, + } + + override := &Fingerprint{ + Name: "env_aws", + } + + result := base.Merge(override) + + must.Eq(t, "env_aws", result.Name) + must.Eq(t, 5*time.Minute, result.RetryInterval) + must.Eq(t, "5m", result.RetryIntervalHCL) + must.Eq(t, 3, result.RetryAttempts) + must.True(t, *result.ExitOnFailure) + }) + + t.Run("merge partial override", func(t *testing.T) { + base := &Fingerprint{ + Name: "env_azure", + RetryInterval: 5 * time.Minute, + RetryAttempts: 3, + } + + newExitOnFailure := true + override := &Fingerprint{ + Name: "env_azure", + RetryAttempts: 10, + ExitOnFailure: &newExitOnFailure, + } + + result := base.Merge(override) + + must.Eq(t, "env_azure", result.Name) + must.Eq(t, 5*time.Minute, result.RetryInterval) + must.Eq(t, 10, result.RetryAttempts) + must.True(t, *result.ExitOnFailure) + }) + + t.Run("merge does not mutate original", func(t *testing.T) { + base := &Fingerprint{ + Name: "env_gce", + RetryInterval: 5 * time.Minute, + RetryAttempts: 3, + } + + override := &Fingerprint{ + Name: "env_gce", + RetryInterval: 10 * time.Minute, + } + + result := base.Merge(override) + + must.Eq(t, 5*time.Minute, base.RetryInterval) + must.Eq(t, 3, base.RetryAttempts) + must.Eq(t, 10*time.Minute, result.RetryInterval) + must.Eq(t, 3, result.RetryAttempts) + }) +} + +func TestFingerprint_Validate(t *testing.T) { + ci.Parallel(t) + + t.Run("nil fingerprint", func(t *testing.T) { + var f *Fingerprint + must.NoError(t, f.Validate()) + }) + + t.Run("empty name", func(t *testing.T) { + f := &Fingerprint{ + Name: "", + } + must.ErrorContains(t, f.Validate(), "fingerprint name cannot be empty") + }) + + t.Run("invalid fingerprinter name", func(t *testing.T) { + f := &Fingerprint{ + Name: "invalid_fingerprinter", + } + must.ErrorContains(t, f.Validate(), "does not support configuration") + }) + + t.Run("negative retry interval", func(t *testing.T) { + f := &Fingerprint{ + Name: "env_aws", + RetryInterval: -5 * time.Minute, + } + must.ErrorContains(t, f.Validate(), "retry interval cannot be negative") + }) + + t.Run("retry attempts less than -1", func(t *testing.T) { + f := &Fingerprint{ + Name: "env_aws", + RetryAttempts: -2, + } + must.ErrorContains(t, f.Validate(), "retry attempts cannot be less than -1") + }) + + t.Run("retry attempts of -1 is valid", func(t *testing.T) { + f := &Fingerprint{ + Name: "env_aws", + RetryAttempts: -1, + } + must.NoError(t, f.Validate()) + }) + + t.Run("valid env_aws fingerprint", func(t *testing.T) { + exitOnFailure := true + f := &Fingerprint{ + Name: "env_aws", + RetryInterval: 5 * time.Minute, + RetryAttempts: 3, + ExitOnFailure: &exitOnFailure, + } + must.NoError(t, f.Validate()) + }) + + t.Run("valid env_azure fingerprint", func(t *testing.T) { + f := &Fingerprint{ + Name: "env_azure", + RetryInterval: 10 * time.Minute, + RetryAttempts: 5, + } + must.NoError(t, f.Validate()) + }) + + t.Run("valid env_gce fingerprint", func(t *testing.T) { + f := &Fingerprint{ + Name: "env_gce", + RetryInterval: 2 * time.Minute, + RetryAttempts: 10, + } + must.NoError(t, f.Validate()) + }) + + t.Run("valid env_digitalocean fingerprint", func(t *testing.T) { + f := &Fingerprint{ + Name: "env_digitalocean", + RetryInterval: 1 * time.Minute, + RetryAttempts: 0, + } + must.NoError(t, f.Validate()) + }) + + t.Run("valid fingerprint with zero values", func(t *testing.T) { + f := &Fingerprint{ + Name: "env_aws", + RetryInterval: 0, + RetryAttempts: 0, + } + must.NoError(t, f.Validate()) + }) + + t.Run("unknown keys", func(t *testing.T) { + f := &Fingerprint{ + Name: "env_aws", + RetryInterval: 0, + RetryAttempts: 0, + ExtraKeysHCL: []string{"bad"}, + } + must.ErrorContains(t, f.Validate(), "unknown configuration options: bad") + }) +} + +func Test_validEnvFingerprinters(t *testing.T) { + expectedFingerprinters := []string{ + "env_aws", + "env_azure", + "env_gce", + "env_digitalocean", + } + must.Eq(t, expectedFingerprinters, validEnvFingerprinters) +} diff --git a/client/fingerprint/env_aws.go b/client/fingerprint/env_aws.go index 1944bb489db..8f4d856de6e 100644 --- a/client/fingerprint/env_aws.go +++ b/client/fingerprint/env_aws.go @@ -26,6 +26,10 @@ const ( // AwsMetadataTimeout is the timeout used when contacting the AWS metadata // services. AwsMetadataTimeout = 2 * time.Second + + // awsFingerprinterName is the name of the AWS fingerprinter and is used + // in configuration and logging. + awsFingerprinterName = "env_aws" ) // map of instance type to approximate speed, in Mbits/s @@ -59,12 +63,17 @@ type EnvAWSFingerprint struct { logger log.Logger } -// NewEnvAWSFingerprint is used to create a fingerprint from AWS metadata +// NewEnvAWSFingerprint is used to create a fingerprint from AWS metadata. It +// wraps the fingerprinter in a retry wrapper. func NewEnvAWSFingerprint(logger log.Logger) Fingerprint { - f := &EnvAWSFingerprint{ - logger: logger.Named("env_aws"), - } - return f + namedLogger := logger.Named(awsFingerprinterName) + return NewRetryWrapper( + &EnvAWSFingerprint{ + logger: namedLogger, + }, + namedLogger, + awsFingerprinterName, + ) } func (f *EnvAWSFingerprint) Fingerprint(request *FingerprintRequest, response *FingerprintResponse) error { @@ -85,9 +94,8 @@ func (f *EnvAWSFingerprint) Fingerprint(request *FingerprintRequest, response *F return fmt.Errorf("failed to setup IMDS client: %v", err) } - if !isAWS(ctx, imdsClient) { - f.logger.Debug("error querying AWS IDMS URL, skipping") - return nil + if err := awsProbe(ctx, imdsClient); err != nil { + return wrapProbeError(err) } // Keys and whether they should be namespaced as unique. Any key whose value @@ -271,20 +279,22 @@ func (f *EnvAWSFingerprint) imdsClient(ctx context.Context) (*imds.Client, error return imdsClient, nil } -func isAWS(ctx context.Context, client *imds.Client) bool { - resp, err := client.GetMetadata(ctx, &imds.GetMetadataInput{ - Path: "ami-id", - }) +func awsProbe(ctx context.Context, client *imds.Client) error { + resp, err := client.GetMetadata(ctx, &imds.GetMetadataInput{Path: "ami-id"}) if err != nil { - return false + return fmt.Errorf("failed to query AWS metadata: %w", err) } s, err := readMetadataResponse(resp) if err != nil { - return false + return fmt.Errorf("failed to read respose: %w", err) + } + + if s == "" { + return errors.New("empty response from AWS metadata") } - return s != "" + return nil } // readImdsResponse reads and formats the IMDS response diff --git a/client/fingerprint/env_aws_test.go b/client/fingerprint/env_aws_test.go index f2fd416d334..89644d69868 100644 --- a/client/fingerprint/env_aws_test.go +++ b/client/fingerprint/env_aws_test.go @@ -19,11 +19,25 @@ import ( "github.com/stretchr/testify/require" ) +func Test_NewEnvAWSFingerprint(t *testing.T) { + ci.Parallel(t) + + f := NewEnvAWSFingerprint(testlog.HCLogger(t)) + must.NotNil(t, f) + + retryWrapper, ok := f.(*RetryWrapper) + must.True(t, ok) + must.Eq(t, awsFingerprinterName, retryWrapper.name) + + _, ok = retryWrapper.fingerprinter.(*EnvAWSFingerprint) + must.True(t, ok) +} + func TestEnvAWSFingerprint_nonAws(t *testing.T) { ci.Parallel(t) f := NewEnvAWSFingerprint(testlog.HCLogger(t)) - f.(*EnvAWSFingerprint).endpoint = "http://127.0.0.1/latest" + f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).endpoint = "http://127.0.0.1/latest" node := &structs.Node{ Attributes: make(map[string]string), @@ -43,7 +57,7 @@ func TestEnvAWSFingerprint_aws(t *testing.T) { defer cleanup() f := NewEnvAWSFingerprint(testlog.HCLogger(t)) - f.(*EnvAWSFingerprint).endpoint = endpoint + f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).endpoint = endpoint node := &structs.Node{ Attributes: make(map[string]string), @@ -114,7 +128,7 @@ func TestEnvAWSFingerprint_handleImdsError(t *testing.T) { } for _, c := range cases { - err := f.(*EnvAWSFingerprint).handleImdsError(c.err, "some attribute") + err := f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).handleImdsError(c.err, "some attribute") must.Eq(t, c.exp, err) } } @@ -126,7 +140,7 @@ func TestNetworkFingerprint_AWS(t *testing.T) { defer cleanup() f := NewEnvAWSFingerprint(testlog.HCLogger(t)) - f.(*EnvAWSFingerprint).endpoint = endpoint + f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).endpoint = endpoint node := &structs.Node{ Attributes: make(map[string]string), @@ -156,7 +170,7 @@ func TestNetworkFingerprint_AWS_network(t *testing.T) { defer cleanup() f := NewEnvAWSFingerprint(testlog.HCLogger(t)) - f.(*EnvAWSFingerprint).endpoint = endpoint + f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).endpoint = endpoint { node := &structs.Node{ @@ -219,7 +233,7 @@ func TestNetworkFingerprint_AWS_NoNetwork(t *testing.T) { defer cleanup() f := NewEnvAWSFingerprint(testlog.HCLogger(t)) - f.(*EnvAWSFingerprint).endpoint = endpoint + f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).endpoint = endpoint node := &structs.Node{ Attributes: make(map[string]string), @@ -247,7 +261,7 @@ func TestNetworkFingerprint_AWS_IncompleteImitation(t *testing.T) { defer cleanup() f := NewEnvAWSFingerprint(testlog.HCLogger(t)) - f.(*EnvAWSFingerprint).endpoint = endpoint + f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).endpoint = endpoint node := &structs.Node{ Attributes: make(map[string]string), @@ -271,7 +285,7 @@ func TestCPUFingerprint_AWS_InstanceFound(t *testing.T) { defer cleanup() f := NewEnvAWSFingerprint(testlog.HCLogger(t)) - f.(*EnvAWSFingerprint).endpoint = endpoint + f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).endpoint = endpoint node := &structs.Node{Attributes: make(map[string]string)} @@ -289,7 +303,7 @@ func TestCPUFingerprint_AWS_InstanceNotFound(t *testing.T) { defer cleanup() f := NewEnvAWSFingerprint(testlog.HCLogger(t)) - f.(*EnvAWSFingerprint).endpoint = endpoint + f.(*RetryWrapper).fingerprinter.(*EnvAWSFingerprint).endpoint = endpoint node := &structs.Node{Attributes: make(map[string]string)} diff --git a/client/fingerprint/env_azure.go b/client/fingerprint/env_azure.go index 5f0ed3faaef..ddc7d9d3de0 100644 --- a/client/fingerprint/env_azure.go +++ b/client/fingerprint/env_azure.go @@ -5,6 +5,7 @@ package fingerprint import ( "encoding/json" + "errors" "fmt" "io" "net/http" @@ -31,6 +32,10 @@ const ( // AzureMetadataTimeout is the timeout used when contacting the Azure metadata // services. AzureMetadataTimeout = 2 * time.Second + + // azureFingerprinterName is the name of the Azure fingerprinter and used in + // configuration and logging. + azureFingerprinterName = "env_azure" ) type AzureMetadataTag struct { @@ -66,11 +71,17 @@ func NewEnvAzureFingerprint(logger log.Logger) Fingerprint { Transport: cleanhttp.DefaultTransport(), } - return &EnvAzureFingerprint{ - client: client, - logger: logger.Named("env_azure"), - metadataURL: metadataURL, - } + namedLogger := logger.Named(azureFingerprinterName) + + return NewRetryWrapper( + &EnvAzureFingerprint{ + client: client, + logger: namedLogger, + metadataURL: metadataURL, + }, + namedLogger, + azureFingerprinterName, + ) } func (f *EnvAzureFingerprint) Get(attribute string, format string) (string, error) { @@ -131,8 +142,8 @@ func (f *EnvAzureFingerprint) Fingerprint(request *FingerprintRequest, response f.client.Timeout = 1 * time.Millisecond } - if !f.isAzure() { - return nil + if err := f.azureProbe(); err != nil { + return wrapProbeError(err) } // Keys and whether they should be namespaced as unique. Any key whose value @@ -209,8 +220,15 @@ func (f *EnvAzureFingerprint) Fingerprint(request *FingerprintRequest, response return nil } -func (f *EnvAzureFingerprint) isAzure() bool { +func (f *EnvAzureFingerprint) azureProbe() error { v, err := f.Get("compute/azEnvironment", "text") - v = strings.TrimSpace(v) - return err == nil && v != "" + if err != nil { + return err + } + + if v = strings.TrimSpace(v); v == "" { + return errors.New("empty AZ Environment value") + } + + return nil } diff --git a/client/fingerprint/env_azure_test.go b/client/fingerprint/env_azure_test.go index f796306a603..cceb3a11a1e 100644 --- a/client/fingerprint/env_azure_test.go +++ b/client/fingerprint/env_azure_test.go @@ -11,11 +11,27 @@ import ( "strings" "testing" + "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" ) +func Test_NewEnvAzureFingerprint(t *testing.T) { + ci.Parallel(t) + + f := NewEnvAzureFingerprint(testlog.HCLogger(t)) + must.NotNil(t, f) + + retryWrapper, ok := f.(*RetryWrapper) + must.True(t, ok) + must.Eq(t, azureFingerprinterName, retryWrapper.name) + + _, ok = retryWrapper.fingerprinter.(*EnvAzureFingerprint) + must.True(t, ok) +} + func TestAzureFingerprint_nonAzure(t *testing.T) { t.Setenv("AZURE_ENV_URL", "http://127.0.0.1/metadata/instance/") @@ -45,57 +61,10 @@ func testFingerprint_Azure(t *testing.T, withExternalIp bool) { Attributes: make(map[string]string), } - // configure mock server with fixture routes, data - routes := routes{} - if err := json.Unmarshal([]byte(AZURE_routes), &routes); err != nil { - t.Fatalf("Failed to unmarshal JSON in GCE ENV test: %s", err) - } - if withExternalIp { - networkEndpoint := &endpoint{ - Uri: "/metadata/instance/network/interface/0/ipv4/ipAddress/0/publicIpAddress", - ContentType: "text/plain", - Body: "104.44.55.66", - } - routes.Endpoints = append(routes.Endpoints, networkEndpoint) - } + testMetadataServer := azureTestMetadataServer(t, withExternalIp) + defer testMetadataServer.Close() - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - value, ok := r.Header["Metadata"] - if !ok { - t.Fatal("Metadata not present in HTTP request header") - } - if value[0] != "true" { - t.Fatalf("Expected Metadata true, saw %s", value[0]) - } - - uavalue, ok := r.Header["User-Agent"] - if !ok { - t.Fatal("User-Agent not present in HTTP request header") - } - if !strings.Contains(uavalue[0], "Nomad/") { - t.Fatalf("Expected User-Agent to contain Nomad/, got %s", uavalue[0]) - } - - uri := r.RequestURI - if r.URL.RawQuery != "" { - uri = strings.Replace(uri, "?"+r.URL.RawQuery, "", 1) - } - - found := false - for _, e := range routes.Endpoints { - if uri == e.Uri { - w.Header().Set("Content-Type", e.ContentType) - fmt.Fprintln(w, e.Body) - found = true - } - } - - if !found { - w.WriteHeader(http.StatusNotFound) - } - })) - defer ts.Close() - t.Setenv("AZURE_ENV_URL", ts.URL+"/metadata/instance/") + t.Setenv("AZURE_ENV_URL", testMetadataServer.URL+"/metadata/instance/") f := NewEnvAzureFingerprint(testlog.HCLogger(t)) request := &FingerprintRequest{Config: &config.Config{}, Node: node} @@ -155,6 +124,50 @@ func testFingerprint_Azure(t *testing.T, withExternalIp bool) { } +func azureTestMetadataServer(t *testing.T, externalIP bool) *httptest.Server { + + // configure mock server with fixture routes, data + routes := routes{} + must.NoError(t, json.Unmarshal([]byte(AZURE_routes), &routes)) + + if externalIP { + networkEndpoint := &endpoint{ + Uri: "/metadata/instance/network/interface/0/ipv4/ipAddress/0/publicIpAddress", + ContentType: "text/plain", + Body: "104.44.55.66", + } + routes.Endpoints = append(routes.Endpoints, networkEndpoint) + } + + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + value, ok := r.Header["Metadata"] + must.True(t, ok) + must.Eq(t, "true", value[0]) + + uavalue, ok := r.Header["User-Agent"] + must.True(t, ok) + must.StrContains(t, uavalue[0], "Nomad/") + + uri := r.RequestURI + if r.URL.RawQuery != "" { + uri = strings.Replace(uri, "?"+r.URL.RawQuery, "", 1) + } + + found := false + for _, e := range routes.Endpoints { + if uri == e.Uri { + w.Header().Set("Content-Type", e.ContentType) + fmt.Fprintln(w, e.Body) + found = true + } + } + + if !found { + w.WriteHeader(http.StatusNotFound) + } + })) +} + const AZURE_routes = ` { "endpoints": [ @@ -220,3 +233,44 @@ func TestFingerprint_AzureWithExternalIp(t *testing.T) { func TestFingerprint_AzureWithoutExternalIp(t *testing.T) { testFingerprint_Azure(t, false) } + +func TestEnvAzureFingerprint_azureProbe(t *testing.T) { + + testCases := []struct { + name string + azureEnv bool + }{ + { + name: "Azure Environment", + azureEnv: true, + }, + { + name: "Non-Azure Environment", + azureEnv: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + + if tc.azureEnv { + testMetadataServer := azureTestMetadataServer(t, false) + defer testMetadataServer.Close() + t.Setenv("AZURE_ENV_URL", testMetadataServer.URL+"/metadata/instance/") + } else { + // GitHub Actions run in Azure, so we point to a non-existent + // server to simulate a non-Azure environment. + t.Setenv("AZURE_ENV_URL", "http://127.0.0.1/metadata/instance/") + } + + f := NewEnvAzureFingerprint(testlog.HCLogger(t)) + err := f.(*RetryWrapper).fingerprinter.(*EnvAzureFingerprint).azureProbe() + + if tc.azureEnv { + must.NoError(t, err) + } else { + must.Error(t, err) + } + }) + } +} diff --git a/client/fingerprint/env_digitalocean.go b/client/fingerprint/env_digitalocean.go index 0a7d3713838..ba7480843f2 100644 --- a/client/fingerprint/env_digitalocean.go +++ b/client/fingerprint/env_digitalocean.go @@ -4,6 +4,7 @@ package fingerprint import ( + "errors" "fmt" "io" "net/http" @@ -27,6 +28,10 @@ const ( // DigitalOceanMetadataTimeout is the timeout used when contacting the DigitalOcean metadata // services. DigitalOceanMetadataTimeout = 2 * time.Second + + // digitalOceanFingerprinterName is the name of the Digital Ocean + // fingerprinter and used in configuration and logging. + digitalOceanFingerprinterName = "env_digitalocean" ) type DigitalOceanMetadataPair struct { @@ -57,11 +62,17 @@ func NewEnvDigitalOceanFingerprint(logger log.Logger) Fingerprint { Transport: cleanhttp.DefaultTransport(), } - return &EnvDigitalOceanFingerprint{ - client: client, - logger: logger.Named("env_digitalocean"), - metadataURL: metadataURL, - } + namedLogger := logger.Named(digitalOceanFingerprinterName) + + return NewRetryWrapper( + &EnvDigitalOceanFingerprint{ + client: client, + logger: namedLogger, + metadataURL: metadataURL, + }, + namedLogger, + digitalOceanFingerprinterName, + ) } func (f *EnvDigitalOceanFingerprint) Get(attribute string, format string) (string, error) { @@ -108,8 +119,8 @@ func (f *EnvDigitalOceanFingerprint) Fingerprint(request *FingerprintRequest, re f.client.Timeout = 1 * time.Millisecond } - if !f.isDigitalOcean() { - return nil + if err := f.digitalOceanProbe(); err != nil { + return wrapProbeError(err) } // Keys and whether they should be namespaced as unique. Any key whose value @@ -159,8 +170,14 @@ func (f *EnvDigitalOceanFingerprint) Fingerprint(request *FingerprintRequest, re return nil } -func (f *EnvDigitalOceanFingerprint) isDigitalOcean() bool { +func (f *EnvDigitalOceanFingerprint) digitalOceanProbe() error { v, err := f.Get("region", "text") - v = strings.TrimSpace(v) - return err == nil && v != "" + if err != nil { + return err + } + + if v = strings.TrimSpace(v); v == "" { + return errors.New("empty region value") + } + return nil } diff --git a/client/fingerprint/env_digitalocean_test.go b/client/fingerprint/env_digitalocean_test.go index 80442877151..6b607129c18 100644 --- a/client/fingerprint/env_digitalocean_test.go +++ b/client/fingerprint/env_digitalocean_test.go @@ -11,12 +11,28 @@ import ( "strings" "testing" + "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" "github.com/stretchr/testify/assert" ) +func Test_NewEnvDigitalOceanFingerprint(t *testing.T) { + ci.Parallel(t) + + f := NewEnvDigitalOceanFingerprint(testlog.HCLogger(t)) + must.NotNil(t, f) + + retryWrapper, ok := f.(*RetryWrapper) + must.True(t, ok) + must.Eq(t, digitalOceanFingerprinterName, retryWrapper.name) + + _, ok = retryWrapper.fingerprinter.(*EnvDigitalOceanFingerprint) + must.True(t, ok) +} + func TestDigitalOceanFingerprint_nonDigitalOcean(t *testing.T) { t.Setenv("DO_ENV_URL", "http://127.0.0.1/metadata/v1/") @@ -47,41 +63,10 @@ func TestFingerprint_DigitalOcean(t *testing.T) { Attributes: make(map[string]string), } - // configure mock server with fixture routes, data - routes := routes{} - if err := json.Unmarshal([]byte(DO_routes), &routes); err != nil { - t.Fatalf("Failed to unmarshal JSON in DO ENV test: %s", err) - } - - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - uavalue, ok := r.Header["User-Agent"] - if !ok { - t.Fatal("User-Agent not present in HTTP request header") - } - if !strings.Contains(uavalue[0], "Nomad/") { - t.Fatalf("Expected User-Agent to contain Nomad/, got %s", uavalue[0]) - } - - uri := r.RequestURI - if r.URL.RawQuery != "" { - uri = strings.Replace(uri, "?"+r.URL.RawQuery, "", 1) - } + testMetadataServer := digitalOceanTestMetadataServer(t) + defer testMetadataServer.Close() - found := false - for _, e := range routes.Endpoints { - if uri == e.Uri { - w.Header().Set("Content-Type", e.ContentType) - fmt.Fprintln(w, e.Body) - found = true - } - } - - if !found { - w.WriteHeader(http.StatusNotFound) - } - })) - defer ts.Close() - t.Setenv("DO_ENV_URL", ts.URL+"/metadata/v1/") + t.Setenv("DO_ENV_URL", testMetadataServer.URL+"/metadata/v1/") f := NewEnvDigitalOceanFingerprint(testlog.HCLogger(t)) request := &FingerprintRequest{Config: &config.Config{}, Node: node} @@ -120,6 +105,74 @@ func TestFingerprint_DigitalOcean(t *testing.T) { assertNodeAttributeEquals(t, response.Attributes, "unique.platform.digitalocean.public-ipv6", "c99c:8ac5:3112:204b:48b0:41aa:e085:d11a") } +func TestEnvDigitalOceanFingerprint_digitalOceanProbe(t *testing.T) { + + testCases := []struct { + name string + doEnv bool + }{ + { + name: "Digital Ocean Environment", + doEnv: true, + }, + { + name: "Non-Digital Ocean Environment", + doEnv: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + + if tc.doEnv { + testMetadataServer := digitalOceanTestMetadataServer(t) + defer testMetadataServer.Close() + t.Setenv("DO_ENV_URL", testMetadataServer.URL+"/metadata/v1/") + } + + f := NewEnvDigitalOceanFingerprint(testlog.HCLogger(t)) + err := f.(*RetryWrapper).fingerprinter.(*EnvDigitalOceanFingerprint).digitalOceanProbe() + + if tc.doEnv { + must.NoError(t, err) + } else { + must.Error(t, err) + } + }) + } +} + +func digitalOceanTestMetadataServer(t *testing.T) *httptest.Server { + + // configure mock server with fixture routes, data. + routes := routes{} + must.NoError(t, json.Unmarshal([]byte(DO_routes), &routes)) + + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + uavalue, ok := r.Header["User-Agent"] + must.True(t, ok) + must.StrContains(t, uavalue[0], "Nomad/") + + uri := r.RequestURI + if r.URL.RawQuery != "" { + uri = strings.Replace(uri, "?"+r.URL.RawQuery, "", 1) + } + + found := false + for _, e := range routes.Endpoints { + if uri == e.Uri { + w.Header().Set("Content-Type", e.ContentType) + fmt.Fprintln(w, e.Body) + found = true + } + } + + if !found { + w.WriteHeader(http.StatusNotFound) + } + })) +} + const DO_routes = ` { "endpoints": [ diff --git a/client/fingerprint/env_gce.go b/client/fingerprint/env_gce.go index 7d773c6bb5e..36c01d5c353 100644 --- a/client/fingerprint/env_gce.go +++ b/client/fingerprint/env_gce.go @@ -30,6 +30,10 @@ const ( // GceMetadataTimeout is the timeout used when contacting the GCE metadata // service GceMetadataTimeout = 2 * time.Second + + // gceFingerprinterName is the name of the GCE fingerprinter and used in + // configuration and logging. + gceFingerprinterName = "env_gce" ) type GCEMetadataNetworkInterface struct { @@ -78,11 +82,17 @@ func NewEnvGCEFingerprint(logger log.Logger) Fingerprint { Transport: cleanhttp.DefaultTransport(), } - return &EnvGCEFingerprint{ - client: client, - logger: logger.Named("env_gce"), - metadataURL: metadataURL, - } + namedLogger := logger.Named(gceFingerprinterName) + + return NewRetryWrapper( + &EnvGCEFingerprint{ + client: client, + logger: namedLogger, + metadataURL: metadataURL, + }, + namedLogger, + gceFingerprinterName, + ) } func (f *EnvGCEFingerprint) Get(attribute string, recursive bool) (string, error) { @@ -147,8 +157,8 @@ func (f *EnvGCEFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerpri f.client.Timeout = 1 * time.Millisecond } - if !f.isGCE() { - return nil + if err := f.gceProbe(); err != nil { + return wrapProbeError(err) } // Keys and whether they should be namespaced as unique. Any key whose value @@ -275,23 +285,23 @@ func (f *EnvGCEFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerpri return nil } -func (f *EnvGCEFingerprint) isGCE() bool { +func (f *EnvGCEFingerprint) gceProbe() error { // TODO: better way to detect GCE? - // Query the metadata url for the machine type, to verify we're on GCE + // Query the metadata url for the machine type, to verify we're on GCE. machineType, err := f.Get("machine-type", false) if err != nil { - if re, ok := err.(ReqError); !ok || re.StatusCode != http.StatusNotFound { - // If it wasn't a 404 error, print an error message. - f.logger.Debug("error querying GCE Metadata URL, skipping") - } - return false + return err } match, err := regexp.MatchString("projects/.+/machineTypes/.+", machineType) - if err != nil || !match { - return false + if err != nil { + return err } - return true + if !match { + return fmt.Errorf("GCE machine-type format invalid: %s", machineType) + } + + return nil } diff --git a/client/fingerprint/env_gce_test.go b/client/fingerprint/env_gce_test.go index b04169bd5c7..45045a496b9 100644 --- a/client/fingerprint/env_gce_test.go +++ b/client/fingerprint/env_gce_test.go @@ -8,14 +8,29 @@ import ( "fmt" "net/http" "net/http/httptest" - "strings" "testing" + "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" ) +func Test_NewEnvGCEFingerprint(t *testing.T) { + ci.Parallel(t) + + f := NewEnvGCEFingerprint(testlog.HCLogger(t)) + must.NotNil(t, f) + + retryWrapper, ok := f.(*RetryWrapper) + must.True(t, ok) + must.Eq(t, gceFingerprinterName, retryWrapper.name) + + _, ok = retryWrapper.fingerprinter.(*EnvGCEFingerprint) + must.True(t, ok) +} + func TestGCEFingerprint_nonGCE(t *testing.T) { t.Setenv("GCE_ENV_URL", "http://127.0.0.1/computeMetadata/v1/instance/") @@ -45,54 +60,10 @@ func testFingerprint_GCE(t *testing.T, withExternalIp bool) { Attributes: make(map[string]string), } - // configure mock server with fixture routes, data - routes := routes{} - if err := json.Unmarshal([]byte(GCE_routes), &routes); err != nil { - t.Fatalf("Failed to unmarshal JSON in GCE ENV test: %s", err) - } - networkEndpoint := &endpoint{ - Uri: "/computeMetadata/v1/instance/network-interfaces/?recursive=true", - ContentType: "application/json", - } - if withExternalIp { - networkEndpoint.Body = `[{"accessConfigs":[{"externalIp":"104.44.55.66","type":"ONE_TO_ONE_NAT"},{"externalIp":"104.44.55.67","type":"ONE_TO_ONE_NAT"}],"forwardedIps":[],"ip":"10.240.0.5","network":"projects/555555/networks/default"}]` - } else { - networkEndpoint.Body = `[{"accessConfigs":[],"forwardedIps":[],"ip":"10.240.0.5","network":"projects/555555/networks/default"}]` - } - routes.Endpoints = append(routes.Endpoints, networkEndpoint) + testMetadataServer := gceTestMetadataServer(t, withExternalIp) + defer testMetadataServer.Close() - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - value, ok := r.Header["Metadata-Flavor"] - if !ok { - t.Fatal("Metadata-Flavor not present in HTTP request header") - } - if value[0] != "Google" { - t.Fatalf("Expected Metadata-Flavor Google, saw %s", value[0]) - } - - uavalue, ok := r.Header["User-Agent"] - if !ok { - t.Fatal("User-Agent not present in HTTP request header") - } - if !strings.Contains(uavalue[0], "Nomad/") { - t.Fatalf("Expected User-Agent to contain Nomad/, got %s", uavalue[0]) - } - - found := false - for _, e := range routes.Endpoints { - if r.RequestURI == e.Uri { - w.Header().Set("Content-Type", e.ContentType) - fmt.Fprintln(w, e.Body) - } - found = true - } - - if !found { - w.WriteHeader(http.StatusNotFound) - } - })) - defer ts.Close() - t.Setenv("GCE_ENV_URL", ts.URL+"/computeMetadata/v1/instance/") + t.Setenv("GCE_ENV_URL", testMetadataServer.URL+"/computeMetadata/v1/instance/") f := NewEnvGCEFingerprint(testlog.HCLogger(t)) request := &FingerprintRequest{Config: &config.Config{}, Node: node} @@ -158,6 +129,84 @@ func testFingerprint_GCE(t *testing.T, withExternalIp bool) { assertNodeAttributeEquals(t, response.Attributes, "unique.platform.gce.attr.bar", "333") } +func TestEnvGCEFingerprint_gceProbe(t *testing.T) { + + testCases := []struct { + name string + gceEnv bool + }{ + { + name: "GCE Environment", + gceEnv: true, + }, + { + name: "Non-GCE Environment", + gceEnv: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + + if tc.gceEnv { + testMetadataServer := gceTestMetadataServer(t, false) + defer testMetadataServer.Close() + t.Setenv("GCE_ENV_URL", testMetadataServer.URL+"/computeMetadata/v1/instance/") + } + + f := NewEnvGCEFingerprint(testlog.HCLogger(t)) + err := f.(*RetryWrapper).fingerprinter.(*EnvGCEFingerprint).gceProbe() + + if tc.gceEnv { + must.NoError(t, err) + } else { + must.Error(t, err) + } + }) + } +} + +func gceTestMetadataServer(t *testing.T, externalIP bool) *httptest.Server { + + // configure mock server with fixture routes, data. + routes := routes{} + must.NoError(t, json.Unmarshal([]byte(GCE_routes), &routes)) + + networkEndpoint := &endpoint{ + Uri: "/computeMetadata/v1/instance/network-interfaces/?recursive=true", + ContentType: "application/json", + } + if externalIP { + networkEndpoint.Body = `[{"accessConfigs":[{"externalIp":"104.44.55.66","type":"ONE_TO_ONE_NAT"},{"externalIp":"104.44.55.67","type":"ONE_TO_ONE_NAT"}],"forwardedIps":[],"ip":"10.240.0.5","network":"projects/555555/networks/default"}]` + } else { + networkEndpoint.Body = `[{"accessConfigs":[],"forwardedIps":[],"ip":"10.240.0.5","network":"projects/555555/networks/default"}]` + } + routes.Endpoints = append(routes.Endpoints, networkEndpoint) + + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + value, ok := r.Header["Metadata-Flavor"] + must.True(t, ok) + must.Eq(t, "Google", value[0]) + + uavalue, ok := r.Header["User-Agent"] + must.True(t, ok) + must.StrContains(t, uavalue[0], "Nomad/") + + found := false + for _, e := range routes.Endpoints { + if r.RequestURI == e.Uri { + w.Header().Set("Content-Type", e.ContentType) + fmt.Fprintln(w, e.Body) + } + found = true + } + + if !found { + w.WriteHeader(http.StatusNotFound) + } + })) +} + const GCE_routes = ` { "endpoints": [ diff --git a/client/fingerprint/fingerprint_retry.go b/client/fingerprint/fingerprint_retry.go new file mode 100644 index 00000000000..2b5bbe96b68 --- /dev/null +++ b/client/fingerprint/fingerprint_retry.go @@ -0,0 +1,148 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package fingerprint + +import ( + "errors" + "fmt" + "time" + + "github.com/hashicorp/go-hclog" + + "github.com/hashicorp/nomad/client/config" +) + +// RetryWrapper is a fingerprinter wrapper that adds retry logic to an existing +// fingerprinter. This is currently supported for environment fingerprinters +// only and is controller via the client fingerprinter config. +type RetryWrapper struct { + + // fingerprinter is the underlying fingerprinter being wrapped with retry + // logic. + fingerprinter Fingerprint + + // name is the name of the fingerprinter being wrapped and is used to pull + // any configuration for it. + name string + + logger hclog.Logger + + // StaticFingerprinter is embedded to indicate that this fingerprinter does + // not support periodic execution. + StaticFingerprinter +} + +// NewRetryWrapper wraps the passed fingerprinter with retry logic. The returned +// fingerprinter will consult the client configuration for any retry settings. +// +// It staisifes the Fingerprinter interface and is a static fingerprinter, so +// does not support periodic execution. +func NewRetryWrapper(fingerprinter Fingerprint, logger hclog.Logger, name string) Fingerprint { + return &RetryWrapper{ + fingerprinter: fingerprinter, + logger: logger, + name: name, + } +} + +// Fingerprint executes the underlying fingerprinter with retry logic based +// on the client configuration and implements the Fingerprinter interface. +// +// If the fingerprinter fails after all retry attempts, the error from the last +// attempt is returned, unless the configuration indicates that failures should +// be skipped for this fingerprinter and the error is of the type that indicates +// an initial probe failure. +func (rw *RetryWrapper) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error { + + cfg := req.Config.Fingerprinters[rw.name] + + var ( + attempts int + err error + ) + + // Ensure we default to a 2 second retry interval if not configured. Doing + // this here means we do not have to do this each loop iteration at the + // cost of the config potentially being empty and the loop exiting after the + // first attempt. + retryInterval := 2 * time.Second + + if cfg != nil && cfg.RetryInterval > 0 { + retryInterval = cfg.RetryInterval + } + + for { + err = rw.fingerprinter.Fingerprint(req, resp) + if err == nil { + return nil + } + + // If the fingerprinter does not have a config, no retry behaviour is + // defined, so exit the loop. + if cfg == nil { + break + } + + var shouldRetry bool + + switch cfg.RetryAttempts { + case -1: + shouldRetry = true + case 0: + default: + if attempts < cfg.RetryAttempts { + shouldRetry = true + } + } + + if !shouldRetry { + break + } + + rw.logger.Warn("fingerprinting failed, retrying", + "current_attempts", attempts, + "retry_attempts", cfg.RetryAttempts, + "retry_interval", retryInterval, + "error", err, + ) + + attempts++ + time.Sleep(retryInterval) + } + + if shouldSkipEnvFingerprinter(cfg, err) { + rw.logger.Debug("error performing initial probe, skipping") + return nil + } + + rw.logger.Error("fingerprinting failed after all attempts", "error", err) + return err +} + +// errEnvProbeQueryFailed is used to indicate that the initial probe to +// determine if the environment fingerprinter is applicable has failed. +var errEnvProbeQueryFailed = errors.New("fingerprint initial probe failed") + +// wrapProbeError wraps the passed error with errEnvProbeQueryFailed to indicate +// that the initial probe has failed. +func wrapProbeError(err error) error { + return fmt.Errorf("%w: %w", errEnvProbeQueryFailed, err) +} + +// shouldSkipEnvFingerprinter determines if an environment fingerprinter should +// be skipped based on the passed configuration and error from the +// fingerprinter. Skipped indicates the client is not running in the environment +// the fingerprinter is designed for. +func shouldSkipEnvFingerprinter(cfg *config.Fingerprint, err error) bool { + + if err == nil { + return false + } + + if cfg != nil && cfg.ExitOnFailure != nil { + return !*cfg.ExitOnFailure + } + + return errors.Is(err, errEnvProbeQueryFailed) +} diff --git a/client/fingerprint/fingerprint_retry_test.go b/client/fingerprint/fingerprint_retry_test.go new file mode 100644 index 00000000000..fad3c29355d --- /dev/null +++ b/client/fingerprint/fingerprint_retry_test.go @@ -0,0 +1,330 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package fingerprint + +import ( + "errors" + "sync/atomic" + "testing" + "time" + + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/helper/pointer" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" +) + +// mockFingerprinter is a mock implementation of the Fingerprint interface that +// allows controlling behavior for testing the RetryWrapper. +type mockFingerprinter struct { + callCount atomic.Int32 + errorSequence []error + StaticFingerprinter +} + +func (m *mockFingerprinter) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error { + callNum := int(m.callCount.Add(1)) - 1 + + if callNum < len(m.errorSequence) { + if err := m.errorSequence[callNum]; err != nil { + return err + } + } + + return nil +} + +func (m *mockFingerprinter) getCallCount() int { + return int(m.callCount.Load()) +} + +func newMockFingerprinter(errorSequence []error) *mockFingerprinter { + return &mockFingerprinter{ + errorSequence: errorSequence, + } +} + +func TestRetryWrapper_Fingerprint(t *testing.T) { + ci.Parallel(t) + + genericError := errors.New("test error") + probeError := wrapProbeError(genericError) + + testCases := []struct { + name string + errorSequence []error + fpConfig *config.Fingerprint + expectedErr error + expectedCallCount int + }{ + { + name: "success on first attempt", + errorSequence: []error{nil}, + fpConfig: nil, + expectedErr: nil, + expectedCallCount: 1, + }, + { + name: "no config probe error", + errorSequence: []error{probeError}, + fpConfig: nil, + expectedErr: nil, + expectedCallCount: 1, + }, + { + name: "exit on failure probe error", + errorSequence: []error{probeError}, + fpConfig: &config.Fingerprint{ + Name: "test", + ExitOnFailure: pointer.Of(true), + }, + expectedErr: probeError, + expectedCallCount: 1, + }, + { + name: "no exit on failure probe error", + errorSequence: []error{probeError}, + fpConfig: &config.Fingerprint{ + Name: "test", + ExitOnFailure: pointer.Of(false), + }, + expectedErr: nil, + expectedCallCount: 1, + }, + { + name: "no config with error", + errorSequence: []error{ + genericError, + }, + fpConfig: nil, + expectedErr: genericError, + expectedCallCount: 1, + }, + { + name: "retry attempts 0 with error", + errorSequence: []error{ + genericError, + }, + fpConfig: &config.Fingerprint{ + Name: "test", + RetryAttempts: 0, + }, + expectedErr: genericError, + expectedCallCount: 1, + }, + { + name: "retry attempts 1 fails twice", + errorSequence: []error{ + genericError, + genericError, + }, + fpConfig: &config.Fingerprint{ + Name: "test", + RetryAttempts: 1, + RetryInterval: 10 * time.Millisecond, + }, + expectedErr: genericError, + expectedCallCount: 2, + }, + { + name: "retry attempts 1 succeeds on second try", + errorSequence: []error{ + genericError, + nil, + }, + fpConfig: &config.Fingerprint{ + Name: "test", + RetryAttempts: 1, + RetryInterval: 10 * time.Millisecond, + }, + expectedErr: nil, + expectedCallCount: 2, + }, + { + name: "retry attempts 3 succeeds on third try", + errorSequence: []error{ + genericError, + genericError, + nil, + }, + fpConfig: &config.Fingerprint{ + Name: "test", + RetryAttempts: 3, + RetryInterval: 10 * time.Millisecond, + }, + expectedErr: nil, + expectedCallCount: 3, + }, + { + name: "retry attempts 2 fails all attempts", + errorSequence: []error{ + genericError, + genericError, + genericError, + }, + fpConfig: &config.Fingerprint{ + Name: "test", + RetryAttempts: 2, + RetryInterval: 10 * time.Millisecond, + }, + expectedErr: genericError, + expectedCallCount: 3, + }, + { + name: "retry attempts -1 retries indefinitely until success", + errorSequence: []error{ + genericError, + genericError, + genericError, + genericError, + genericError, + genericError, + genericError, + nil, + }, + fpConfig: &config.Fingerprint{ + Name: "test", + RetryAttempts: -1, + RetryInterval: 10 * time.Millisecond, + }, + expectedErr: nil, + expectedCallCount: 8, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + + mock := newMockFingerprinter(tc.errorSequence) + + wrapper := NewRetryWrapper(mock, testlog.HCLogger(t), "test") + + cfg := &config.Config{ + Fingerprinters: map[string]*config.Fingerprint{ + "test": tc.fpConfig, + }, + } + + req := &FingerprintRequest{Config: cfg, Node: &structs.Node{}} + var resp FingerprintResponse + + startTime := time.Now() + err := wrapper.Fingerprint(req, &resp) + elapsed := time.Since(startTime) + + // Ensure the correct error response has been obtained. This is the + // final result of the finerprinter run. + if tc.expectedErr != nil { + must.Error(t, err) + must.Eq(t, tc.expectedErr, err) + } else { + must.NoError(t, err) + } + + // Check the retry logic was triggered the expected number of times. + must.Eq(t, tc.expectedCallCount, mock.getCallCount()) + + // Attempt to verify that the fingerprinter took at least the + // expected amount of time, accounting for retries. This gives us + // further confidence that the retry logic was executed correctly. + if tc.fpConfig != nil && tc.expectedCallCount > 1 { + expectedMinTime := tc.fpConfig.RetryInterval + if expectedMinTime == 0 { + expectedMinTime = 2 * time.Second + } + minExpectedDuration := time.Duration(tc.expectedCallCount-1) * expectedMinTime + must.GreaterEq(t, minExpectedDuration-100*time.Millisecond, elapsed) + } + }) + } +} + +func Test_wrapProbeError(t *testing.T) { + ci.Parallel(t) + + baseErr := errors.New("base error") + wrappedErr := wrapProbeError(baseErr) + + must.Error(t, wrappedErr) + must.ErrorIs(t, wrappedErr, errEnvProbeQueryFailed) + must.ErrorIs(t, wrappedErr, baseErr) + must.StrContains(t, wrappedErr.Error(), "fingerprint initial probe failed") + must.StrContains(t, wrappedErr.Error(), "base error") +} + +func Test_shouldSkipEnvFingerprinter(t *testing.T) { + ci.Parallel(t) + + testCases := []struct { + name string + inputCfg *config.Fingerprint + inputError error + expectedOutput bool + }{ + { + name: "nil config and nil error", + inputCfg: nil, + inputError: nil, + expectedOutput: false, + }, + { + name: "nil config and initial error", + inputCfg: nil, + inputError: wrapProbeError(errors.New("initial error")), + expectedOutput: true, + }, + { + name: "nil config and non-initial error", + inputCfg: nil, + inputError: errors.New("initial error"), + expectedOutput: false, + }, + { + name: "exit on failure not set and initial error", + inputCfg: &config.Fingerprint{}, + inputError: wrapProbeError(errors.New("initial error")), + expectedOutput: true, + }, + { + name: "exit on failure false and initial error", + inputCfg: &config.Fingerprint{ + ExitOnFailure: pointer.Of(false), + }, + inputError: wrapProbeError(errors.New("initial error")), + expectedOutput: true, + }, + { + name: "exit on failure true and initial error", + inputCfg: &config.Fingerprint{ + ExitOnFailure: pointer.Of(true), + }, + inputError: wrapProbeError(errors.New("initial error")), + expectedOutput: false, + }, + { + name: "exit on failure false and non-initial error", + inputCfg: &config.Fingerprint{ + ExitOnFailure: pointer.Of(false), + }, + inputError: errors.New("initial error"), + expectedOutput: true, + }, + { + name: "exit on failure true and non-initial error", + inputCfg: &config.Fingerprint{ + ExitOnFailure: pointer.Of(true), + }, + inputError: errors.New("initial error"), + expectedOutput: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + must.Eq(t, tc.expectedOutput, shouldSkipEnvFingerprinter(tc.inputCfg, tc.inputError)) + }) + } +} diff --git a/command/agent/agent.go b/command/agent/agent.go index 56dc7089e36..980e48f84f8 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -1080,6 +1080,16 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { conf.Users = clientconfig.UsersConfigFromAgent(agentConfig.Client.Users) + // Iterate the fingerprinter configs and populate the client mapping. The + // validation function returns a suitable error that can be returned without + // formatting. + for _, fingerprinterCfg := range agentConfig.Client.Fingerprinters { + if err := fingerprinterCfg.Validate(); err != nil { + return nil, err + } + conf.Fingerprinters[fingerprinterCfg.Name] = fingerprinterCfg + } + conf.LogFile = agentConfig.LogFile return conf, nil } diff --git a/command/agent/config.go b/command/agent/config.go index 6aeb1d9b388..3ea0c37f62f 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -434,15 +434,20 @@ type ClientConfig struct { // Users is used to configure parameters around operating system users. Users *config.UsersConfig `hcl:"users"` - // ExtraKeysHCL is used by hcl to surface unexpected keys - ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"` - // NodeMaxAllocs sets the maximum number of allocations per node // Defaults to 0 and ignored if unset. NodeMaxAllocs int `hcl:"node_max_allocs"` // LogFile is used by MonitorExport to stream a client's log file LogFile string `hcl:"log_file"` + + // Fingerprinters contains configuration for individual fingerprinters. The + // external configuration is a slice but later converted to a map for + // internal use. + Fingerprinters []*client.Fingerprint `hcl:"fingerprint"` + + // ExtraKeysHCL is used by hcl to surface unexpected keys + ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"` } func (c *ClientConfig) Copy() *ClientConfig { @@ -465,6 +470,7 @@ func (c *ClientConfig) Copy() *ClientConfig { nc.Artifact = c.Artifact.Copy() nc.Drain = c.Drain.Copy() nc.Users = c.Users.Copy() + nc.Fingerprinters = helper.CopySlice(c.Fingerprinters) nc.ExtraKeysHCL = slices.Clone(c.ExtraKeysHCL) return &nc } @@ -1756,6 +1762,7 @@ func DefaultConfig() *Config { Artifact: config.DefaultArtifactConfig(), Drain: nil, Users: config.DefaultUsersConfig(), + Fingerprinters: []*client.Fingerprint{}, }, Server: &ServerConfig{ Enabled: false, @@ -2158,6 +2165,37 @@ func mergeKEKProviderConfigs(left, right []*structs.KEKProviderConfig) []*struct return results } +func mergeClientFingerprinterConfigs(left, right []*client.Fingerprint) []*client.Fingerprint { + if len(left) == 0 { + return right + } + if len(right) == 0 { + return left + } + results := []*client.Fingerprint{} + doMerge := func(dstConfigs, srcConfigs []*client.Fingerprint) []*client.Fingerprint { + for _, src := range srcConfigs { + var found bool + for i, dst := range dstConfigs { + if dst.Name == src.Name { + dstConfigs[i] = dst.Merge(src) + found = true + break + } + } + if !found { + dstConfigs = append(dstConfigs, src) + } + } + return dstConfigs + } + + results = doMerge(results, left) + results = doMerge(results, right) + + return results +} + // Copy returns a deep copy safe for mutation. func (c *Config) Copy() *Config { if c == nil { @@ -2883,6 +2921,10 @@ func (c *ClientConfig) Merge(b *ClientConfig) *ClientConfig { result.IntroToken = b.IntroToken } + if b.Fingerprinters != nil { + result.Fingerprinters = mergeClientFingerprinterConfigs(c.Fingerprinters, b.Fingerprinters) + } + return &result } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index afd023c667f..6bc3fc9717e 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -244,6 +244,12 @@ func ParseConfigFile(path string) (*Config, error) { fmt.Sprintf("audit.sink.%d", i), &sink.RotateDuration, &sink.RotateDurationHCL, nil}) } + // Add fingerprint retry_interval for time.Duration parsing + for _, fp := range c.Client.Fingerprinters { + tds = append(tds, durationConversionMap{ + fmt.Sprintf("client.fingerprint.%s.retry_interval", fp.Name), &fp.RetryInterval, &fp.RetryIntervalHCL, nil}) + } + // convert strings to time.Durations err = convertDurations(tds) if err != nil { @@ -374,6 +380,16 @@ func extraKeys(c *Config) error { c.ExtraKeysHCL = slices.DeleteFunc(c.ExtraKeysHCL, func(s string) bool { return s == "vault" }) c.ExtraKeysHCL = slices.DeleteFunc(c.ExtraKeysHCL, func(s string) bool { return s == "consul" }) + // The fingerprinter labels will be added to the ExtraKeysHCL slice by + // hcl.Decode, so we need to remove them here. + // + // When parsing JSON, each block will also add "fingerprint" to the + // ExtraKeysHCL slice, so we need to remove that as well. + for _, p := range c.Client.Fingerprinters { + helper.RemoveEqualFold(&c.Client.ExtraKeysHCL, p.Name) + helper.RemoveEqualFold(&c.Client.ExtraKeysHCL, "fingerprint") + } + if len(c.ExtraKeysHCL) == 0 { c.ExtraKeysHCL = nil } diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 021cacaf200..fa174e28de3 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -11,6 +11,7 @@ import ( "time" "github.com/hashicorp/nomad/ci" + client "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" @@ -98,6 +99,15 @@ var basicConfig = &Config{ BridgeNetworkName: "custom_bridge_name", BridgeNetworkSubnet: "custom_bridge_subnet", BridgeNetworkSubnetIPv6: "custom_bridge_subnet_ipv6", + Fingerprinters: []*client.Fingerprint{ + { + Name: "env_aws", + RetryInterval: 1 * time.Second, + RetryIntervalHCL: "1s", + RetryAttempts: 3, + ExitOnFailure: pointer.Of(true), + }, + }, }, Server: &ServerConfig{ Enabled: true, @@ -513,6 +523,7 @@ func TestConfig_ParseMerge(t *testing.T) { // The Vault connection retry interval is an internal only configuration // option, and therefore needs to be added here to ensure the test passes. actual.Vaults[0].ConnectionRetryIntv = config.DefaultVaultConnectRetryIntv + must.Eq(t, basicConfig.Client, actual.Client) must.Eq(t, basicConfig, actual) oldDefault := &Config{ @@ -1212,3 +1223,66 @@ func TestConfig_Template(t *testing.T) { }) } } + +func TestConfig_Fingerprint(t *testing.T) { + ci.Parallel(t) + + for _, suffix := range []string{"hcl", "json"} { + t.Run(suffix, func(t *testing.T) { + cfg := DefaultConfig() + fc, err := LoadConfig("testdata/fingerprint." + suffix) + must.NoError(t, err) + cfg = cfg.Merge(fc) + + must.True(t, cfg.Client.Enabled) + must.Len(t, 4, cfg.Client.Fingerprinters) + + var awsConfig, azureConfig, gceConfig, doConfig *client.Fingerprint + + for _, fp := range cfg.Client.Fingerprinters { + switch fp.Name { + case "env_aws": + awsConfig = fp + case "env_azure": + azureConfig = fp + case "env_gce": + gceConfig = fp + case "env_digitalocean": + doConfig = fp + default: + t.Fatalf("unexpected fingerprint name: %s", fp.Name) + } + } + + must.NotNil(t, awsConfig) + must.Eq(t, "env_aws", awsConfig.Name) + must.Eq(t, 5*time.Minute, awsConfig.RetryInterval) + must.Eq(t, "5m", awsConfig.RetryIntervalHCL) + must.Eq(t, 3, awsConfig.RetryAttempts) + must.NotNil(t, awsConfig.ExitOnFailure) + must.True(t, *awsConfig.ExitOnFailure) + + must.NotNil(t, azureConfig) + must.Eq(t, "env_azure", azureConfig.Name) + must.Eq(t, 10*time.Minute, azureConfig.RetryInterval) + must.Eq(t, "10m", azureConfig.RetryIntervalHCL) + must.Eq(t, 5, azureConfig.RetryAttempts) + must.NotNil(t, azureConfig.ExitOnFailure) + must.False(t, *azureConfig.ExitOnFailure) + + must.NotNil(t, gceConfig) + must.Eq(t, "env_gce", gceConfig.Name) + must.Eq(t, 2*time.Minute, gceConfig.RetryInterval) + must.Eq(t, "2m", gceConfig.RetryIntervalHCL) + must.Eq(t, -1, gceConfig.RetryAttempts) + must.Nil(t, gceConfig.ExitOnFailure) + + must.NotNil(t, doConfig) + must.Eq(t, "env_digitalocean", doConfig.Name) + must.Eq(t, 1*time.Minute, doConfig.RetryInterval) + must.Eq(t, "1m", doConfig.RetryIntervalHCL) + must.Eq(t, 0, doConfig.RetryAttempts) + must.Nil(t, doConfig.ExitOnFailure) + }) + } +} diff --git a/command/agent/config_test.go b/command/agent/config_test.go index a8b19d1fdce..14dfa5d7ca5 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -2000,6 +2000,55 @@ func Test_mergeKEKProviderConfigs(t *testing.T) { }, result) } +func Test_mergeClientFingerprinterConfigs(t *testing.T) { + ci.Parallel(t) + + left := []*client.Fingerprint{ + { + Name: "env_aws", + RetryAttempts: 2, + }, + { + Name: "env_gce", + ExitOnFailure: pointer.Of(false), + }, + } + right := []*client.Fingerprint{ + { + Name: "env_aws", + RetryInterval: 10 * time.Second, + }, + { + Name: "env_gce", + RetryAttempts: 10, + RetryInterval: 10 * time.Second, + ExitOnFailure: pointer.Of(true), + }, + { + Name: "env_azure", + ExitOnFailure: pointer.Of(true), + }, + } + + must.Eq(t, []*client.Fingerprint{ + { + Name: "env_aws", + RetryAttempts: 2, + RetryInterval: 10 * time.Second, + }, + { + Name: "env_gce", + RetryAttempts: 10, + RetryInterval: 10 * time.Second, + ExitOnFailure: pointer.Of(true), + }, + { + Name: "env_azure", + ExitOnFailure: pointer.Of(true), + }, + }, mergeClientFingerprinterConfigs(left, right)) +} + func TestConfig_LoadClientNodeMaxAllocs(t *testing.T) { ci.Parallel(t) testCases := []struct { diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl index cd004612fb0..9c0ccbceaf4 100644 --- a/command/agent/testdata/basic.hcl +++ b/command/agent/testdata/basic.hcl @@ -107,6 +107,12 @@ client { bridge_network_name = "custom_bridge_name" bridge_network_subnet = "custom_bridge_subnet" bridge_network_subnet_ipv6 = "custom_bridge_subnet_ipv6" + + fingerprint "env_aws" { + retry_interval = "1s" + retry_attempts = 3 + exit_on_failure = true + } } server { diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json index ce7b44477e0..7cc5f24750e 100644 --- a/command/agent/testdata/basic.json +++ b/command/agent/testdata/basic.json @@ -143,6 +143,17 @@ "a.b.c:80", "127.0.0.1:1234" ], + "fingerprint": [ + { + "env_aws": [ + { + "retry_interval": "1s", + "retry_attempts": 3, + "exit_on_failure": true + } + ] + } + ], "state_dir": "/tmp/client-state", "stats": [ { diff --git a/command/agent/testdata/fingerprint.hcl b/command/agent/testdata/fingerprint.hcl new file mode 100644 index 00000000000..369fba76943 --- /dev/null +++ b/command/agent/testdata/fingerprint.hcl @@ -0,0 +1,27 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +client { + enabled = true + + fingerprint "env_aws" { + retry_interval = "5m" + retry_attempts = 3 + exit_on_failure = true + } + + fingerprint "env_azure" { + retry_interval = "10m" + retry_attempts = 5 + exit_on_failure = false + } + + fingerprint "env_gce" { + retry_interval = "2m" + retry_attempts = -1 + } + + fingerprint "env_digitalocean" { + retry_interval = "1m" + } +} diff --git a/command/agent/testdata/fingerprint.json b/command/agent/testdata/fingerprint.json new file mode 100644 index 00000000000..d1aa75b3d6e --- /dev/null +++ b/command/agent/testdata/fingerprint.json @@ -0,0 +1,40 @@ +{ + "client": { + "enabled": true, + "fingerprint": [ + { + "env_aws": [ + { + "retry_interval": "5m", + "retry_attempts": 3, + "exit_on_failure": true + } + ] + }, + { + "env_azure": [ + { + "retry_interval": "10m", + "retry_attempts": 5, + "exit_on_failure": false + } + ] + }, + { + "env_gce": [ + { + "retry_interval": "2m", + "retry_attempts": -1 + } + ] + }, + { + "env_digitalocean": [ + { + "retry_interval": "1m" + } + ] + } + ] + } +}