-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
OCPBUGS-18534: monitor test fix to wait before connecting to a non-existent dns on PowerVS and IBMCloud platforms #28739
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ import ( | |
"net/http" | ||
"os" | ||
"strconv" | ||
"strings" | ||
"time" | ||
|
||
"github.com/openshift/origin/pkg/monitortestframework" | ||
|
@@ -203,6 +204,15 @@ func (w *availability) StartCollection(ctx context.Context, adminRESTConfig *res | |
return fmt.Errorf("error creating PDB: %w", err) | ||
} | ||
|
||
// On certain platforms hitting the hostname before it is ready leads to a blackhole, this code checks | ||
// the host from the cluster's context | ||
if infra.Spec.PlatformSpec.Type == configv1.PowerVSPlatformType || infra.Spec.PlatformSpec.Type == configv1.IBMCloudPlatformType { | ||
nodeTgt := "node/" + nodeList.Items[0].ObjectMeta.Name | ||
if err := checkHostnameReady(ctx, tcpService, nodeTgt); err != nil { | ||
return err | ||
} | ||
} | ||
|
||
// Hit it once before considering ourselves ready | ||
fmt.Fprintf(os.Stderr, "hitting pods through the service's LoadBalancer\n") | ||
timeout := 10 * time.Minute | ||
|
@@ -344,3 +354,29 @@ func httpGetNoConnectionPoolTimeout(url string, timeout time.Duration) (*http.Re | |
|
||
return client.Get(url) | ||
} | ||
|
||
// Uses the first node in the cluster to verify the LoadBalancer host is active before returning | ||
func checkHostnameReady(ctx context.Context, tcpService *corev1.Service, nodeTgt string) error { | ||
oc := exutil.NewCLIForMonitorTest(tcpService.GetObjectMeta().GetNamespace()) | ||
|
||
var ( | ||
stdOut string | ||
err error | ||
) | ||
|
||
wait.PollUntilContextTimeout(ctx, 15*time.Second, 60*time.Minute, true, func(ctx context.Context) (bool, error) { | ||
logrus.Debug("Checking load balancer host is active \n") | ||
stdOut, _, err = oc.AsAdmin().WithoutNamespace().RunInMonitorTest("debug").Args(nodeTgt, "--", "dig", "+short", "+notcp", tcpService.Status.LoadBalancer.Ingress[0].Hostname).Outputs() | ||
if err != nil { | ||
return false, nil | ||
} | ||
output := strings.TrimSpace(stdOut) | ||
if output == "" { | ||
logrus.Debug("Waiting for the load balancer to become active") | ||
return false, nil | ||
} | ||
logrus.Debug("Load balancer active") | ||
return true, nil | ||
}) | ||
return err | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not trying to be difficult. Looks like Poll is deprecated. Also I think if you pass in the context from the StartCollection call and use it in PollUntilContextTimeout you won't block if that ctx gets canceled and you can set an explicit max time for your checkHostnameReady function to block for. I haven't tried it but the patch is attached if you want to.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure - we can look at it |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -179,6 +179,32 @@ func NewCLIWithoutNamespace(project string) *CLI { | |
return cli | ||
} | ||
|
||
// NewCLIForMonitorTest initializes the CLI and Kube framework helpers | ||
// without a namespace. Should be called outside of a Ginkgo .It() | ||
// function. | ||
func NewCLIForMonitorTest(project string) *CLI { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So you needed new non ginko versions of this and Run to support running in the monitor test environment? Thanks for setting those up. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct |
||
cli := &CLI{ | ||
kubeFramework: &framework.Framework{ | ||
SkipNamespaceCreation: true, | ||
BaseName: project, | ||
Options: framework.Options{ | ||
ClientQPS: 20, | ||
ClientBurst: 50, | ||
}, | ||
Timeouts: framework.NewTimeoutContext(), | ||
}, | ||
username: "admin", | ||
execPath: "oc", | ||
adminConfigPath: KubeConfigPath(), | ||
staticConfigManifestDir: StaticConfigManifestDir(), | ||
withoutNamespace: true, | ||
} | ||
|
||
// Called only once (assumed the objects will never get modified) | ||
cli.setupStaticConfigsFromManifests() | ||
return cli | ||
} | ||
|
||
// NewHypershiftManagementCLI returns a CLI that interacts with the Hypershift management cluster. | ||
// Contrary to a normal CLI it does not perform any cleanup, and it must not be used for any mutating | ||
// operations. Also, contrary to a normal CLI it must be constructed inside an `It` block. This is | ||
|
@@ -813,6 +839,22 @@ func (c *CLI) Run(commands ...string) *CLI { | |
return nc.setOutput(c.stdout) | ||
} | ||
|
||
// Executes with the kubeconfig specified from the environment | ||
func (c *CLI) RunInMonitorTest(commands ...string) *CLI { | ||
in, out, errout := &bytes.Buffer{}, &bytes.Buffer{}, &bytes.Buffer{} | ||
nc := &CLI{ | ||
execPath: c.execPath, | ||
verb: commands[0], | ||
kubeFramework: c.KubeFramework(), | ||
adminConfigPath: c.adminConfigPath, | ||
configPath: c.configPath, | ||
username: c.username, | ||
globalArgs: commands, | ||
} | ||
nc.stdin, nc.stdout, nc.stderr = in, out, errout | ||
return nc.setOutput(c.stdout) | ||
} | ||
|
||
// InputString adds expected input to the command | ||
func (c *CLI) InputString(input string) *CLI { | ||
c.stdin.WriteString(input) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just double checking, this is a conscious choice not to capture the err returned from PollUntilContextTime ( could be context closed or timeout, etc.) and only return the last err, if any, returned from oc.AsAdmin call?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes it was intentional to not capture the err
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have to admit I'm curious. So this allows for a path where PollUntilContextTimeout returns an error but checkHostNameReady does not and that will allow StartCollection to proceed and presumably error out down the line?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes it will error out here right after the checkHostNameReady function call since the service LoadBalancer is not verified
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ahh, thanks for the explanation!