Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option for minimal reboot period #904

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 50 additions & 12 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,12 @@ var (
nodeID string
concurrency int

rebootDays []string
rebootStart string
rebootEnd string
timezone string
annotateNodes bool
rebootDays []string
rebootStart string
rebootEnd string
timezone string
minRebootPeriod time.Duration
annotateNodes bool

// Metrics
rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Expand All @@ -105,6 +106,8 @@ const (
KuredRebootInProgressAnnotation string = "weave.works/kured-reboot-in-progress"
// KuredMostRecentRebootNeededAnnotation is the canonical string value for the kured most-recent-reboot-needed annotation
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed"
// KuredLastSuccessfulRebootAnnotation is the canonical string value for the kured last-successful-reboot annotation
KuredLastSuccessfulRebootAnnotation string = "weave.works/kured-last-successful-reboot"
// EnvPrefix The environment variable prefix of all environment variables bound to our command line flags.
EnvPrefix = "KURED"

Expand Down Expand Up @@ -135,7 +138,8 @@ func NewRootCommand() *cobra.Command {
Short: "Kubernetes Reboot Daemon",
PersistentPreRunE: bindViper,
PreRun: flagCheck,
Run: root}
Run: root,
}

rootCmd.PersistentFlags().StringVar(&nodeID, "node-id", "",
"node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable")
Expand Down Expand Up @@ -218,6 +222,8 @@ func NewRootCommand() *cobra.Command {
"schedule reboot only before this time of day")
rootCmd.PersistentFlags().StringVar(&timezone, "time-zone", "UTC",
"use this timezone for schedule inputs")
rootCmd.PersistentFlags().DurationVar(&minRebootPeriod, "min-reboot-period", 0,
"the minimal duration between reboots of a node. Requires --annotate-nodes")

rootCmd.PersistentFlags().BoolVar(&annotateNodes, "annotate-nodes", false,
"if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots")
Expand Down Expand Up @@ -265,6 +271,9 @@ func flagCheck(cmd *cobra.Command, args []string) {
if !reflect.DeepEqual(preRebootNodeLabelKeys, postRebootNodeLabelKeys) {
log.Warnf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match. This may result in unexpected behaviour.")
}
if !annotateNodes && minRebootPeriod != 0 {
log.Warn("Cannot use --min-reboot-period without --annotate-nodes. This will ignore the min-reboot-period value")
}
}

// stripQuotes removes any literal single or double quote chars that surround a string
Expand Down Expand Up @@ -317,7 +326,6 @@ func flagToEnvVar(flag string) string {
// buildHostCommand writes a new command to run in the host namespace
// Rancher based need different pid
func buildHostCommand(pid int, command []string) []string {

// From the container, we nsenter into the proper PID to run the hostCommand.
// For this, kured daemonset need to be configured with hostPID:true and privileged:true
cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"}
Expand Down Expand Up @@ -400,7 +408,8 @@ func (kb KubernetesBlockingChecker) isBlocked() bool {
podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
LabelSelector: labelSelector,
FieldSelector: fieldSelector,
Limit: 10})
Limit: 10,
})
if err != nil {
log.Warnf("Reboot blocked: pod query error: %v", err)
return true
Expand Down Expand Up @@ -694,6 +703,11 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str
continue
}
}
if minRebootPeriod != 0 {
if err := addNodeAnnotations(client, nodeID, map[string]string{KuredLastSuccessfulRebootAnnotation: time.Now().Format(time.RFC3339)}); err != nil {
continue
}
}
}
throttle(releaseDelay)
release(lock, concurrency > 1)
Expand Down Expand Up @@ -725,16 +739,22 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str
continue
}

node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %v", err)
}

if lastSuccessfulRebootWithinMinRebootPeriod(node) {
log.Infof("Last successful reboot within minimal reboot period")
Copy link
Collaborator

@jackfrancis jackfrancis Mar 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be preferable to log the next time that a reboot will be allowed. So maybe we convert lastSuccessfulRebootWithinMinRebootPeriod() into a "getNextAllowedRebootTime()" func, and then do something like

nextRebootTime := getNextAllowedRebootTime(node)
if time.Now().Before(nextRebootTime) {
    log.Infof("Not allowed to reboot until %s", nextRebootTime)
}

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

example confuses me. Why add minRebootPeriod to the value returned by getNextAllowedRebootTime?

Either way returning the next allowed reboot time (without adding the minRebootPeriod) or returning the last successful reboot time (and adding minRebootPeriod) sound good to me.

But what should we do if we don't know the last reboot time? Right now we just return "you should reboot" when there is no last successful reboot time. Would the getNextAllowedRebootTime just return &time.Time{} if we don't know about the last successful reboot?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, copy/paste error in example (updated!).

In the instance where we aren't able to determine the last reboot time we can probably return time.Now(), and then update the statement to

if time.Now().Compare(nextRebootTime) < 0 {
    log.Infof("Not allowed to reboot until %s", nextRebootTime)
}

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you ok with returning and error explicitly? Like

func nextRebootTime(node) (*time.Time, error)

So we don't have to return time.Now() when we don't know.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for example we could do

if minRebootPeriod > 0 {
	if t, err := nextAllowedReboot(node); err != nil {
		log.Warnf("Failed to determine next allowed reboot time: %s", err.Error())
	} else if diff := t.Sub(time.Now()); diff > 0 {
		log.Infof("Reboot not allowed until %s (%s)", t.String(), diff.String())
	}
}

if we don't care about logging the duration (in how much time can we reboot), we should use time.Now().Before(t) instead, I guess. I am really bad with subtracting times in my head.

I see that it would work without returning an error. It would even take less code, but the users would never see the warning about a missing annotation or anything.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm happy with an error, but in the case of the annotation not being found, that's not actually an error, as this is a new feature and plenty of users won't configure kured to use it. So silently ignoring that and returning time.Now() seems reasonable.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We would only check the annotation if minRebootPeriod is configured to be larger than 0.

Right, if you suddenly turn on the feature you would get one "wrong" error log entry before the first reboot. After the first reboot with the --minRebootPeriod flag, a missing annotation would actually be a real error.

If we don't care so much about logging that error, I will totally agree to returning time.Now() if something unexpected happens.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, if you suddenly turn on the feature you would get one "wrong" error log entry before the first reboot.

Could you clarify why this is so?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You run cured without the --min-reboot-period flag. Kured will not annotate the node with the weave.works/kured-last-successful-reboot flag. Now you want to use the "new" feature, edit your manifest and roll out the new kured daemon set. Normally none of the pods will hold the lock from

if holding(lock, &nodeMeta, concurrency > 1) {
. That means they won't add the weave.works/kured-last-successful-reboot annotation to its nodes. We wouldn't want that anyways because then restarting/rollilng out kured daemon set would falsify the last successful reboot time. Since the feature is enabled now, kured would check for the weave.works/kured-last-successful-reboot annotation, but fail since it is not there yet. So if we log an error when the annotation is missing, we would see one error log for each tick until the first reboot. That the annotation is missing is expected before the first reboot. However, after the first reboot the missing annotation would be an unexpected error.

Only after the first reboot with the --min-reboot-period flag, this feature would actually work.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure what is right here. Should we never log an error on missing annotations to avoid confusion before the first reboot to the cost of silencing real errors after the first reboot? Or the other way around? (I added a commit that would log the errors, happy to change it though)

continue
}

if !rebootRequired(sentinelCommand) {
log.Infof("Reboot not required")
preferNoScheduleTaint.Disable()
continue
}

node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %v", err)
}
nodeMeta.Unschedulable = node.Spec.Unschedulable

var timeNowString string
Expand Down Expand Up @@ -804,6 +824,24 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str
}
}

func lastSuccessfulRebootWithinMinRebootPeriod(node *v1.Node) bool {
if minRebootPeriod == 0 {
return false
}
if !annotateNodes {
return false
}
if v, ok := node.Annotations[KuredLastSuccessfulRebootAnnotation]; ok {
t, err := time.Parse(time.RFC3339, v)
if err != nil {
log.Warnf("failed to parse time %q in annotation %q: %s", v, KuredLastSuccessfulRebootAnnotation, err.Error())
return false
}
return time.Now().Before(t.Add(minRebootPeriod))
}
return false
}

// buildSentinelCommand creates the shell command line which will need wrapping to escape
// the container boundaries
func buildSentinelCommand(rebootSentinelFile string, rebootSentinelCommand string) []string {
Expand Down
1 change: 1 addition & 0 deletions kured-ds-signal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,4 @@ spec:
# - --annotate-nodes=false
# - --lock-release-delay=30m
# - --log-format=text
# - --min-reboot-period=336h
1 change: 1 addition & 0 deletions kured-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,4 @@ spec:
# - --metrics-host=""
# - --metrics-port=8080
# - --concurrency=1
# - --min-reboot-period=336h
Loading