Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v1.13 without the deny precedence patch + ipsec #32459

Draft
wants to merge 13 commits into
base: v1.13
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,8 @@ endif
check-sources:
@$(ECHO_CHECK) pkg/datapath/loader/check-sources.sh
$(QUIET) pkg/datapath/loader/check-sources.sh
@$(ECHO_CHECK) contrib/scripts/check-xfrmstate.sh
$(QUIET) contrib/scripts/check-xfrmstate.sh

pprof-heap: ## Get Go pprof heap profile.
$(QUIET)$(GO) tool pprof http://localhost:6060/debug/pprof/heap
Expand Down
56 changes: 56 additions & 0 deletions contrib/scripts/check-xfrmstate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# Copyright Authors of Cilium

set -eu

GOIMPORTS=("golang.org/x/tools/cmd/goimports" "-w")
MATCHES=(
"netlink.XfrmStateAdd"
"netlink.XfrmStateUpdate"
"netlink.XfrmStateDel"
"netlink.XfrmStateFlush"
)

find_match() {
local target="./pkg/datapath"

MATCHES_ORED=$(printf "|%s" "${MATCHES[@]}")
MATCHES_ORED=${MATCHES_ORED:1}

grep -lr --include \*.go \
--exclude \*_test.go \
--exclude xfrm_state_cache.go \
--exclude probe_linux.go \
-E "$MATCHES_ORED" \
"$target"
if [ $? -eq 0 ] ; then
return 0
fi

# Let's check now that we can detect it correctly
NO_MATCHES=$(grep -or --include xfrm_state_cache.go \
-E "$MATCHES_ORED" \
"$target" | wc -l)

if [ "$NO_MATCHES" -eq "4" ] ; then
return 1
fi
# Incorrect number of matches
echo "Found incorrect number of matches: $NO_MATCHES"
return 0
}

check() {
if find_match ; then
# shellcheck disable=SC2145
>&2 echo "Found match for '${MATCHES[@]}'. Please use xfrmStateCache instead.";
exit 1
fi
}

main() {
check
}

main "$@"
4 changes: 4 additions & 0 deletions daemon/cmd/daemon_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,10 @@ func initializeFlags() {
flags.Bool(option.EnableIPsecKeyWatcher, defaults.EnableIPsecKeyWatcher, "Enable watcher for IPsec key. If disabled, a restart of the agent will be necessary on key rotations.")
option.BindEnv(Vp, option.EnableIPsecKeyWatcher)

flags.Bool(option.EnableIPSecXfrmStateCaching, defaults.EnableIPSecXfrmStateCaching, "Enable XfrmState cache for IPSec. Significantly reduces CPU usage in large clusters.")
flags.MarkHidden(option.EnableIPSecXfrmStateCaching)
option.BindEnv(Vp, option.EnableIPSecXfrmStateCaching)

flags.Bool(option.EnableWireguard, false, "Enable wireguard")
option.BindEnv(Vp, option.EnableWireguard)

Expand Down
111 changes: 91 additions & 20 deletions pkg/datapath/linux/ipsec/ipsec_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ type ipSecKey struct {
Aead *netlink.XfrmStateAlgo
}

type oldXfrmStateKey struct {
Spi int
Dst [16]byte
}

var (
ipSecLock lock.RWMutex

Expand Down Expand Up @@ -133,6 +138,19 @@ var (
// we've added the catch-all default-drop policy.
removeStaleIPv4XFRMOnce sync.Once
removeStaleIPv6XFRMOnce sync.Once

oldXFRMInMark *netlink.XfrmMark = &netlink.XfrmMark{
Value: linux_defaults.RouteMarkDecrypt,
Mask: linux_defaults.IPsecMarkBitMask,
}
// xfrmStateCache is a cache of XFRM states to avoid querying each time.
// This is especially important for backgroundSync that is used to validate
// if the XFRM state is correct, without usually modyfing anything.
// The cache is invalidated whenever a new XFRM state is added/updated/removed,
// but also in case of TTL expiration.
// It provides XfrmStateAdd/Update/Del wrappers that ensure cache
// is correctly invalidate.
xfrmStateCache = NewXfrmStateListCache(time.Minute)
)

func getGlobalIPsecKey(ip net.IP) *ipSecKey {
Expand Down Expand Up @@ -273,7 +291,7 @@ func ipSecAttachPolicyTempl(policy *netlink.XfrmPolicy, keys *ipSecKey, srcIP, d
// already exist. If it doesn't but some other XFRM state conflicts, then
// we attempt to remove the conflicting state before trying to add again.
func xfrmStateReplace(new *netlink.XfrmState, remoteRebooted bool) error {
states, err := netlink.XfrmStateList(netlink.FAMILY_ALL)
states, err := xfrmStateCache.XfrmStateList()
if err != nil {
return fmt.Errorf("Cannot get XFRM state: %s", err)
}
Expand All @@ -295,13 +313,13 @@ func xfrmStateReplace(new *netlink.XfrmState, remoteRebooted bool) error {
// encryption key changed. This is expected on upgrade because
// we changed the way we compute the per-node-pair key.
scopedLog.Info("Removing XFRM state with old IPsec key")
netlink.XfrmStateDel(&s)
xfrmStateCache.XfrmStateDel(&s)
break
}
if !xfrmMarkEqual(s.OutputMark, new.OutputMark) {
// If only the output-marks differ, then we should be able
// to simply update the XFRM state atomically.
return netlink.XfrmStateUpdate(new)
return xfrmStateCache.XfrmStateUpdate(new)
}
if remoteRebooted && new.ESN {
// This should happen only when a node reboots when the boot ID
Expand All @@ -315,7 +333,7 @@ func xfrmStateReplace(new *netlink.XfrmState, remoteRebooted bool) error {
// packets if the state is missing. At most we will drop a
// few encrypted packets while updating.
scopedLog.Info("Non-atomically updating IPsec XFRM state due to remote boot ID change")
netlink.XfrmStateDel(&s)
xfrmStateCache.XfrmStateDel(&s)
break
}
return nil
Expand All @@ -326,10 +344,6 @@ func xfrmStateReplace(new *netlink.XfrmState, remoteRebooted bool) error {
Value: ipSecXfrmMarkSetSPI(linux_defaults.RouteMarkEncrypt, uint8(new.Spi)),
Mask: linux_defaults.IPsecOldMarkMaskOut,
}
oldXFRMInMark := &netlink.XfrmMark{
Value: linux_defaults.RouteMarkDecrypt,
Mask: linux_defaults.IPsecMarkBitMask,
}
for _, s := range states {
// This is either the XFRM OUT state or the XFRM IN state from a
// previous Cilium version. Because their marks match the new mark
Expand Down Expand Up @@ -358,13 +372,13 @@ func xfrmStateReplace(new *netlink.XfrmState, remoteRebooted bool) error {
continue
}

err := netlink.XfrmStateDel(&s)
err := xfrmStateCache.XfrmStateDel(&s)
if err != nil {
scopedLog.WithError(err).Errorf("Failed to remove old XFRM %s state", dir)
} else {
scopedLog.Infof("Temporarily removed old XFRM %s state", dir)
defer func(oldXFRMState netlink.XfrmState, dir string) {
if err := netlink.XfrmStateAdd(&oldXFRMState); err != nil {
if err := xfrmStateCache.XfrmStateAdd(&oldXFRMState); err != nil {
scopedLog.WithError(err).Errorf("Failed to re-add old XFRM %s state", dir)
}
}(s, dir)
Expand All @@ -373,7 +387,7 @@ func xfrmStateReplace(new *netlink.XfrmState, remoteRebooted bool) error {
}

// It doesn't exist so let's attempt to add it.
firstAttemptErr := netlink.XfrmStateAdd(new)
firstAttemptErr := xfrmStateCache.XfrmStateAdd(new)
if !os.IsExist(firstAttemptErr) {
return firstAttemptErr
}
Expand All @@ -391,7 +405,7 @@ func xfrmStateReplace(new *netlink.XfrmState, remoteRebooted bool) error {
if !deletedSomething {
return firstAttemptErr
}
return netlink.XfrmStateAdd(new)
return xfrmStateCache.XfrmStateAdd(new)
}

// Attempt to remove any XFRM state that conflicts with the state we just tried
Expand All @@ -404,7 +418,7 @@ func xfrmDeleteConflictingState(states []netlink.XfrmState, new *netlink.XfrmSta
if new.Spi == s.Spi && (new.Mark == nil) == (s.Mark == nil) &&
(new.Mark == nil || new.Mark.Value&new.Mark.Mask&s.Mark.Mask == s.Mark.Value) &&
xfrmIPEqual(new.Src, s.Src) && xfrmIPEqual(new.Dst, s.Dst) {
err := netlink.XfrmStateDel(&s)
err := xfrmStateCache.XfrmStateDel(&s)
if err == nil {
deletedSomething = true
log.WithFields(logrus.Fields{
Expand Down Expand Up @@ -698,18 +712,75 @@ func ipsecDeleteXfrmState(nodeID uint16) {
logfields.NodeID: nodeID,
})

xfrmStateList, err := netlink.XfrmStateList(netlink.FAMILY_ALL)
xfrmStateList, err := xfrmStateCache.XfrmStateList()
if err != nil {
scopedLog.WithError(err).Warning("Failed to list XFRM states for deletion")
return
}
xfrmStatesToDelete := []netlink.XfrmState{}
oldXfrmInStates := map[oldXfrmStateKey]netlink.XfrmState{}
for _, s := range xfrmStateList {
if matchesOnNodeID(s.Mark) && ipsec.GetNodeIDFromXfrmMark(s.Mark) == nodeID {
if err := netlink.XfrmStateDel(&s); err != nil {
scopedLog.WithError(err).Warning("Failed to delete XFRM state")
xfrmStatesToDelete = append(xfrmStatesToDelete, s)
}
if xfrmMarkEqual(s.Mark, oldXFRMInMark) {
key := oldXfrmStateKey{
Spi: s.Spi,
Dst: [16]byte(s.Dst.To16()),
}
oldXfrmInStates[key] = s
}
}
for _, s := range xfrmStatesToDelete {
key := oldXfrmStateKey{
Spi: s.Spi,
Dst: [16]byte(s.Dst.To16()),
}
var oldXfrmInState *netlink.XfrmState = nil
old, ok := oldXfrmInStates[key]
if ok {
oldXfrmInState = &old
}
if err := safeDeleteXfrmState(&s, oldXfrmInState); err != nil {
scopedLog.WithError(err).Warning("Failed to delete XFRM state")
}
}
}

// safeDeleteXfrmState deletes the given XFRM state. Specifically, if the
// state is to catch ingress traffic marked with nodeID (0xXXXX0d00), we
// temporarily remove the old XFRM state that matches 0xd00/0xf00. This is to
// workaround a kernel issue that prevents us from deleting a specific XFRM
// state (e.g. catching 0xXXXX0d00/0xffff0f00) when there is also a general
// xfrm state (e.g. catching 0xd00/0xf00). When both XFRM states coexist,
// kernel deletes the general XFRM state instead of the specific one, even if
// the deleting request is for the specific one.
func safeDeleteXfrmState(state *netlink.XfrmState, oldState *netlink.XfrmState) (err error) {

if getDirFromXfrmMark(state.Mark) == dirIngress && ipsec.GetNodeIDFromXfrmMark(state.Mark) != 0 && oldState != nil {

scopedLog := log.WithFields(logrus.Fields{
logfields.SPI: state.Spi,
logfields.SourceIP: state.Src,
logfields.DestinationIP: state.Dst,
logfields.TrafficDirection: getDirFromXfrmMark(state.Mark),
logfields.NodeID: getNodeIDAsHexFromXfrmMark(state.Mark),
})

err := xfrmStateCache.XfrmStateDel(oldState)
if err != nil {
scopedLog.WithError(err).Errorf("Failed to remove old XFRM %s state", string(dirIngress))
} else {
scopedLog.Infof("Temporarily removed old XFRM %s state", string(dirIngress))
defer func(oldXFRMState *netlink.XfrmState, dir string) {
if err := xfrmStateCache.XfrmStateAdd(oldXFRMState); err != nil {
scopedLog.WithError(err).Errorf("Failed to re-add old XFRM %s state", dir)
}
}(oldState, string(dirIngress))
}
}

return xfrmStateCache.XfrmStateDel(state)
}

func ipsecDeleteXfrmPolicy(nodeID uint16) {
Expand Down Expand Up @@ -878,11 +949,11 @@ func DeleteXfrm() {
}
}
}
xfrmStateList, err := netlink.XfrmStateList(netlink.FAMILY_ALL)
xfrmStateList, err := xfrmStateCache.XfrmStateList()
if err == nil {
for _, s := range xfrmStateList {
if isXfrmStateCilium(s) {
if err := netlink.XfrmStateDel(&s); err != nil {
if err := xfrmStateCache.XfrmStateDel(&s); err != nil {
log.WithError(err).Warning("deleting old xfrm state failed")
}
}
Expand Down Expand Up @@ -1189,7 +1260,7 @@ func ipSecSPICanBeReclaimed(spi uint8, reclaimTimestamp time.Time) bool {
func deleteStaleXfrmStates(reclaimTimestamp time.Time) {
scopedLog := log.WithField(logfields.SPI, ipSecCurrentKeySPI)

xfrmStateList, err := netlink.XfrmStateList(netlink.FAMILY_ALL)
xfrmStateList, err := xfrmStateCache.XfrmStateList()
if err != nil {
scopedLog.WithError(err).Warning("Failed to list XFRM states")
return
Expand All @@ -1205,7 +1276,7 @@ func deleteStaleXfrmStates(reclaimTimestamp time.Time) {
scopedLog = log.WithField(logfields.OldSPI, stateSPI)

scopedLog.Info("Deleting stale XFRM state")
if err := netlink.XfrmStateDel(&s); err != nil {
if err := xfrmStateCache.XfrmStateDel(&s); err != nil {
scopedLog.WithError(err).Warning("Deleting stale XFRM state failed")
}
}
Expand Down
80 changes: 80 additions & 0 deletions pkg/datapath/linux/ipsec/xfrm_state_cache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package ipsec

import (
"time"

"github.com/vishvananda/netlink"
"k8s.io/utils/clock"

"github.com/cilium/cilium/pkg/lock"
"github.com/cilium/cilium/pkg/option"
)

type xfrmStateListCache struct {
stateList []netlink.XfrmState
timeout time.Time
mutex lock.Mutex
ttl time.Duration
clock clock.PassiveClock
}

func NewXfrmStateListCache(ttl time.Duration) *xfrmStateListCache {
return &xfrmStateListCache{
ttl: ttl,
clock: clock.RealClock{},
}
}

func (c *xfrmStateListCache) XfrmStateList() ([]netlink.XfrmState, error) {
c.mutex.Lock()
defer c.mutex.Unlock()
if c.isExpired() {
result, err := netlink.XfrmStateList(netlink.FAMILY_ALL)
if err != nil {
return nil, err
}
c.stateList = result
c.timeout = c.clock.Now().Add(c.ttl)
}
return c.stateList, nil
}

func (c *xfrmStateListCache) XfrmStateDel(state *netlink.XfrmState) error {
c.invalidate()
return netlink.XfrmStateDel(state)
}

func (c *xfrmStateListCache) XfrmStateUpdate(state *netlink.XfrmState) error {
c.invalidate()
return netlink.XfrmStateUpdate(state)
}

func (c *xfrmStateListCache) XfrmStateAdd(state *netlink.XfrmState) error {
c.invalidate()
return netlink.XfrmStateAdd(state)
}

func (c *xfrmStateListCache) XfrmStateFlush(proto netlink.Proto) error {
c.invalidate()
return netlink.XfrmStateFlush(proto)
}

func (c *xfrmStateListCache) isExpired() bool {
return !option.Config.EnableIPSecXfrmStateCaching || c.stateList == nil || c.timeout.Before(c.clock.Now())
}

func (c *xfrmStateListCache) invalidate() {
c.mutex.Lock()
defer c.mutex.Unlock()
c.stateList = nil
}

func newTestableXfrmStateListCache(ttl time.Duration, clock clock.PassiveClock) *xfrmStateListCache {
return &xfrmStateListCache{
ttl: ttl,
clock: clock,
}
}
Loading
Loading