-
Notifications
You must be signed in to change notification settings - Fork 874
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: remove acl-check and cancel instead when REPLCONF ACK fails to validate #2920
Changes from all commits
4ba6329
1338315
e4a001c
4aaf734
480bb79
c9f6748
a5ca313
7f6c78e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1147,7 +1147,12 @@ void Service::DispatchCommand(CmdArgList args, facade::ConnectionContext* cntx) | |
// Bonus points because this allows to continue replication with ACL users who got | ||
// their access revoked and reinstated | ||
if (cid->name() == "REPLCONF" && absl::EqualsIgnoreCase(ArgS(args_no_cmd, 0), "ACK")) { | ||
LOG(ERROR) << "Tried to reply to REPLCONF"; | ||
auto info_ptr = server_family_.GetReplicaInfo(dfly_cntx); | ||
if (info_ptr) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. dont you need to call DflyCmd::CancelReplication? |
||
unsigned session_id = dfly_cntx->conn_state.replication_info.repl_session_id; | ||
DCHECK(session_id); | ||
server_family_.GetDflyCmd()->CancelReplication(session_id, std::move(info_ptr)); | ||
} | ||
return; | ||
} | ||
dfly_cntx->SendError(std::move(*err)); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -78,7 +78,6 @@ Replica::Replica(string host, uint16_t port, Service* se, std::string_view id, | |
Replica::~Replica() { | ||
sync_fb_.JoinIfNeeded(); | ||
acks_fb_.JoinIfNeeded(); | ||
acl_check_fb_.JoinIfNeeded(); | ||
} | ||
|
||
static const char kConnErr[] = "could not connect to master: "; | ||
|
@@ -150,7 +149,6 @@ void Replica::Stop() { | |
// so we can freely release resources (connections). | ||
sync_fb_.JoinIfNeeded(); | ||
acks_fb_.JoinIfNeeded(); | ||
acl_check_fb_.JoinIfNeeded(); | ||
} | ||
|
||
void Replica::Pause(bool pause) { | ||
|
@@ -625,8 +623,6 @@ error_code Replica::ConsumeRedisStream() { | |
error_code Replica::ConsumeDflyStream() { | ||
// Set new error handler that closes flow sockets. | ||
auto err_handler = [this](const auto& ge) { | ||
// Trigger acl-checker | ||
replica_waker_.notifyAll(); | ||
// Make sure the flows are not in a state transition | ||
lock_guard lk{flows_op_mu_}; | ||
DefaultErrorHandler(ge); | ||
|
@@ -655,12 +651,7 @@ error_code Replica::ConsumeDflyStream() { | |
shard_set->pool()->AwaitFiberOnAll(std::move(shard_cb)); | ||
} | ||
|
||
if (master_context_.version >= DflyVersion::VER3) { | ||
acl_check_fb_ = fb2::Fiber("acl-check", &Replica::AclCheckFb, this); | ||
} | ||
|
||
JoinDflyFlows(); | ||
acl_check_fb_.JoinIfNeeded(); | ||
|
||
last_journal_LSNs_.emplace(); | ||
for (auto& flow : shard_flows_) { | ||
|
@@ -893,53 +884,6 @@ void Replica::RedisStreamAcksFb() { | |
} | ||
} | ||
|
||
class AclCheckerClient : public ProtocolClient { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This introduces a small bug if master is an older version that we cover in our tests. The problem is that if ACL of the I guess this setup is more probably since upgrading from one version to the other would involve a replica copying the data and then being promoted to master. An intermediate solution would be to keep this check for older version masters and deprecate/completely remove at some later point. I am open for suggestions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is fine, the use-case is not interesting. |
||
public: | ||
AclCheckerClient(ServerContext server, Context* cntx) | ||
: ProtocolClient(std::move(server)), cntx_(cntx) { | ||
Connect(); | ||
} | ||
|
||
void CheckAclRoundTrip() { | ||
if (auto ec = SendCommandAndReadResponse(StrCat("REPLCONF acl-check ", "0")); ec) { | ||
cntx_->Cancel(); | ||
LOG(INFO) << "Error in REPLCONF acl-check: " << ec.message(); | ||
} else if (!CheckRespIsSimpleReply("OK")) { | ||
cntx_->Cancel(); | ||
LOG(INFO) << "Error: " << ToSV(LastResponseArgs().front().GetBuf()); | ||
} | ||
} | ||
|
||
private: | ||
void Connect() { | ||
VLOG(1) << "Connecting with acl client"; | ||
auto ec = ConnectAndAuth(absl::GetFlag(FLAGS_master_connect_timeout_ms) * 1ms, cntx_); | ||
if (ec) { | ||
LOG(INFO) << "Failed to connect with acl client " << ec.message(); | ||
cntx_->Cancel(); | ||
} | ||
} | ||
|
||
Context* cntx_; | ||
}; | ||
|
||
void Replica::AclCheckFb() { | ||
// We need a new client with a different socket for acl-checks | ||
// instead of using the ACK's fiber. This is because acks should | ||
// not be replied (which makes them unusable for periodic ACL checks). | ||
// Also there are N ACK fibers per replica instance while we only need | ||
// one fiber to periodically check for ACL changes. Therefore, | ||
// we decouple the logic via AclCheckFb. | ||
AclCheckerClient acl_client(server(), &cntx_); | ||
|
||
while (!cntx_.IsCancelled()) { | ||
acl_client.CheckAclRoundTrip(); | ||
// We poll for ACL changes every second | ||
replica_waker_.await_until([&]() { return cntx_.IsCancelled(); }, | ||
std::chrono::steady_clock::now() + std::chrono::seconds(1)); | ||
} | ||
} | ||
|
||
void DflyShardReplica::StableSyncDflyAcksFb(Context* cntx) { | ||
constexpr size_t kAckRecordMaxInterval = 1024; | ||
std::chrono::duration ack_time_max_interval = | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2599,6 +2599,7 @@ void ServerFamily::ReplConf(CmdArgList args, ConnectionContext* cntx) { | |
cntx->replication_flow->last_acked_lsn = ack; | ||
return; | ||
} else if (cmd == "ACL-CHECK") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This breaks replication when replica is of lower version. My two cents is that we don't care for this case since this setup doesn't really make much sense. Let me know if you disagree There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please keep the code in master and add a TODO with your anniversary date to remove this code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will add an alarm on my calendar as well 😉 |
||
// TODO(kostasrim): Remove this branch 20/6/2024 | ||
cntx->SendOk(); | ||
return; | ||
} else { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@adiholden please help me reviewing these changes