Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: Assumeutxo: import snapshot in a node with a divergent chain #29996

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/net_processing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1533,7 +1533,8 @@ void PeerManagerImpl::TryDownloadingHistoricalBlocks(const Peer& peer, unsigned
// complete history beneath the snapshot base.
return;
}

// If the background tip is not an ancestor of the snapshot block, we need to start requesting blocks from their last common ancestor.
from_tip = LastCommonAncestor(from_tip, target_block);
FindNextBlocks(vBlocks, peer, state, from_tip, count, std::min<int>(from_tip->nHeight + BLOCK_DOWNLOAD_WINDOW, target_block->nHeight));
}

Expand Down
52 changes: 45 additions & 7 deletions test/functional/feature_assumeutxo.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,12 @@

- TODO: Valid snapshot file, but referencing a snapshot block that turns out to be
invalid, or has an invalid parent
- TODO: Valid snapshot file and snapshot block, but the block is not on the
most-work chain

Interesting starting states could be loading a snapshot when the current chain tip is:

- TODO: An ancestor of snapshot block
- TODO: Not an ancestor of the snapshot block but has less work
- TODO: The snapshot block
- TODO: A descendant of the snapshot block
- TODO: Not an ancestor or a descendant of the snapshot block and has more work

"""
from shutil import rmtree
Expand All @@ -51,18 +47,20 @@ class AssumeutxoTest(BitcoinTestFramework):

def set_test_params(self):
    """Use the pregenerated, deterministic chain up to height 199.

    Five nodes are used: n0 (snapshot source), n1 (pruned, with indexes),
    n2 (with indexes), and n3/n4 (fresh nodes for the divergent-chain
    snapshot-import scenarios).
    """
    # NOTE: the original scrape carried a stale diff line assigning
    # num_nodes = 3 immediately before this one; the dead assignment
    # has been removed and only the effective 5-node config is kept.
    self.num_nodes = 5
    self.rpc_timeout = 120
    self.extra_args = [
        [],
        ["-fastprune", "-prune=1", "-blockfilterindex=1", "-coinstatsindex=1"],
        ["-persistmempool=0", "-txindex=1", "-blockfilterindex=1", "-coinstatsindex=1"],
        [],
        [],
    ]

def setup_network(self):
    """Start with the nodes disconnected so that one can generate a snapshot
    including blocks the other hasn't yet seen."""
    # NOTE: the original scrape carried a stale diff line calling
    # add_nodes(3) before add_nodes(5); calling both would create 8 nodes
    # while num_nodes is 5, so only the 5-node call is kept.
    self.add_nodes(5)
    self.start_nodes(extra_args=self.extra_args)

def test_invalid_snapshot_scenarios(self, valid_snapshot_path):
Expand Down Expand Up @@ -204,6 +202,40 @@ def test_snapshot_with_less_work(self, dump_output_path):
assert_raises_rpc_error(-32603, "Unable to load UTXO snapshot", node.loadtxoutset, dump_output_path)
self.restart_node(0, extra_args=self.extra_args[0])

def test_snapshot_in_a_divergent_chain(self, dump_output_path):
n0 = self.nodes[0]
n3 = self.nodes[3]
n4 = self.nodes[4]
assert_equal(n0.getblockcount(), FINAL_HEIGHT)
assert_equal(n3.getblockcount(), START_HEIGHT)
assert_equal(n4.getblockcount(), START_HEIGHT)

self.log.info(f"Check importing a snapshot where current chain-tip is not an ancestor of the snapshot block but has less work")
# Generate a divergent chain in n3 up to 298
self.generate(n3, nblocks=99, sync_fun=self.no_op)
assert_equal(n3.getblockcount(), SNAPSHOT_BASE_HEIGHT - 1)

# Try importing the snapshot and assert its success
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should also check that the background validation succeeds. Otherwise there could be a bug where the diverging chain is not rewound?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing this out. The background validation does not seem to finish in this case.

I am now testing with a new node n3 (in my local copy) to avoid the manual rollback to START_HEIGHT (L207). Additionally, sync_blocks() was throwing a timeout when reusing n2 for this test (not sure why).

Here's the new approach I'm trying:

  • The new node starts at START_HEIGHT (199).
  • I generate a divergent chain from START_HEIGHT up to height 298 (< SNAPSHOT_BASE_HEIGHT).
  • I load the snapshot (height=299).

After loading the snapshot, I can see these two chain states:


[{'blocks': 298, 'bestblockhash': '171f1d8af9371c9d54a3731f8befdfa6dd2fe553831970ddffdaeb0b93aa54d3', 'difficulty': Decimal('4.656542373906925E-10'), 'verificationprogress': 1, 'coins_db_cache_bytes': 7969177, 'coins_tip_cache_bytes': 438304768, 'validated': True}, {'blocks': 299, 'bestblockhash': '3bb7ce5eba0be48939b7a521ac1ba9316afee2c7bada3a0cca24188e6d7d96c0', 'difficulty': Decimal('4.656542373906925E-10'), 'verificationprogress': 1, 'coins_db_cache_bytes': 419430, 'coins_tip_cache_bytes': 23068672, 'snapshot_blockhash': '3bb7ce5eba0be48939b7a521ac1ba9316afee2c7bada3a0cca24188e6d7d96c0', 'validated': False}]

Next, I connect the nodes and ensure they all see the same tip:


self.connect_nodes(0, 3)
self.wait_until(lambda: n3.getchainstates()['chainstates'][-1]['blocks'] == FINAL_HEIGHT)
self.sync_blocks(nodes=(self.nodes[0], n3))


After syncing, these are the chain states:

[{'blocks': 298, 'bestblockhash': '171f1d8af9371c9d54a3731f8befdfa6dd2fe553831970ddffdaeb0b93aa54d3', 'difficulty': Decimal('4.656542373906925E-10'), 'verificationprogress': 1, 'coins_db_cache_bytes': 7969177, 'coins_tip_cache_bytes': 438304768, 'validated': True}, {'blocks': 399, 'bestblockhash': '193ad9344966a54125f4b8d3596572c356ca3dcc216b25b91cb0022fbf61c7e1', 'difficulty': Decimal('4.656542373906925E-10'), 'verificationprogress': 1, 'coins_db_cache_bytes': 419430, 'coins_tip_cache_bytes': 23068672, 'snapshot_blockhash': '3bb7ce5eba0be48939b7a521ac1ba9316afee2c7bada3a0cca24188e6d7d96c0', 'validated': False}]

It seems that the snapshot chain has now synced to the tip (height 399). However, this line times out after syncing the blocks:

self.wait_until(lambda: len(n3.getchainstates()['chainstates']) == 1)

I'm not sure if I'm doing something wrong or if there is indeed a bug where the divergent chain is not rewound. I will continue investigating.

Something to note here: if I follow the same process but I don't generate any divergent chain, then the validation completes successfully.

Any directions on how to proceed with this would be appreciated.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I'm doing something wrong or if there is indeed a bug where the divergent chain is not rewound. I will continue investigating.

This sounds like a bug. Just to clarify, the active chain is stuck at height 298 and the background chain continues to sync past 399?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess I am confused with the terms "active" and "background" chain. I assume the active chain is the snapshot chain which starts at height 299, and the divergent chain is the one stuck at 298. I base this on the fact that when I run getbestblockhash after loadtxoutset, I get the hash of the snapshot tip. However, I might be mistaken. Shouldn't the divergent chain be rewound to START_HEIGHT and become the background validation chain?

To address your questions:

  1. The divergent chain is indeed stuck at height 298.
  2. The snapshot chain continues to sync past height 399. Although 399 is the FINAL_HEIGHT for this test, I was able to mine an additional 100 blocks on top of node0, resulting in both node0 and node3 syncing again, and the snapshot chain syncing up to height 499.

However, even after syncing past 399, the background validation does not seem to finish. I always get a timeout when running this line after syncing the nodes:

self.wait_until(lambda: len(node.getchainstates()['chainstates']) == 1)

Additionally, I am experiencing an intermittent issue where the sync does not always finish, and I have not been able to determine the cause yet. This issue happens only after mining past 399.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can confirm this behavior and also think that this has uncovered a bug in net_processing.

I think that the root cause is in TryDownloadingHistoricalBlocks (which is responsible for downloading the background chain):
This function calls FindNextBlocks() with pindexWalk set to the current tip of the background chainstate (from_tip), and target_block set to the snapshot block.
FindNextBlocks then walks from the snapshot block backwards to the height of from_tip, saves these blocks in vToFetch and then begins to download these blocks in forward order.
This is incorrect, because the blocks starting at the last common ancestor of from_tip and the snapshot block, up to the height of from_tip, are never requested that way (their height is smaller than the height of from_tip).
So, my proposed fix would be something like mzumsande@edb2b69 (feel free to cherry-pick/ adjust as you like).

@alfonsoromanz: Could you check if that fix would solve the issue for you?
@ryanofsky Could you take a look - would you agree with that explanation and the proposed fix?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re: https://github.com/bitcoin/bitcoin/pull/29996/files#r1633915935

my proposed fix would be something like mzumsande@edb2b69 (feel free to cherry-pick/ adjust as you like).

Nice find! Would suggest opening a separate PR so it is easier to understand the problem and fix. And maybe it is possible to come up with a simpler test for this problem specifically, like by adding an assert in FindNextBlocks() that pindexWalk is an ancestor of state->pindexBestKnownBlock and then adding a test that triggers the assert.

Would also consider tweaking the fix to call LastCommonAncestor() before calling TryDownloadingHistoricalBlocks(), so it is easier to understand from_tip variable being the immediate predecessor of blocks to download next instead of having a more complicated meaning.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re: #29996 (comment)

To clarify the meaning of the Todo:

Interesting starting states could be loading a snapshot when the current chain tip is:

  • TODO: Not an ancestor of the snapshot block but has less work

Particularly "but has less work" could mean A) less work than the tip of the chain that includes the snapshot or B) less work than the snapshot block itself. I believe A is the more interesting and intended scenario because B only seems to be a state a node can be in if the snapshot block is invalid or its ancestors are unknown which should lead to a much earlier error.

I think the todo list came from my comment #27596 (comment), and I probably was thinking of interpretation (B) not (A), but maybe (B) is not a very interesting scenario.

I'm not sure why it needs to "lead to a much earlier error," though, or lead to any error. If the node is syncing to some chain that seems to have the most work, but then headers for a second chain are announced that has more work, the chainstate tip is not going to switch to the second chain, even though it has more work, until enough blocks from it are downloaded and validated, and a block from the second chain is reached that is valid and has more work than the chainstate tip. Before that happens, a snapshot from the second chain could be loaded such that the current chain tip has less work than the snapshot block and is not an ancestor of the snapshot block, but the snapshot block is valid and its ancestors can be downloaded.

Or maybe that is wrong, but at least it's my understanding.

Copy link
Contributor

@mzumsande mzumsande Jun 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me in principle but before doing 3 ensure that the node's tip is actually the tip of the divergent chain (...)

Maybe I misunderstand, but that seems overly complicated. I assume we're talking about the scenario "Not an ancestor of the snapshot block but has less work":
In my local test, I just gave the node the headers of the snapshot chain, and then used -generate to mine a divergent chain from the old tip. Number of blocks doesn't really matter. The other chain (with the snapshot in it) will have more work, but it is headers-only, so the tip will be on the divergent chain no matter how much work it has. Then, after the snapshot is loaded and we connected to a peer that has all the blocks, the node will successfully download the snapshot chain, but currently the background sync won't complete unless you apply my fix above. I assume that @alfonsoromanz's test (not pushed yet) works in a similar way.

have suggested a fix here: #30267

Huh, I didn't see @mzumsande 's post when I wrote mine... Will look into this approach as well.

Just to avoid any confusion: There are two independent issues. My issue pops up if you don't use invalidateblock anymore, as the current version of the PR still does.

However this new approach will require merging @mzumsande proposed fix to work.

I neither want to hijack this PR nor open a PR with just the fix without a test, so my suggestion would be that you could incorporate the one-line fix into this PR (meaning that this PR wouldn't be test-only anymore) - if you're interested and have the time, that is.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Particularly "but has less work" could mean A) less work than the tip of the chain that includes the snapshot or B) less work than the snapshot block itself. I believe A is the more interesting and intended scenario because B only seems to be a state a node can be in if the snapshot block is invalid or its ancestors are unknown which should lead to a much earlier error.

I interpreted it as B) less work than the snapshot block itself.

Regarding A): If the new node (divergent chain) has less work than the chain tip (399) but more than the snapshot (299), the expected behavior would be to get the error: "Unable to Load UTXO Snapshot - [snapshot] activation failed - work does not exceed active chainstate." This same behavior is expected for the other scenario of a node with a divergent chain but with more work, i.e "TODO: Not an ancestor or a descendant of the snapshot block and has more work"

So both scenarios look very similar to me. That's why I was leaning towards option B. However, I am not an expert on real scenarios in mainnet and I may be missing something.

Also, I started working on your approach and completed this part:

  • Start with a new node under test that is not connected to the nodes that have the real chain.
  • Mine a divergent chain of the same height as the real chain.
  • Compare the total work of the real chain with the divergent chain and grind the last block(s) of the divergent chain until its tip has less total work than that of the real chain, if this is not the case right away.
  • Then connect the node under test to a node with the real chain.

But this is where I get confused:

  • The node under test should have knowledge of the real chain with more work but it should still have the divergent chain as its tip because it has seen that block first.

After following you steps and connecting and syncing the nodes, the divergent chain is replaced with the original chain because has more work. If I don't run sync, it's not replaced, but I guess it's just a matter of time? or maybe I don't understand how connect and sync works.

Given that the snapshot will not be loaded because it doesn't exceed the active chainstate, I don't see much difference in making the divergent chain have less or more work than the original chain. What am I missing?

Either way, I am happy to add tests for both A) and B) scenarios.

In my local test, I just gave the node the headers of the snapshot chain, and then used -generate to mine a divergent chain from the old tip. Number of blocks doesn't really matter. The other chain (with the snapshot in it) will have more work, but it is headers-only, so the tip will be on the divergent chain no matter how much work it has. Then, after the snapshot is loaded and we connected to a peer that has all the blocks, the node will successfully download the snapshot chain, but currently the background sync won't complete unless you apply my fix above. I assume that @alfonsoromanz's test (not pushed yet) works in a similar way.

Yes that's what I'm doing in my local code (not pushed). I'm submitting the headers to n3 just like the original code is doing to n1 and n2. After that, I call this test function test_snapshot_in_a_divergent_chain where I generate the divergent chain and load the snapshot. The background validation only finishes if I apply your fix.

I neither want to hijack this PR nor open a PR with just the fix without a test, so my suggestion would be that you could incorporate the one-line fix into this PR (meaning that this PR wouldn't be test-only anymore) - if you're interested and have the time, that is.

Yes, I can incorporate it and add you as a co-author. Thanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just pushed my recent changes for this PR. This is the approach I decided to move forward with:

  1. Scenario Choice: Between these two scenarios mentioned by @fjahr: "Particularly 'but has less work' could mean A) less work than the tip of the chain that includes the snapshot or B) less work than the snapshot block itself," I chose B) for this PR: less work than the snapshot block itself. I am happy to add tests for the other scenario in a separate PR.
  2. Added Fix: I incorporated the fix from @mzumsande to start downloading historical blocks from the last common ancestor. This avoids the scenario where blocks in between are not requested, which causes the background validation to never finish. 8b6a18a
  3. New Node for "Divergent Chain with Less Work": I added a new node (n3) for the scenario where we load the snapshot in a node with a divergent chain but less work. I am not reusing previous nodes because I don't know a way to do a clean rollback without invalidating blocks. As mentioned by @fjahr, we shouldn't expect to load a snapshot in a scenario where part of the snapshot was invalidated. There is actually a new PR to prevent this from happening: assumeutxo: Check snapshot base block is not in invalid chain #30267.
  4. New Node for "Divergent Chain with More Work": I also added a new node (n4) for the scenario where we load the snapshot in a node with a divergent chain and more work. I was not able to reuse n3 for the same reason described previously: n3 has already synced to the tip, and I don't know any other way to rollback the chain other than invalidating blocks.

Any feedback is appreciated.

Thanks!

self.log.info('Importing the snapshot into n3')
loaded = n3.loadtxoutset(dump_output_path)
assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT)
assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT)

# Now lets sync the nodes and wait for the background validation to finish
self.connect_nodes(0, 3)
self.sync_blocks(nodes=(n0, n3))
print('Ensuring background validation finishes')
self.wait_until(lambda: len(n3.getchainstates()['chainstates']) == 1)

self.log.info(f"Check importing a snapshot where current chain-tip is not an ancestor or a descendant of the snapshot block and has more work")
alfonsoromanz marked this conversation as resolved.
Show resolved Hide resolved
# Generate a divergent chain in n4 that has more work than the snapshot
# This covers the scenario where the snapshot block is not on the most-work chain
self.generate(n4, nblocks=101, sync_fun=self.no_op)
assert_equal(n4.getblockcount(), SNAPSHOT_BASE_HEIGHT + 1)
# Import the snapshot and assert its failure
with n4.assert_debug_log(expected_msgs=["[snapshot] activation failed - work does not exceed active chainstate"]):
assert_raises_rpc_error(-32603, "Unable to load UTXO snapshot", n4.loadtxoutset, dump_output_path)

def run_test(self):
"""
Bring up two (disconnected) nodes, mine some new blocks on the first,
Expand All @@ -215,6 +247,8 @@ def run_test(self):
n0 = self.nodes[0]
n1 = self.nodes[1]
n2 = self.nodes[2]
n3 = self.nodes[3]
n4 = self.nodes[4]

self.mini_wallet = MiniWallet(n0)

Expand Down Expand Up @@ -265,6 +299,8 @@ def run_test(self):
# block.
n1.submitheader(block)
n2.submitheader(block)
n3.submitheader(block)
n4.submitheader(block)

# Ensure everyone is seeing the same headers.
for n in self.nodes:
Expand Down Expand Up @@ -455,7 +491,7 @@ def check_tx_counts(final: bool) -> None:

self.connect_nodes(0, 2)
self.wait_until(lambda: n2.getchainstates()['chainstates'][-1]['blocks'] == FINAL_HEIGHT)
self.sync_blocks()
self.sync_blocks(nodes=(n0, n2))

self.log.info("Ensuring background validation completes")
self.wait_until(lambda: len(n2.getchainstates()['chainstates']) == 1)
Expand Down Expand Up @@ -492,6 +528,8 @@ def check_tx_counts(final: bool) -> None:
self.connect_nodes(0, 2)
self.wait_until(lambda: n2.getblockcount() == FINAL_HEIGHT)

self.test_snapshot_in_a_divergent_chain(dump_output['path'])

@dataclass
class Block:
hash: str
Expand Down