fix(tests): remove sync_empty_state (#12473)
This test can fail with nightly features and block PR merges:
https://nayduck.nearone.org/#/test/231517. It fails because it sets up a
node with a real rocksdb database in a temp dir, and creating snapshots
takes several seconds even with small state, because [this
loop](https://github.com/near/nearcore/blob/8e0b26f161b43ef03e41342cd08e1363dfa23db9/core/store/src/db/rocksdb.rs#L475)
takes a while. As a result, the node trying to state sync can't get
parts from the other node, which doesn't have a snapshot yet, and it
doesn't retry the request until long after the current epoch is over
and the other node has already deleted that snapshot. Why that loop in
`create_checkpoint()` takes so long is perhaps worth looking into, but
it's not an issue in practice with long epoch lengths, so it's not
high-priority enough to justify the test failures it causes.
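
For intuition, here is a minimal sketch of that race, using made-up
durations rather than the actual nearcore constants:

    use std::time::Duration;

    fn main() {
        // All values below are illustrative assumptions.
        let snapshot_ready_at = Duration::from_secs(5); // create_checkpoint() loop finishes
        let first_request_at = Duration::from_secs(1); // syncing node asks early in the epoch
        let retry_backoff = Duration::from_secs(60); // long delay before re-requesting parts
        let epoch_ends_at = Duration::from_secs(30); // short test epochs end quickly

        // The first request arrives before the snapshot exists, so it fails...
        assert!(first_request_at < snapshot_ready_at);
        // ...and the retry lands only after the epoch is over and the
        // snapshot has been deleted.
        assert!(first_request_at + retry_backoff > epoch_ends_at);
    }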

Fix it by deleting that test and modifying the test loop state sync
test to reproduce the same condition: fewer accounts than shards, so
that one of the shards is empty. (Note that with nightly features the
state won't actually be empty, since shards without accounts will still
contain `BandwidthSchedulerState` data.)
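
As a sanity check on the "fewer accounts than shards" condition, here is
a hedged sketch of how boundary accounts partition account IDs into
shards, with a toy `shard_for` helper (not nearcore's actual shard
assignment code) and illustrative boundary names:

    fn shard_for(account: &str, boundaries: &[&str]) -> usize {
        // Shard i holds the account IDs that fall between boundary i-1
        // and boundary i in lexicographic order.
        boundaries.partition_point(|b| *b <= account)
    }

    fn main() {
        let boundaries = ["b0", "c0", "d0"]; // 4 shards need 3 boundary accounts
        let accounts = ["near", "node0", "node1"]; // 2 validators plus "near"
        let mut counts = [0usize; 4];
        for account in accounts {
            counts[shard_for(account, &boundaries)] += 1;
        }
        // 3 accounts across 4 shards: by pigeonhole at least one shard is empty.
        assert!(counts.iter().any(|&c| c == 0));
        println!("accounts per shard: {counts:?}");
    }
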
marcelo-gonzalez authored Nov 18, 2024
1 parent 1a3382c commit 7dfdfb0
Showing 3 changed files with 58 additions and 151 deletions.
79 changes: 58 additions & 21 deletions integration-tests/src/test_loop/tests/state_sync.rs
@@ -19,14 +19,7 @@ use std::collections::HashMap;

const EPOCH_LENGTH: BlockHeightDelta = 40;

struct ShardAccounts {
boundary_accounts: Vec<String>,
accounts: Vec<Vec<(AccountId, Nonce)>>,
}

fn generate_accounts(num_shards: usize) -> ShardAccounts {
let accounts_per_shard = 5;

fn get_boundary_accounts(num_shards: usize) -> Vec<String> {
if num_shards > 27 {
todo!("don't know how to include more than 27 shards yet!");
}
@@ -41,7 +34,11 @@ fn generate_accounts(num_shards: usize) -> ShardAccounts {
}
boundary_accounts.push(boundary_account);
}
boundary_accounts
}

fn generate_accounts(boundary_accounts: &[String]) -> Vec<Vec<(AccountId, Nonce)>> {
let accounts_per_shard = 5;
let mut accounts = Vec::new();
let mut account_base = "0";
for a in boundary_accounts.iter() {
@@ -58,23 +55,24 @@ fn generate_accounts(num_shards: usize) -> ShardAccounts {
.collect::<Vec<_>>(),
);

ShardAccounts { boundary_accounts, accounts }
accounts
}

struct TestState {
env: TestLoopEnv,
accounts: Vec<Vec<(AccountId, Nonce)>>,
accounts: Option<Vec<Vec<(AccountId, Nonce)>>>,
}

fn setup_initial_blockchain(
num_validators: usize,
num_shards: usize,
generate_shard_accounts: bool,
chunks_produced: HashMap<ShardId, Vec<bool>>,
) -> TestState {
let builder = TestLoopBuilder::new();

let num_block_producer_seats = 1;
let num_chunk_producer_seats = num_shards;
let num_validators = std::cmp::max(num_block_producer_seats, num_chunk_producer_seats);
let validators = (0..num_validators)
.map(|i| {
let account_id = format!("node{}", i);
@@ -88,7 +86,9 @@ fn setup_initial_blockchain(
.collect::<Vec<_>>();
let clients = validators.iter().map(|v| v.account_id.clone()).collect::<Vec<_>>();

let ShardAccounts { boundary_accounts, accounts } = generate_accounts(num_shards);
let boundary_accounts = get_boundary_accounts(num_shards);
let accounts =
if generate_shard_accounts { Some(generate_accounts(&boundary_accounts)) } else { None };

let mut genesis_builder = TestGenesisBuilder::new();
genesis_builder
@@ -111,9 +111,11 @@
// This part is the only reference to state sync at all in this test, since all we check is that the blockchain
// progresses for a few epochs, meaning that state sync must have been successful.
.shuffle_shard_assignment_for_chunk_producers(true);
for accounts in accounts.iter() {
for (account, _nonce) in accounts.iter() {
genesis_builder.add_user_account_simple(account.clone(), 10000 * ONE_NEAR);
if let Some(accounts) = accounts.as_ref() {
for accounts in accounts.iter() {
for (account, _nonce) in accounts.iter() {
genesis_builder.add_user_account_simple(account.clone(), 10000 * ONE_NEAR);
}
}
}
let (genesis, epoch_config_store) = genesis_builder.build();
@@ -198,7 +200,7 @@ fn send_txs_between_shards(
/// runs the network and sends transactions at the beginning of each epoch. At the end the condition we're
/// looking for is just that a few epochs have passed, because that should only be possible if state sync was successful
/// (which will be required because we enable chunk producer shard shuffling on this chain)
fn produce_chunks(env: &mut TestLoopEnv, mut accounts: Vec<Vec<(AccountId, Nonce)>>) {
fn produce_chunks(env: &mut TestLoopEnv, mut accounts: Option<Vec<Vec<(AccountId, Nonce)>>>) {
let handle = env.datas[0].client_sender.actor_handle();
let client = &env.test_loop.data.get(&handle).client;
let mut tip = client.chain.head().unwrap();
Expand Down Expand Up @@ -228,7 +230,9 @@ fn produce_chunks(env: &mut TestLoopEnv, mut accounts: Vec<Vec<(AccountId, Nonce
if epoch_id_switches > 2 {
break;
}
send_txs_between_shards(&mut env.test_loop, &env.datas, &mut accounts);
if let Some(accounts) = accounts.as_mut() {
send_txs_between_shards(&mut env.test_loop, &env.datas, accounts);
}
}
tip = new_tip;
}
@@ -242,7 +246,9 @@ fn run_test(state: TestState) {
* u32::try_from(EPOCH_LENGTH).unwrap_or(u32::MAX)
+ Duration::seconds(2);

send_txs_between_shards(&mut env.test_loop, &env.datas, &mut accounts);
if let Some(accounts) = accounts.as_mut() {
send_txs_between_shards(&mut env.test_loop, &env.datas, accounts);
}

env.test_loop.run_until(
|data| {
@@ -260,17 +266,42 @@

#[derive(Debug)]
struct StateSyncTest {
num_validators: usize,
num_shards: usize,
// If true, generate several extra accounts per shard. We have a test with this disabled
// to test state syncing shards without any account data
generate_shard_accounts: bool,
chunks_produced: &'static [(ShardId, &'static [bool])],
}

static TEST_CASES: &[StateSyncTest] = &[
// The first two make no modifications to chunks_produced, and all chunks should be produced. This is the normal case
StateSyncTest { num_shards: 2, chunks_produced: &[] },
StateSyncTest { num_shards: 4, chunks_produced: &[] },
StateSyncTest {
num_validators: 2,
num_shards: 2,
generate_shard_accounts: true,
chunks_produced: &[],
},
StateSyncTest {
num_validators: 4,
num_shards: 4,
generate_shard_accounts: true,
chunks_produced: &[],
},
// In this test we have 2 validators and 4 shards, and we don't generate any extra accounts.
// That makes 3 accounts including the "near" account. This means at least one shard will have no
// accounts in it, so we check that corner case here.
StateSyncTest {
num_validators: 2,
num_shards: 4,
generate_shard_accounts: false,
chunks_produced: &[],
},
// Now we miss some chunks at the beginning of the epoch
StateSyncTest {
num_validators: 4,
num_shards: 4,
generate_shard_accounts: true,
chunks_produced: &[
(ShardId::new(0), &[false]),
(ShardId::new(1), &[true]),
@@ -279,11 +310,15 @@ static TEST_CASES: &[StateSyncTest] = &[
],
},
StateSyncTest {
num_validators: 4,
num_shards: 4,
generate_shard_accounts: true,
chunks_produced: &[(ShardId::new(0), &[true, false]), (ShardId::new(1), &[true, false])],
},
StateSyncTest {
num_validators: 4,
num_shards: 4,
generate_shard_accounts: true,
chunks_produced: &[
(ShardId::new(0), &[false, true]),
(ShardId::new(2), &[true, false, true]),
@@ -298,7 +333,9 @@
for t in TEST_CASES.iter() {
tracing::info!("run test: {:?}", t);
let state = setup_initial_blockchain(
t.num_validators,
t.num_shards,
t.generate_shard_accounts,
t.chunks_produced
.iter()
.map(|(shard_id, produced)| (*shard_id, produced.to_vec()))
@@ -357,7 +394,7 @@ fn spam_state_sync_header_reqs(env: &mut TestLoopEnv) {
fn slow_test_state_request() {
init_test_logger();

let TestState { mut env, .. } = setup_initial_blockchain(4, HashMap::default());
let TestState { mut env, .. } = setup_initial_blockchain(4, 4, false, HashMap::default());

spam_state_sync_header_reqs(&mut env);
env.shutdown_and_drain_remaining_events(Duration::seconds(3));
128 changes: 0 additions & 128 deletions integration-tests/src/tests/client/sync_state_nodes.rs
@@ -292,134 +292,6 @@ fn ultra_slow_test_sync_state_nodes_multishard() {
});
}

/// Start a validator that validates four shards. Since we only have 3 accounts, one shard must have
/// empty state. Start another node that does state sync. Check that state sync on empty state works.
#[test]
fn ultra_slow_test_sync_empty_state() {
heavy_test(|| {
init_integration_logger();

let mut genesis = Genesis::test_sharded_new_version(
vec!["test1".parse().unwrap(), "test2".parse().unwrap()],
1,
vec![1, 1, 1, 1],
);
genesis.config.epoch_length = 20;

let _dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap());
let dir1 = _dir1.clone();
let _dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap());
let dir2 = _dir2.clone();

run_actix(async move {
let (port1, port2) =
(tcp::ListenerAddr::reserve_for_test(), tcp::ListenerAddr::reserve_for_test());

// State sync triggers when header head is two epochs in the future.
// Produce more blocks to make sure that state sync gets triggered when the second node starts.
let state_sync_horizon = 10;
let block_header_fetch_horizon = 1;
let block_fetch_horizon = 1;

let mut near1 = load_test_config("test1", port1, genesis.clone());
near1.client_config.min_num_peers = 0;
near1.client_config.min_block_production_delay = Duration::milliseconds(200);
near1.client_config.max_block_production_delay = Duration::milliseconds(400);

let nearcore::NearNode { view_client: view_client1, .. } =
start_with_config(dir1.path(), near1).expect("start_with_config");

let view_client2_holder = Arc::new(RwLock::new(None));
let arbiters_holder = Arc::new(RwLock::new(vec![]));
let arbiters_holder2 = arbiters_holder;

WaitOrTimeoutActor::new(
Box::new(move |_ctx| {
if view_client2_holder.read().unwrap().is_none() {
let view_client2_holder2 = view_client2_holder.clone();
let arbiters_holder2 = arbiters_holder2.clone();
let genesis2 = genesis.clone();
let dir2 = dir2.clone();

let actor = view_client1.send(GetBlock::latest().with_span_context());
let actor = actor.then(move |res| {
match &res {
Ok(Ok(b)) if b.header.height >= state_sync_horizon + 1 => {
let mut view_client2_holder2 =
view_client2_holder2.write().unwrap();
let mut arbiters_holder2 = arbiters_holder2.write().unwrap();

if view_client2_holder2.is_none() {
let mut near2 = load_test_config("test2", port2, genesis2);
near2.network_config.peer_store.boot_nodes =
convert_boot_nodes(vec![("test1", *port1)]);
near2.client_config.min_num_peers = 1;
near2.client_config.min_block_production_delay =
Duration::milliseconds(200);
near2.client_config.max_block_production_delay =
Duration::milliseconds(400);
near2.client_config.block_header_fetch_horizon =
block_header_fetch_horizon;
near2.client_config.block_fetch_horizon =
block_fetch_horizon;
near2.client_config.tracked_shards = vec![ShardId::new(0)]; // Track all shards.

let nearcore::NearNode {
view_client: view_client2,
arbiters,
..
} = start_with_config(dir2.path(), near2)
.expect("start_with_config");
*view_client2_holder2 = Some(view_client2);
*arbiters_holder2 = arbiters;
}
}
Ok(Ok(b)) if b.header.height <= state_sync_horizon => {
println!("FIRST STAGE {}", b.header.height)
}
Err(_) => return future::ready(()),
_ => {}
};
future::ready(())
});
actix::spawn(actor);
}

if let Some(view_client2) = &*view_client2_holder.write().unwrap() {
let actor = view_client2.send(GetBlock::latest().with_span_context());
let actor = actor.then(|res| {
match &res {
Ok(Ok(b)) if b.header.height >= 40 => System::current().stop(),
Ok(Ok(b)) if b.header.height < 40 => {
println!("SECOND STAGE {}", b.header.height)
}
Ok(Err(e)) => {
println!("SECOND STAGE ERROR1: {:?}", e);
return future::ready(());
}
Err(e) => {
println!("SECOND STAGE ERROR2: {:?}", e);
return future::ready(());
}
_ => {
assert!(false);
}
};
future::ready(())
});
actix::spawn(actor);
}
}),
100,
600000,
)
.start();
});
drop(_dir1);
drop(_dir2);
});
}

#[test]
// FIXME(#9650): locks should not be held across await points, allowed currently only because the
// lint started triggering during a toolchain bump.
2 changes: 0 additions & 2 deletions nightly/expensive.txt
@@ -154,8 +154,6 @@ expensive integration-tests integration_tests tests::client::process_blocks::ult
expensive integration-tests integration_tests tests::client::process_blocks::ultra_slow_test_gc_after_state_sync --features nightly
expensive integration-tests integration_tests tests::client::process_blocks::ultra_slow_test_process_block_after_state_sync
expensive integration-tests integration_tests tests::client::process_blocks::ultra_slow_test_process_block_after_state_sync --features nightly
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_empty_state
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_empty_state --features nightly
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_state_dump
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_state_dump --features nightly
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_state_nodes
