
Commit 06e0523

craig[bot], hakuuww, and stevendanna committed
142239: raft: introduce term cache r=pav-kv,tbg a=hakuuww

This PR introduces a new sub-structure of `raftLog`, the `termCache`, which stores a suffix of the `raftLog` in a compressed representation and helps look up the term of a particular raft entry. The `termCache` integrates tightly with `raftLog`, which means it can rely on many invariants guaranteed by `raftLog`, allowing a concise implementation.

---

First, here is an example of what a raftLog may look like (entryID: term/index):

`[t5/10, t5/11, t5/12, t5/13, t6/14, t6/15, t6/16, t6/17, t6/18, t6/19, t7/20, t7/21, t7/22, t7/23, t7/24, t7/25, t7/26, t7/27, t8/28, t8/29, t8/30, t8/31, t10/32, t10/33, t10/34]`

Properties of a raftLog:
- entryID.index values are strictly increasing and contiguous.
- entryID.term values are monotonically increasing and may have gaps.

---

These properties let us use term change points to express a long, contiguous raftLog. The example raftLog above can be expressed as follows in the `termCache` representation, where each entry is a term change point:

`[t5/10, t6/14, t7/20, t8/28, t10/32]`

In practice, a raftLog may be hundreds of entries long but contain only a few term changes, so this compressed representation lets us describe a long raftLog's entryIDs cheaply (see the illustrative sketch after this description).

---

One immediate benefit is that there should no longer be any [raftEntry cache accesses or pebble calls when we want to know the term of a storage-persisted entry](https://github.com/cockroachdb/cockroach/blob/e587879be8cd0f1ace03952decf6dda2573f0b56/pkg/kv/kvserver/logstore/logstore.go#L614-L639). (This assumes term flips are rare; we can still hit pebble when we want the term of a very early entry that is more than `termCacheSize` terms old.) This helps avoid:
- unhelpful evictions in the raftEntry cache
- pebble accesses

Currently, neither scenario incurs a big cost, but we can still save some work.

---

A second benefit: since we now keep a compressed representation of a suffix of the raftLog, we can use it to carry more information during the leader's probing of followers. Currently, a MsgAppResp{reject = true} from a follower carries only a single hintIndex and hintTerm. With the term cache, we can include more information about the follower's raftLog in its MsgAppResp at relatively low overhead, which can be used to reduce the round trips involved in leader/follower probing. Assuming we keep a few (say 4) term change points in the `termCache`, we can attach all 4 of those data points to our raft RPC messages, which should be enough to cover the whole raftLog of a raft node. The term cache covers entryIDs in the range `[raftLog.first, raftLog.last]`, or something like `[entryID at committed index, raftLog.lastIndex]`. (In the real implementation we also need to attach a lastIndex, which the term cache doesn't keep, but unstable/raftLog does.) When the leader receives this `termCache` information in a `MsgAppResp{reject=true}` or `MsgVoteResp`, it can immediately determine the exact fork point at which to send the next MsgApp, instead of spending several probing round trips to find it. (Our current probing algorithm may take 2-3 round trips between leader and follower to find a fork point in a bad case involving multiple leadership changes and partitions.)

Part of #136296

Epic: None

Release note: None
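To make the compressed representation concrete, here is a minimal, self-contained sketch of the idea in Go. It is not the `term_cache.go` added by this PR (that file is not shown in this excerpt); the `entryID`/`termCache` names mirror the diff below, but the `add` helper and the method bodies are simplified illustrations.

```go
package main

import "fmt"

type entryID struct{ term, index uint64 }

// termCache keeps only term-change points, so it can describe a long raft log
// suffix with a handful of entryIDs.
type termCache struct {
    cache   []entryID // change points, oldest first
    maxSize int
}

// add records an entry, keeping it only if it starts a new term and evicting
// the oldest change point once maxSize is exceeded.
func (tc *termCache) add(id entryID) {
    if len(tc.cache) == 0 || id.term != tc.cache[len(tc.cache)-1].term {
        tc.cache = append(tc.cache, id)
        if len(tc.cache) > tc.maxSize {
            tc.cache = tc.cache[1:]
        }
    }
}

// term answers lookups for any index at or above the oldest cached change
// point. The caller is expected to bound index by the log's last index and to
// fall back to storage when found == false.
func (tc *termCache) term(index uint64) (term uint64, found bool) {
    for i := len(tc.cache) - 1; i >= 0; i-- {
        if index >= tc.cache[i].index {
            return tc.cache[i].term, true
        }
    }
    return 0, false
}

func main() {
    tc := termCache{maxSize: 4}
    // The example log above compresses to five change points; with maxSize 4
    // the oldest one (t5/10) is evicted.
    for _, id := range []entryID{{5, 10}, {6, 14}, {7, 20}, {8, 28}, {10, 32}} {
        tc.add(id)
    }
    fmt.Println(tc.term(25)) // 7 true  (t7 covers indices 20-27)
    fmt.Println(tc.term(12)) // 0 false (older than the cache; read from storage)
}
```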
143127: kvserver: add per-operation lock reliability settings r=yuzefovich a=stevendanna

Preserving unreplicated locks during splits, merges, and lease transfers has different trade-offs. For instance, during a split all lock updates are done in memory without any new replicated writes, whereas merges and lease transfers require replicating locks through raft. Here, we put the different operations under different settings, since we may want to ship different defaults for each operation.

Epic: none

Release note: None

Co-authored-by: Anthony Xu <anthony.xu@cockroachlabs.com>
Co-authored-by: Steven Danna <danna@cockroachlabs.com>
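For the per-operation settings change (143127), the practical difference is that each operation can now be toggled on its own. Below is a hedged Go sketch of that, mirroring the Override calls in the concurrency_manager_test.go diff further down; the helper name `makeLockReliabilitySettings` is hypothetical, while the setting variables and the Override signature come from this commit.

```go
package kvserver_test

import (
    "context"

    "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
    "github.com/cockroachdb/cockroach/pkg/settings/cluster"
)

// makeLockReliabilitySettings is a hypothetical helper (not part of this
// commit) that builds cluster settings with each lock-reliability behavior
// enabled or disabled independently.
func makeLockReliabilitySettings(split, merge, leaseTransfer bool) *cluster.Settings {
    ctx := context.Background()
    st := cluster.MakeClusterSettings()
    concurrency.UnreplicatedLockReliabilitySplit.Override(ctx, &st.SV, split)
    concurrency.UnreplicatedLockReliabilityMerge.Override(ctx, &st.SV, merge)
    concurrency.UnreplicatedLockReliabilityLeaseTransfer.Override(ctx, &st.SV, leaseTransfer)
    return st
}
```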
3 parents 263f14f + 03b193f + e1318fa commit 06e0523

10 files changed (+571, -15 lines)

pkg/kv/kvserver/client_replica_test.go (+2, -2)

@@ -5781,7 +5781,7 @@ func TestLeaseTransferReplicatesLocks(t *testing.T) {
    // txn2 is never unblocked (from the perspective of the client).
    ctx := context.Background()
    st := cluster.MakeClusterSettings()
-   concurrency.UnreplicatedLockReliability.Override(ctx, &st.SV, true)
+   concurrency.UnreplicatedLockReliabilityLeaseTransfer.Override(ctx, &st.SV, true)
    tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
      ServerArgs: base.TestServerArgs{
        Settings: st,
@@ -5891,7 +5891,7 @@ func TestMergeReplicatesLocks(t *testing.T) {
      ctx = context.Background()
      st = cluster.MakeClusterSettings()
    )
-   concurrency.UnreplicatedLockReliability.Override(ctx, &st.SV, true)
+   concurrency.UnreplicatedLockReliabilityMerge.Override(ctx, &st.SV, true)

    for _, b := range []bool{true, false} {
      name := "lhs-lock"

pkg/kv/kvserver/concurrency/concurrency_manager.go (+27, -9)

@@ -114,13 +114,31 @@ var BatchPushedLockResolution = settings.RegisterBoolSetting(
    true,
  )

- // UnreplicatedLockReliability controls whether the replica will attempt
- // to keep unreplicated locks during node operations such as split.
- var UnreplicatedLockReliability = settings.RegisterBoolSetting(
+ // UnreplicatedLockReliabilitySplit controls whether the replica will attempt
+ // to keep unreplicated locks during range split operations.
+ var UnreplicatedLockReliabilitySplit = settings.RegisterBoolSetting(
    settings.SystemOnly,
-   "kv.lock_table.unreplicated_lock_reliability.enabled",
-   "whether the replica should attempt to keep unreplicated locks during various node operations",
-   metamorphic.ConstantWithTestBool("kv.lock_table.unreplicated_lock_reliability.enabled", true),
+   "kv.lock_table.unreplicated_lock_reliability.split.enabled",
+   "whether the replica should attempt to keep unreplicated locks during range splits",
+   metamorphic.ConstantWithTestBool("kv.lock_table.unreplicated_lock_reliability.split.enabled", true),
+ )
+
+ // UnreplicatedLockReliabilityLeaseTransfer controls whether the replica will attempt
+ // to keep unreplicated locks during lease transfer operations.
+ var UnreplicatedLockReliabilityLeaseTransfer = settings.RegisterBoolSetting(
+   settings.SystemOnly,
+   "kv.lock_table.unreplicated_lock_reliability.lease_transfer.enabled",
+   "whether the replica should attempt to keep unreplicated locks during lease transfers",
+   metamorphic.ConstantWithTestBool("kv.lock_table.unreplicated_lock_reliability.lease_transfer.enabled", true),
+ )
+
+ // UnreplicatedLockReliabilityMerge controls whether the replica will
+ // attempt to keep unreplicated locks during range merge operations.
+ var UnreplicatedLockReliabilityMerge = settings.RegisterBoolSetting(
+   settings.SystemOnly,
+   "kv.lock_table.unreplicated_lock_reliability.merge.enabled",
+   "whether the replica should attempt to keep unreplicated locks during range merges",
+   metamorphic.ConstantWithTestBool("kv.lock_table.unreplicated_lock_reliability.merge.enabled", true),
  )

  // managerImpl implements the Manager interface.
@@ -590,7 +608,7 @@ var allKeysSpan = roachpb.Span{Key: keys.MinKey, EndKey: keys.MaxKey}

  // OnRangeLeaseTransferEval implements the RangeStateListener interface.
  func (m *managerImpl) OnRangeLeaseTransferEval() []*roachpb.LockAcquisition {
-   if !UnreplicatedLockReliability.Get(&m.st.SV) {
+   if !UnreplicatedLockReliabilityLeaseTransfer.Get(&m.st.SV) {
      return nil
    }

@@ -606,7 +624,7 @@ func (m *managerImpl) OnRangeLeaseTransferEval() []*roachpb.LockAcquisition {
  // during evalutation of Subsume. The returned LockAcquisition structs represent
  // held locks that we may want to flush to disk as replicated.
  func (m *managerImpl) OnRangeSubsumeEval() []*roachpb.LockAcquisition {
-   if !UnreplicatedLockReliability.Get(&m.st.SV) {
+   if !UnreplicatedLockReliabilityMerge.Get(&m.st.SV) {
      return nil
    }

@@ -636,7 +654,7 @@ func (m *managerImpl) OnRangeLeaseUpdated(seq roachpb.LeaseSequence, isLeasehold
  // LHS replica of a split and should be passed the new RHS start key (LHS
  // EndKey).
  func (m *managerImpl) OnRangeSplit(rhsStartKey roachpb.Key) []roachpb.LockAcquisition {
-   if UnreplicatedLockReliability.Get(&m.st.SV) {
+   if UnreplicatedLockReliabilitySplit.Get(&m.st.SV) {
      lockToMove := m.lt.ClearGE(rhsStartKey)
      m.twq.ClearGE(rhsStartKey)
      return lockToMove

pkg/kv/kvserver/concurrency/concurrency_manager_test.go (+3, -1)

@@ -735,7 +735,9 @@ func newClusterWithSettings(st *clustersettings.Settings) *cluster {
    // Set the latch manager's long latch threshold to infinity to disable
    // logging, which could cause a test to erroneously fail.
    spanlatch.LongLatchHoldThreshold.Override(context.Background(), &st.SV, math.MaxInt64)
-   concurrency.UnreplicatedLockReliability.Override(context.Background(), &st.SV, true)
+   concurrency.UnreplicatedLockReliabilitySplit.Override(context.Background(), &st.SV, true)
+   concurrency.UnreplicatedLockReliabilityMerge.Override(context.Background(), &st.SV, true)
+   concurrency.UnreplicatedLockReliabilityLeaseTransfer.Override(context.Background(), &st.SV, true)
    manual := timeutil.NewManualTime(timeutil.Unix(123, 0))
    return &cluster{
      nodeDesc: &roachpb.NodeDescriptor{NodeID: 1},

pkg/kv/kvserver/replica_command.go (+1, -1)

@@ -915,7 +915,7 @@ func (r *Replica) AdminMerge(
    // This must be a single request in a BatchRequest: there are multiple
    // places that do special logic (needed for safety) that rely on
    // BatchRequest.IsSingleSubsumeRequest() returning true.
-   shouldPreserveLocks := concurrency.UnreplicatedLockReliability.Get(&r.ClusterSettings().SV)
+   shouldPreserveLocks := concurrency.UnreplicatedLockReliabilityMerge.Get(&r.ClusterSettings().SV)
    br, pErr := kv.SendWrapped(ctx, r.store.DB().NonTransactionalSender(),
      &kvpb.SubsumeRequest{
        RequestHeader: kvpb.RequestHeader{

pkg/raft/BUILD.bazel (+2)

@@ -13,6 +13,7 @@ go_library(
      "rawnode.go",
      "status.go",
      "storage.go",
+     "term_cache.go",
      "testing_knobs.go",
      "types.go",
      "util.go",
@@ -50,6 +51,7 @@ go_test(
      "raft_test.go",
      "rawnode_test.go",
      "storage_test.go",
+     "term_cache_test.go",
      "types_test.go",
      "util_test.go",
    ],

pkg/raft/log.go (+28, -2)

@@ -42,10 +42,16 @@ type LogSnapshot struct {
    storage LogStorage
    // unstable contains the unstable log entries.
    unstable LeadSlice
+   // termCache contains a compressed entryID suffix of raftLog.
+   termCache termCache
    // logger gives access to logging errors.
    logger raftlogger.Logger
  }

+ // termCacheSize is the default max size of the termCache. It is small because
+ // term flips are very rare in practice.
+ const termCacheSize = 4
+
  type raftLog struct {
    // storage contains all stable entries since the last snapshot.
    storage Storage
@@ -54,6 +60,10 @@ type raftLog struct {
    // they will be saved into storage.
    unstable unstable

+   // termCache contains a suffix of the raftLog (both stable and unstable)
+   // used for term lookup.
+   termCache termCache
+
    // committed is the highest log position that is known to be in
    // stable storage on a quorum of nodes.
    committed uint64
@@ -108,6 +118,7 @@ func newLogWithSize(
    return &raftLog{
      storage: storage,
      unstable: newUnstable(last, logger),
+     termCache: newTermCache(termCacheSize, last),
      maxApplyingEntsSize: maxApplyingEntsSize,

      // Initialize our committed and applied pointers to the time of the last
@@ -177,15 +188,23 @@ func (l *raftLog) maybeAppend(a LeadSlice) bool {
    if first := a.entries[0].Index; first <= l.committed {
      l.logger.Panicf("entry %d is already committed [committed(%d)]", first, l.committed)
    }
-   return l.unstable.truncateAndAppend(a)
+   if !l.unstable.truncateAndAppend(a) {
+     return false
+   }
+   l.termCache.truncateAndAppend(a.LogSlice)
+   return true
  }

  // append adds the given log slice to the end of the log.
  //
  // Returns false if the operation can not be done: entry a.prev does not match
  // the lastEntryID of this log, or a.term is outdated.
  func (l *raftLog) append(a LeadSlice) bool {
-   return l.unstable.append(a)
+   if l.unstable.append(a) {
+     l.termCache.truncateAndAppend(a.LogSlice)
+     return true
+   }
+   return false
  }

  // match finds the longest prefix of the given log slice that matches the log.
@@ -449,6 +468,9 @@ func (l LogSnapshot) term(index uint64) (uint64, error) {
      return 0, ErrCompacted
    }

+   if term, found := l.termCache.term(index); found {
+     return term, nil
+   }
    term, err := l.storage.Term(index)
    if err == nil {
      return term, nil
@@ -516,6 +538,7 @@ func (l *raftLog) restore(s snapshot) bool {
    if !l.unstable.restore(s) {
      return false
    }
+   l.termCache.reset(id)
    l.committed = id.index
    return true
  }
@@ -667,10 +690,13 @@ func (l *raftLog) zeroTermOnOutOfBounds(t uint64, err error) uint64 {
  // snap returns a point-in-time snapshot of the raft log. This snapshot can be
  // read from while the underlying storage is not mutated.
  func (l *raftLog) snap(storage LogStorage) LogSnapshot {
+   // NB: termCache and unstable slice are safe to copy, and make sure to not
+   // corrupt their shallow copies.
    return LogSnapshot{
      compacted: l.compacted(),
      storage: storage,
      unstable: l.unstable.LeadSlice,
+     termCache: l.termCache,
      logger: l.logger,
    }
  }
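The hunks above call newTermCache, termCache.truncateAndAppend, termCache.term, and termCache.reset, but term_cache.go itself is not part of this excerpt. The standalone sketch below suggests, under that caveat, roughly what the maintenance operations could do; it reuses the same simplified entryID/termCache shape as the sketch near the top of the commit message and is illustrative only (the real truncateAndAppend takes a LogSlice rather than a prev/entries pair).

```go
package main

import "fmt"

type entryID struct{ term, index uint64 }

type termCache struct {
    cache   []entryID // term-change points, oldest first
    maxSize int
}

// newTermCache seeds the cache with the last known entryID, as the
// newTermCache(termCacheSize, last) call in newLogWithSize does above.
func newTermCache(maxSize int, last entryID) termCache {
    return termCache{cache: []entryID{last}, maxSize: maxSize}
}

// reset re-initializes the cache to a single entryID, as raftLog.restore does
// after applying a snapshot.
func (tc *termCache) reset(id entryID) {
    tc.cache = append(tc.cache[:0], id)
}

// truncateAndAppend drops cached change points that the appended slice
// overwrites (index > prev.index), then records the change points of the new
// entries, bounded by maxSize.
func (tc *termCache) truncateAndAppend(prev entryID, entries []entryID) {
    for len(tc.cache) > 0 && tc.cache[len(tc.cache)-1].index > prev.index {
        tc.cache = tc.cache[:len(tc.cache)-1]
    }
    if len(tc.cache) == 0 {
        tc.cache = append(tc.cache, prev)
    }
    for _, e := range entries {
        if e.term != tc.cache[len(tc.cache)-1].term {
            tc.cache = append(tc.cache, e)
            if len(tc.cache) > tc.maxSize {
                tc.cache = tc.cache[1:] // evict the oldest change point
            }
        }
    }
}

func main() {
    tc := newTermCache(4, entryID{term: 5, index: 10})
    // Regular appends: only the t5 -> t6 flip at index 14 becomes a new point.
    tc.truncateAndAppend(entryID{term: 5, index: 10},
        []entryID{{5, 11}, {5, 12}, {5, 13}, {6, 14}, {6, 15}})
    // A conflicting append after index 12 truncates the t6/14 point and
    // records the new t7/13 change point instead.
    tc.truncateAndAppend(entryID{term: 5, index: 12}, []entryID{{7, 13}})
    fmt.Println(tc.cache) // [{5 10} {7 13}]
}
```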

pkg/raft/log_test.go (+73)

@@ -586,6 +586,76 @@ func TestStableTo(t *testing.T) {
    }
  }

+ // TestTermCacheLookUpAfterStableTo tests the term cache lookup after we have
+ // persisted entries from unstable to stable. The test asserts that the term
+ // cache is used for lookups when possible and that the storage is not accessed.
+ func TestTermCacheLookUpAfterStableTo(t *testing.T) {
+   for _, tt := range []struct {
+     init LeadSlice
+     stableTo LogMark
+     wantTermCalls int // the expected number of LogStorage.Term() calls
+   }{{
+     init: entryID{}.append(1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5),
+     stableTo: LogMark{Term: 5, Index: 10},
+     wantTermCalls: 2, // indices 0-1
+   }, {
+     init: entryID{}.append(1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9),
+     stableTo: LogMark{Term: 9, Index: 10},
+     wantTermCalls: 6, // indices 0-5
+   }, {
+     init: entryID{}.append(1, 2, 3),
+     stableTo: LogMark{Term: 3, Index: 2},
+     wantTermCalls: 0,
+   }, {
+     init: entryID{}.append(1, 2, 3, 4),
+     stableTo: LogMark{Term: 4, Index: 3},
+     wantTermCalls: 1, // index 0
+   }, {
+     init: entryID{}.append(1, 2, 3, 4, 5),
+     stableTo: LogMark{Term: 5, Index: 4},
+     wantTermCalls: 2, // indices 0-1
+   }, {
+     init: entryID{}.append(1, 2, 3, 4, 5, 5, 5, 5, 5),
+     stableTo: LogMark{Term: 5, Index: 7},
+     wantTermCalls: 2, // indices 0-1
+   }, {
+     init: entryID{}.append(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
+     stableTo: LogMark{Term: 3, Index: 10},
+     wantTermCalls: 0,
+   }, {
+     init: entryID{index: 10, term: 4}.append(5, 5, 6, 6, 7, 7, 8, 8, 9, 9),
+     stableTo: LogMark{Term: 9, Index: 16},
+     wantTermCalls: 3, // indices 10-12
+   }} {
+     t.Run("", func(t *testing.T) {
+       // Initialize the log storage to a particular truncated state.
+       storage := NewMemoryStorage()
+       initID := tt.init.prev
+       if initID.index != 0 {
+         require.NoError(t, storage.ApplySnapshot(pb.Snapshot{Metadata: pb.SnapshotMetadata{
+           Index: initID.index, Term: initID.term,
+         }}))
+       }
+       // Initialize the raft log from the storage.
+       raftLog := newLog(storage, raftlogger.DiscardLogger)
+       // Manually hardcode the term cache size for testing.
+       raftLog.termCache.maxSize = 4
+       require.True(t, raftLog.append(tt.init))
+       // Imitate a transfer of some unstable entries into storage.
+       require.NoError(t, storage.Append(tt.init.sub(initID.index, tt.stableTo.Index)))
+       raftLog.stableTo(tt.stableTo)
+
+       // Do term lookup for the parts of raftLog not covered by unstable.
+       start, end := tt.init.LogSlice.prev.index, raftLog.unstable.prev.index
+       for i := start; i <= end; i++ {
+         _, err := raftLog.term(i)
+         require.NoError(t, err)
+       }
+       require.Equal(t, tt.wantTermCalls, storage.callStats.term-1)
+     })
+   }
+ }
+
  func TestStableToWithSnap(t *testing.T) {
    snapID := entryID{term: 2, index: 5}
    snap := pb.Snapshot{Metadata: pb.SnapshotMetadata{Term: snapID.term, Index: snapID.index}}
@@ -674,6 +744,9 @@ func TestLogRestore(t *testing.T) {
    require.Equal(t, index, raftLog.committed)
    require.Equal(t, index, raftLog.unstable.prev.index)
    require.Equal(t, term, mustTerm(raftLog.term(index)))
+   // Term cache should be re-initialized to only have the snapshot entryID.
+   require.Equal(t, entryID{index: index, term: term}, raftLog.termCache.first())
+   require.Equal(t, 1, len(raftLog.termCache.cache))
  }

  func TestIsOutOfBounds(t *testing.T) {

pkg/raft/storage.go (+1)

@@ -334,6 +334,7 @@ func MakeLogSnapshot(ms *MemoryStorage) LogSnapshot {
      compacted: ms.Compacted(),
      storage: ms.LogSnapshot(),
      unstable: LeadSlice{term: ls.lastEntryID().term, LogSlice: ls},
+     termCache: newTermCache(1, ms.ls.lastEntryID()),
      logger: raftlogger.DiscardLogger,
    }
  }
