Skip to content

Commit f7af19b

Browse files
Fix autoexpand during node replace (#98891)
Prior to this change, NodeReplacementAllocationDecider was unconditionally skipping both the replacement source and target nodes when calculating auto-expand replicas. This is fixed by auto-expanding to the replacement node if the source node already had shards of the index. Backport of PR #96281, amended for 7.17.x. Closes #89527. Co-authored-by: Ievgen Degtiarenko <ievgen.degtiarenko@elastic.co>
1 parent 0df52c8 commit f7af19b

File tree

6 files changed

+537
-168
lines changed

6 files changed

+537
-168
lines changed

server/src/main/java/org/elasticsearch/cluster/node/DiscoveryNodes.java

+14
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,20 @@ public DiscoveryNode findByAddress(TransportAddress address) {
277277
return null;
278278
}
279279

280+
/**
281+
* Check if a node with provided name exists
282+
*
283+
* @return {@code true} if a node with the provided name exists, {@code false} otherwise
284+
*/
285+
public boolean hasByName(String name) {
286+
for (DiscoveryNode node : nodes.values()) {
287+
if (node.getName().equals(name)) {
288+
return true;
289+
}
290+
}
291+
return false;
292+
}
293+
280294
/**
281295
* Returns the version of the node with the oldest version in the cluster that is not a client node
282296
*

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/NodeReplacementAllocationDecider.java

+59-17
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@
1818
import java.util.Map;
1919
import java.util.Optional;
2020

21+
/**
22+
* An allocation decider that ensures that all the shards allocated to the node scheduled for removal are relocated to the replacement node.
23+
* It also ensures that auto-expand replicas are expanded to only the replacement source or target (not both at the same time)
24+
* and only for shards that were already present on the source node.
25+
*/
2126
public class NodeReplacementAllocationDecider extends AllocationDecider {
2227

2328
public static final String NAME = "node_replacement";
@@ -37,8 +42,8 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
3742
Decision.Type.YES,
3843
NAME,
3944
"node [%s] is replacing node [%s], and may receive shards from it",
40-
shardRouting.currentNodeId(),
41-
node.nodeId()
45+
node.nodeId(),
46+
shardRouting.currentNodeId()
4247
);
4348
} else if (isReplacementSource(allocation, shardRouting.currentNodeId())) {
4449
return Decision.single(
@@ -96,22 +101,54 @@ public Decision shouldAutoExpandToNode(IndexMetadata indexMetadata, DiscoveryNod
96101
return NO_REPLACEMENTS;
97102
} else if (isReplacementTargetName(allocation, node.getName())) {
98103
final SingleNodeShutdownMetadata shutdown = allocation.replacementTargetShutdowns().get(node.getName());
99-
return Decision.single(
100-
Decision.Type.NO,
101-
NAME,
102-
"node [%s] is a node replacement target for node [%s], "
103-
+ "shards cannot auto expand to be on it until the replacement is complete",
104-
node.getId(),
105-
shutdown == null ? null : shutdown.getNodeId()
106-
);
104+
final String sourceNodeId = shutdown != null ? shutdown.getNodeId() : null;
105+
final boolean hasShardsAllocatedOnSourceOrTarget = hasShardOnNode(indexMetadata, node.getId(), allocation)
106+
|| (sourceNodeId != null && hasShardOnNode(indexMetadata, sourceNodeId, allocation));
107+
108+
if (hasShardsAllocatedOnSourceOrTarget) {
109+
return allocation.decision(
110+
Decision.YES,
111+
NAME,
112+
"node [%s] is a node replacement target for node [%s], "
113+
+ "shard can auto expand to it as it was already present on the source node",
114+
node.getId(),
115+
sourceNodeId
116+
);
117+
} else {
118+
return allocation.decision(
119+
Decision.NO,
120+
NAME,
121+
"node [%s] is a node replacement target for node [%s], "
122+
+ "shards cannot auto expand to be on it until the replacement is complete",
123+
node.getId(),
124+
sourceNodeId
125+
);
126+
}
107127
} else if (isReplacementSource(allocation, node.getId())) {
108-
return Decision.single(
109-
Decision.Type.NO,
110-
NAME,
111-
"node [%s] is being replaced by [%s], shards cannot auto expand to be on it",
112-
node.getId(),
113-
getReplacementName(allocation, node.getId())
114-
);
128+
final SingleNodeShutdownMetadata shutdown = allocation.metadata().nodeShutdowns().get(node.getId());
129+
final String replacementNodeName = shutdown != null ? shutdown.getTargetNodeName() : null;
130+
final boolean hasShardOnSource = hasShardOnNode(indexMetadata, node.getId(), allocation)
131+
&& shutdown != null
132+
&& allocation.nodes().hasByName(replacementNodeName) == false;
133+
134+
if (hasShardOnSource) {
135+
return allocation.decision(
136+
Decision.YES,
137+
NAME,
138+
"node [%s] is being replaced by [%s], shards can auto expand to be on it "
139+
+ "while replacement node has not joined the cluster",
140+
node.getId(),
141+
replacementNodeName
142+
);
143+
} else {
144+
return allocation.decision(
145+
Decision.NO,
146+
NAME,
147+
"node [%s] is being replaced by [%s], shards cannot auto expand to be on it",
148+
node.getId(),
149+
replacementNodeName
150+
);
151+
}
115152
} else {
116153
return Decision.single(
117154
Decision.Type.YES,
@@ -121,6 +158,11 @@ public Decision shouldAutoExpandToNode(IndexMetadata indexMetadata, DiscoveryNod
121158
}
122159
}
123160

161+
private static boolean hasShardOnNode(IndexMetadata indexMetadata, String nodeId, RoutingAllocation allocation) {
162+
RoutingNode node = allocation.routingNodes().node(nodeId);
163+
return node != null && node.numberOfOwningShardsForIndex(indexMetadata.getIndex()) >= 1;
164+
}
165+
124166
@Override
125167
public Decision canForceAllocateDuringReplace(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
126168
if (replacementFromSourceToTarget(allocation, shardRouting.currentNodeId(), node.node().getName())) {

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/NodeShutdownAllocationDecider.java

+10
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,16 @@ public Decision shouldAutoExpandToNode(IndexMetadata indexMetadata, DiscoveryNod
100100
node.getId()
101101
);
102102
case REPLACE:
103+
if (allocation.nodes().hasByName(thisNodeShutdownMetadata.getTargetNodeName()) == false) {
104+
return allocation.decision(
105+
Decision.YES,
106+
NAME,
107+
"node [%s] is preparing to be removed from the cluster, but replacement is not yet present",
108+
node.getId()
109+
);
110+
} else {
111+
return allocation.decision(Decision.NO, NAME, "node [%s] is preparing for removal from the cluster", node.getId());
112+
}
103113
case REMOVE:
104114
return allocation.decision(Decision.NO, NAME, "node [%s] is preparing for removal from the cluster", node.getId());
105115
default:

0 commit comments

Comments
 (0)