From 47b120cc7f5113c2ff51b785759b8962cddf283f Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 8 Mar 2023 15:53:53 -0600 Subject: [PATCH] Add in a GpuOOM exception so running out of GPU memory is not fatal to Spark (#995) Signed-off-by: Robert (Bobby) Evans --- .../com/nvidia/spark/rapids/jni/GpuOOM.java | 32 +++++++++++++++++++ .../com/nvidia/spark/rapids/jni/RetryOOM.java | 2 +- .../spark/rapids/jni/SplitAndRetryOOM.java | 2 +- 3 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/GpuOOM.java diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuOOM.java new file mode 100644 index 00000000000..0676c828855 --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuOOM.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +/** + * A special version of an out of memory error that indicates we ran out of GPU memory. This is + * mostly to avoid a fatal error that would force the worker process to restart. This should be + * recoverable on the GPU. 
+ */ +public class GpuOOM extends RuntimeException { + public GpuOOM() { + super(); + } + + public GpuOOM(String message) { + super(message); + } +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java index 409fa8f20c3..62d5e28cca0 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/RetryOOM.java @@ -20,7 +20,7 @@ * A special version of an out of memory error that indicates we ran out of memory, but should * roll back to a point when all memory for the task is spillable and then retry the operation. */ -public class RetryOOM extends OutOfMemoryError { +public class RetryOOM extends GpuOOM { public RetryOOM() { super(); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java b/src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java index 7a54d415cfc..022c6952a11 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/SplitAndRetryOOM.java @@ -21,7 +21,7 @@ * roll back to a point when all memory for the task is spillable and then retry the operation * with the input data split to make it ideally use less GPU memory overall. */ -public class SplitAndRetryOOM extends OutOfMemoryError { +public class SplitAndRetryOOM extends GpuOOM { public SplitAndRetryOOM() { super(); }