Upgrade cuML and cuDF (#1395)

calderjo · web-flow · commit 34e9a4a2584b · 2024-05-24T15:18:50.000-07:00
Upgrade RAPIDS to 24.4.x (cuML and cuDF). - It looks like installing via mamba was not working; installed using conda with the mamba package solver (which is faster than plain conda). - Added tests for cuML and cuDF; these packages were reported as "broken" for the last 6 months. - Skip certain tests if running on a P100 GPU. Going forward, RAPIDS will not be compatible with P100 GPUs; that's a problem for another day. https://chat.kaggle.net/kaggle/pl/85tczsc4w3nhijkd1ftryxr7yo b/296444923 b/341938540
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -99,6 +99,7 @@ ENV PROJ_LIB=/opt/conda/share/proj
 # the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/
 RUN conda config --add channels nvidia && \
     conda config --add channels rapidsai && \
+    conda config --set solver libmamba && \
     # b/299991198 remove curl/libcurl install once DLVM base image includes version >= 7.86
     conda install -c conda-forge mamba curl libcurl && \
     # Base image channel order: conda-forge (highest priority), defaults.
@@ -107,24 +108,17 @@ RUN conda config --add channels nvidia && \
     /tmp/clean-layer.sh
 
 # Install spacy
+# b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version.
+# b/341938540: uninstall grpc-cpp to allow >=v24.4 cudf and cuml to be installed. Version specs are quoted so the shell does not parse ">" as a redirection.
 {{ if eq .Accelerator "gpu" }}
-RUN mamba install -y -c conda-forge spacy cupy cuda-version=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
+RUN pip uninstall -y pyarrow && \
+    mamba remove -y --force grpc-cpp && \
+    mamba install -y -c conda-forge spacy "cudf>=24.4" "cuml>=24.4" cupy cuda-version=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
     /tmp/clean-layer.sh
 {{ else }}
 RUN pip install spacy && \
     /tmp/clean-layer.sh
 {{ end}}
-{{ if eq .Accelerator "gpu" }}
-
-# b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version.
-RUN pip uninstall -y pyarrow && \
-    mamba install -y cudf cuml && \
-    /tmp/clean-layer.sh
-
-# TODO: b/296444923 - Resolve pandas dependency another way
-RUN sed -i 's/^is_extension_type/# is_extension_type/g' /opt/conda/lib/python3.10/site-packages/cudf/api/types.py \
-    && sed -i 's/^is_categorical/# is_categorical/g' /opt/conda/lib/python3.10/site-packages/cudf/api/types.py
-{{ end }}
 
 # Install PyTorch
 {{ if eq .Accelerator "gpu" }}
diff --git a/tests/common.py b/tests/common.py
@@ -2,6 +2,16 @@
 
 import os
 import unittest
+import subprocess
+
+def getAcceleratorName():
+    """Return the GPU name(s) printed by `nvidia-smi`, or a placeholder string if it cannot be run or exits non-zero."""
+    try:
+        return subprocess.check_output(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader']).decode('utf-8').strip()
+    except (OSError, subprocess.CalledProcessError):  # called at import time: a missing or failing nvidia-smi must not abort the tests
+        return "nvidia-smi not found."
 
 gpu_test = unittest.skipIf(len(os.environ.get('CUDA_VERSION', '')) == 0, 'Not running GPU tests')
+# b/342143152: P100s are gradually losing support in new releases of popular ML tools such as RAPIDS. Substring match covers all P100 variants and multi-GPU output.
+p100_exempt = unittest.skipIf("P100" in getAcceleratorName(), 'Not running p100 exempt tests')
 tpu_test = unittest.skipIf(len(os.environ.get('ISTPUVM', '')) == 0, 'Not running TPU tests')
diff --git a/tests/test_cudf.py b/tests/test_cudf.py
@@ -0,0 +1,20 @@
+import unittest
+
+from common import gpu_test, p100_exempt
+
+
+class TestCudf(unittest.TestCase):
+    @gpu_test
+    @p100_exempt # b/342143152: cuDF(>=24.4v) is incompatible with p100 GPUs.
+    def test_cudf_dataframe_operations(self):
+        """Build a small cuDF DataFrame, add a derived column, and check its shape, column names and values."""
+        import cudf
+
+        data = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
+        gdf = cudf.DataFrame(data)
+        gdf['col3'] = gdf['col1'] + gdf['col2']
+
+        expected_col3 = cudf.Series([5, 7, 9])
+        self.assertEqual(gdf.shape, (3, 3))
+        self.assertEqual(list(gdf.columns), ['col1', 'col2', 'col3'])
+        self.assertTrue(gdf['col3'].equals(expected_col3))
diff --git a/tests/test_cuml.py b/tests/test_cuml.py
@@ -0,0 +1,19 @@
+import unittest
+
+from common import gpu_test, p100_exempt
+
+
+class TestCuml(unittest.TestCase):
+    @gpu_test
+    @p100_exempt # b/342143152: cuML(>=24.4v) is incompatible with p100 GPUs.
+    def test_pca_fit_transform(self):
+        """Fit a 1-component cuML PCA on a 5x2 NumPy array and check the shape of the transformed output."""
+        import numpy as np
+        from cuml.decomposition import PCA
+
+        x = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0], [-1.0, -2.0], [-2.0, -4.0]])
+        pca = PCA(n_components=1)
+
+        x_transformed = pca.fit_transform(x)
+
+        self.assertEqual(x_transformed.shape, (5, 1))
diff --git a/tests/test_datashader.py b/tests/test_datashader.py
@@ -1,13 +1,17 @@
 import unittest
 
-import numpy as np
-import pandas as pd
-import datashader as ds
-import datashader.transfer_functions as tf
+from common import p100_exempt
 
 class TestDatashader(unittest.TestCase):
-    # based on https://github.com/pyviz/datashader/blob/master/datashader/tests/test_pipeline.py
+
+    @p100_exempt # b/342143152: Uses cuDF(>=24.4v), which is no longer compatible with p100 GPUs.
     def test_pipeline(self):
+        # based on https://github.com/pyviz/datashader/blob/master/datashader/tests/test_pipeline.py
+        import numpy as np
+        import pandas as pd
+        import datashader as ds
+        import datashader.transfer_functions as tf
+
         df = pd.DataFrame({
             'x': np.array(([0.] * 10 + [1] * 10)),
             'y': np.array(([0.] * 5 + [1] * 5 + [0] * 5 + [1] * 5)),
diff --git a/tests/test_geoviews.py b/tests/test_geoviews.py
@@ -1,11 +1,16 @@
 import unittest
 
-import geoviews.feature as gf
-import holoviews as hv
-from cartopy import crs
+from common import p100_exempt
 
 class TestGeoviews(unittest.TestCase):
+
+    @p100_exempt # b/342143152: Uses cuDF(>=24.4v), which is no longer compatible with p100 GPUs.
+
     def test_viz(self):
+        import geoviews.feature as gf
+        import holoviews as hv
+        from cartopy import crs
+
         hv.extension('matplotlib')
         (gf.ocean + gf.land + gf.ocean * gf.land * gf.coastline * gf.borders).options(
             'Feature', projection=crs.Geostationary(), global_extent=True