Skip to content

Commit 6e22a43

Browse files
committed
Add OSS GPU tests
ghstack-source-id: bf0b9661ac3569e0e9b5ffd103575a37b8148d97
Pull Request resolved: #231
1 parent f5d0290 commit 6e22a43

File tree

4 files changed

+125
-22
lines changed

4 files changed

+125
-22
lines changed

Diff for: .github/scripts/install_nvidia_utils_linux.sh

+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
set -eou pipefail
2+
3+
4+
DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
5+
DRIVER_VERSION="515.57"
6+
DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
7+
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
8+
9+
install_nvidia_docker2_amzn2() {
10+
(
11+
set -x
12+
# Needed for yum-config-manager
13+
sudo yum install -y yum-utils
14+
sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
15+
sudo yum install -y nvidia-docker2
16+
sudo systemctl restart docker
17+
)
18+
}
19+
20+
install_nvidia_driver_amzn2() {
21+
(
22+
set -x
23+
24+
# Purge any nvidia driver installed from RHEL repo
25+
sudo yum remove -y nvidia-driver-latest-dkms
26+
27+
HAS_NVIDIA_DRIVER=0
28+
# Check if NVIDIA driver has already been installed
29+
if [ -x "$(command -v nvidia-smi)" ]; then
30+
# The driver exists, check its version next
31+
INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
32+
33+
if [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
34+
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
35+
else
36+
HAS_NVIDIA_DRIVER=1
37+
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
38+
fi
39+
fi
40+
41+
if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
42+
sudo yum groupinstall -y "Development Tools"
43+
# ensure our kernel install is the same as our underlying kernel,
44+
# groupinstall "Development Tools" has a habit of mismatching kernel headers
45+
sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
46+
sudo modprobe backlight
47+
sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
48+
sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
49+
sudo rm -fv /tmp/nvidia_driver
50+
fi
51+
52+
(
53+
set +e
54+
nvidia-smi
55+
status=$?
56+
# Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
57+
if [ $status -eq 0 ] || [ $status -eq 14 ]; then
58+
echo "INFO: Ignoring allowed status ${status}"
59+
else
60+
echo "ERROR: nvidia-smi exited with unresolved status ${status}"
61+
exit ${status}
62+
fi
63+
)
64+
)
65+
}
66+
67+
echo "== Installing nvidia driver ${DRIVER_FN} =="
68+
case "${DISTRIBUTION}" in
69+
amzn*)
70+
install_nvidia_driver_amzn2
71+
;;
72+
*)
73+
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
74+
exit 1
75+
;;
76+
esac
77+
78+
# Install container toolkit based on distribution
79+
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
80+
case "${DISTRIBUTION}" in
81+
amzn*)
82+
install_nvidia_docker2_amzn2
83+
;;
84+
*)
85+
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
86+
exit 1
87+
;;
88+
esac
89+

Diff for: .github/workflows/runtime_tests.yaml

+16-4
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,27 @@ jobs:
1313
matrix:
1414
python-major-version: [3]
1515
python-minor-version: [7,8,9,10]
16-
platform: [ubuntu-18.04]
16+
platform: [linux.4xlarge.nvidia.gpu]
1717
fail-fast: false
1818
runs-on: ${{ matrix.platform }}
1919
steps:
2020
- name: Checkout MultiPy
2121
uses: actions/checkout@v2
2222
with:
2323
submodules: true
24-
24+
- name: Clean up previous CUDA driver installations
25+
shell: bash
26+
run: |
27+
set -x
28+
yum list installed | grep nvidia || true
29+
yum list installed | grep cuda || true
30+
sudo yum remove -y cuda || true
31+
sudo yum remove -y cuda-drivers || true
32+
sudo yum remove -y "*nvidia*" || true
33+
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
34+
run: |
35+
bash .github/workflows/install_nvidia_utils_linux.sh || true
36+
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
2537
- name: Setup SSH (Click me for login details)
2638
uses: ./.github/actions/setup-ssh
2739
with:
@@ -30,11 +42,11 @@ jobs:
3042
- name: Build
3143
env:
3244
DOCKER_BUILDKIT: 1
33-
run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} .
45+
run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} --build-arg BUILD_CUDA_TESTS=1 .
3446

3547
- name: Test
3648
run: |
37-
docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy"
49+
docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy && multipy/runtime/build/test_deploy_gpu"
3850
3951
- name: Examples
4052
run: |

Diff for: Dockerfile

+12-14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG BASE_IMAGE=nvidia/cuda:11.3.1-devel-ubuntu18.04
1+
ARG BASE_IMAGE=nvidia/cuda:11.6.1-devel-ubuntu18.04
22

33
FROM ${BASE_IMAGE} as dev-base
44

@@ -59,13 +59,17 @@ COPY .git .git
5959
COPY .gitmodules .gitmodules
6060
COPY multipy multipy
6161
COPY compat-requirements.txt compat-requirements.txt
62+
COPY setup.py setup.py
63+
COPY README.md README.md
64+
COPY dev-requirements.txt dev-requirements.txt
6265

6366
RUN git submodule update --init --recursive --jobs 0
6467

6568
# Install conda/pyenv + necessary python dependencies
6669
FROM dev-base as conda-pyenv
6770
ARG PYTHON_MAJOR_VERSION=3
6871
ARG PYTHON_MINOR_VERSION=8
72+
ARG BUILD_CUDA_TESTS=0
6973
ENV PYTHON_MINOR_VERSION=${PYTHON_MINOR_VERSION}
7074
ENV PYTHON_VERSION=${PYTHON_MAJOR_VERSION}.${PYTHON_MINOR_VERSION}
7175
RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
@@ -75,7 +79,7 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
7579
rm ~/miniconda.sh && \
7680
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} mkl mkl-include conda-build pyyaml numpy ipython && \
7781
/opt/conda/bin/conda install -y -c conda-forge libpython-static=${PYTHON_VERSION} && \
78-
/opt/conda/bin/conda install -y pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch-nightly && \
82+
/opt/conda/bin/conda install -y pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch-nightly -c nvidia && \
7983
/opt/conda/bin/conda clean -ya; \
8084
else \
8185
pip3 install virtualenv && \
@@ -84,29 +88,23 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
8488
~/.pyenv/bin/pyenv install --force 3.7.10 && \
8589
virtualenv -p ~/.pyenv/versions/3.7.10/bin/python3 ~/venvs/multipy && \
8690
source ~/venvs/multipy/bin/activate && \
87-
pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113; \
91+
pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu116; \
8892
fi
8993

90-
# Build/Install pytorch with post-cxx11 ABI
9194
FROM conda-pyenv as build
92-
WORKDIR /opt/multipy/multipy/runtime/third-party/pytorch
9395
COPY --from=conda-pyenv /opt/conda* /opt/conda
9496
COPY --from=submodule-update /opt/multipy /opt/multipy
9597

9698
WORKDIR /opt/multipy
9799

98100
# Build Multipy
99-
RUN rm -r multipy/runtime/build; mkdir multipy/runtime/build && \
100-
cd multipy/runtime/build && \
101-
if [[ ${PYTHON_MINOR_VERSION} -lt 8 ]]; then \
102-
source ~/venvs/multipy/bin/activate && \
103-
cmake -DLEGACY_PYTHON_PRE_3_8=ON ..; \
101+
RUN ls && pwd && rm -rf multipy/runtime/build && \
102+
if [[ ${BUILD_CUDA_TESTS} -eq 1 ]]; then \
103+
python -m pip install -e . --install-option="--cudatests"; \
104104
else \
105-
cmake -DLEGACY_PYTHON_PRE_3_8=OFF ..; \
105+
python -m pip install -e .; \
106106
fi && \
107-
cmake --build . --config Release -j && \
108-
cmake --install . --prefix "." && \
109-
cd ../example && python generate_examples.py
107+
python multipy/runtime/example/generate_examples.py
110108

111109
# Build examples
112110
COPY examples examples

Diff for: setup.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def get_cmake_version():
2828

2929

3030
class MultipyRuntimeCmake(object):
31-
user_options = [("cmakeoff", None, None), ("abicxx", None, None)]
31+
user_options = [("cmakeoff", None, None), ("cudatests", None, None), ("abicxx", None, None)]
3232

3333

3434
class MultipyRuntimeDevelop(MultipyRuntimeCmake, develop):
@@ -41,24 +41,28 @@ def initialize_options(self):
4141
# TODO(tristanr): remove once unused
4242
self.abicxx = None
4343

44+
self.cudatests = None
4445
def finalize_options(self):
4546
develop.finalize_options(self)
4647
if self.cmakeoff is not None:
4748
self.distribution.get_command_obj("build_ext").cmake_off = True
49+
if self.cudatests is not None:
50+
self.distribution.get_command_obj("build_ext").cuda_tests_flag = "ON"
4851

4952

5053
class MultipyRuntimeBuild(MultipyRuntimeCmake, build_ext):
5154
user_options = build_ext.user_options + MultipyRuntimeCmake.user_options
5255
cmake_off = False
56+
cuda_tests_flag = "OFF"
5357

5458
def run(self):
5559
if self.cmake_off:
5660
return
5761
try:
5862
cmake_version_comps = get_cmake_version().split(".")
59-
if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "19":
63+
if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "12":
6064
raise RuntimeError(
61-
"CMake 3.19 or later required for multipy runtime installation."
65+
"CMake 3.12 or later required for multipy runtime installation."
6266
)
6367
except OSError:
6468
raise RuntimeError(
@@ -74,7 +78,7 @@ def run(self):
7478
print(f"-- Running multipy runtime makefile in dir {build_dir_abs}")
7579
try:
7680
subprocess.run(
77-
[f"cmake -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
81+
[f"cmake -DBUILD_CUDA_TESTS={self.cuda_tests_flag} -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
7882
cwd=build_dir_abs,
7983
shell=True,
8084
check=True,

0 commit comments

Comments
 (0)