Skip to content

Fixed GPU tests exec scripts and failing metrics (#3301) #6427

Fixed GPU tests exec scripts and failing metrics (#3301)

Fixed GPU tests exec scripts and failing metrics (#3301) #6427

Workflow file for this run

name: Run TPU tests
on:
push:
branches:
- master
- "*.*.*"
paths:
- "ignite/**"
- "tests/ignite/**"
- "tests/run_tpu_tests.sh"
- "tests/run_code_style.sh"
- "requirements-dev.txt"
- ".github/workflows/tpu-tests.yml"
pull_request:
paths:
- "ignite/**"
- "tests/ignite/**"
- "tests/run_tpu_tests.sh"
- "tests/run_code_style.sh"
- "requirements-dev.txt"
- ".github/workflows/tpu-tests.yml"
workflow_dispatch:
concurrency:
# <workflow_name>-<branch_name>-<true || commit_sha (if branch is protected)>
group: tpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
cancel-in-progress: true
jobs:
tpu-tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
xla-version: [nightly]
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"
architecture: "x64"
- name: Get year & week number
id: get-date
run: echo "date=$(/bin/date "+%Y-%U")" >> $GITHUB_OUTPUT
shell: bash -l {0}
- name: Get pip cache dir
id: pip-cache
run: |
pip3 install -U "pip<24"
echo "pip_cache=$(pip cache dir)" >> $GITHUB_OUTPUT
shell: bash -l {0}
- uses: actions/cache@v3
with:
path: |
${{ steps.pip-cache.outputs.pip_cache }}
key: ${{ steps.get-date.outputs.date }}-pytorch-${{ runner.os }}-${{ matrix.xla-version }}-${{ hashFiles('requirements-dev.txt') }}
restore-keys: |
${{ steps.get-date.outputs.date }}-pytorch-${{ runner.os }}-${{ matrix.xla-version }}-
- name: Install Torch XLA and others
run: |
## Install mkl (alternative approach to https://github.com/pytorch/xla/blob/b0ba29f98a695671972d4a4cc07441014dba2892/.kokoro/common.sh#L31-L32)
sudo apt-get update && sudo apt-get install -y libopenblas-dev libomp5
pip install mkl==2021.4.0
## Install torch & xla and torchvision
pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl
# Check installation
python -c "import torch"
## Install test deps and Ignite
pip install -r requirements-dev.txt
python setup.py install
# Download MNIST: https://github.com/pytorch/ignite/issues/1737
# to "/tmp" for tpu tests
- name: Download MNIST
uses: pytorch-ignite/download-mnist-github-action@master
with:
target_dir: /tmp
- name: Run Tests
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 25
shell: bash
command: |
python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
bash tests/run_tpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
env:
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
XRT_WORKERS: "localservice:0;grpc://localhost:40934"
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: ./coverage.xml
flags: tpu
fail_ci_if_error: false