Commit

new 2407
guolinke committed Feb 25, 2025
1 parent c679ff7 commit f838ff9
Showing 5 changed files with 16 additions and 11 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/docker_rdma_latest.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - main
+      - 2407
 
 jobs:
   docker:
@@ -56,8 +57,9 @@ jobs:
       name: Build and push with rdma
       uses: docker/build-push-action@v6
       with:
-        context: ./docker/rdma/
+        context: .
+        file: ./docker/rdma/Dockerfile
         push: true
         tags: |
-          dptechnology/unicore:latest-2407-pytorch2.4.0-cuda12.5-rdma
-          dp-ve-registry-cn-beijing.cr.volces.com/dplc/unicore:latest-2407-pytorch2.4.0-cuda12.5-rdma
+          dptechnology/unicore:2407-pytorch2.4.0-cuda12.5-rdma-tmp
+          dp-ve-registry-cn-beijing.cr.volces.com/dplc/unicore:2407-pytorch2.4.0-cuda12.5-rdma-tmp
14 changes: 8 additions & 6 deletions docker/rdma/Dockerfile
@@ -1,5 +1,9 @@
 FROM nvcr.io/nvidia/pytorch:24.07-py3
 
+WORKDIR /app
+# copy code to /app
+COPY . /app
+
 RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
     rm -rf /var/lib/apt/lists/* \
            /etc/apt/sources.list.d/cuda.list \
@@ -55,13 +59,11 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
 # # pytorch
 # # ------------------------------------------------------------------
 ENV TORCH_CUDA_ARCH_LIST "7.0;7.5;8.0;9.0"
-RUN pip3 install --no-cache-dir --upgrade sentry-sdk requests ninja typing packaging wandb rdkit tokenizers lmdb ml-collections tensorboardX && rm -rf ~/.cache/pip
+RUN pip3 install --no-cache-dir --upgrade sentry-sdk requests ninja typing packaging wandb rdkit ase tokenizers lmdb ml-collections tensorboardX && rm -rf ~/.cache/pip
 
-RUN cd /tmp && \
-    git clone https://github.com/dptech-corp/Uni-Core && \
-    cd Uni-Core && \
-    python setup.py install --enable-cuda-ext && \
-    rm -rf /tmp/* && rm -rf ~/.cache/pip
+# install unicore
+RUN python setup.py install --enable-cuda-ext && \
+    rm -rf /app/* && rm -rf ~/.cache/pip
 
 RUN pip3 install --no-cache-dir biopython timeout-decorator urllib3 tree dm-tree && rm -rf ~/.cache/pip
 
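
The new install step builds Uni-Core from the source tree copied into /app and passes --enable-cuda-ext to setup.py, with TORCH_CUDA_ARCH_LIST constraining which GPU architectures the kernels are compiled for. As a rough, hypothetical sketch of how a setup.py can consume such a flag to enable optional CUDA extensions (the package, extension, and source names below are placeholders, not Uni-Core's actual layout):

    # Hypothetical setup.py sketch: a custom --enable-cuda-ext flag toggles
    # CUDA extension builds. Names and paths are placeholders.
    import sys
    from setuptools import setup, find_packages

    ext_modules = []
    cmdclass = {}
    if "--enable-cuda-ext" in sys.argv:
        sys.argv.remove("--enable-cuda-ext")
        from torch.utils.cpp_extension import BuildExtension, CUDAExtension
        ext_modules.append(
            CUDAExtension(
                name="example_cuda_ops",           # placeholder extension name
                sources=["csrc/example_ops.cu"],   # placeholder source file
            )
        )
        cmdclass["build_ext"] = BuildExtension

    setup(
        name="example-package",  # placeholder package name
        packages=find_packages(),
        ext_modules=ext_modules,
        cmdclass=cmdclass,
    )
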
2 changes: 1 addition & 1 deletion unicore/checkpoint_utils.py
@@ -248,7 +248,7 @@ def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=True):
     """
     local_path = path
     with open(local_path, "rb") as f:
-        state = torch.load(f, map_location=torch.device("cpu"))
+        state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
 
     if "args" in state and state["args"] is not None and arg_overrides is not None:
         args = state["args"]
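
For context on the explicit weights_only=False: recent PyTorch releases (2.6 and later) default torch.load to weights_only=True, which rejects checkpoints that contain arbitrary pickled objects such as the "args" namespace handled in the hunk above. A minimal, illustrative sketch (the helper name is made up, not Uni-Core API) of loading a full checkpoint dictionary on such a PyTorch build:

    import torch

    def load_full_checkpoint(path):  # illustrative helper, not part of Uni-Core
        with open(path, "rb") as f:
            # weights_only=False restores arbitrary pickled objects (e.g. the
            # training args namespace); only use it on trusted checkpoint files.
            return torch.load(f, map_location=torch.device("cpu"), weights_only=False)
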
2 changes: 1 addition & 1 deletion unicore/distributed/utils.py
@@ -491,7 +491,7 @@ def _broadcast_object_slow(
     buffer = torch.ByteTensor(int(length.item())).to(dist_device)
     broadcast(buffer, src=src_rank, group=group)
     buffer = io.BytesIO(buffer.cpu().numpy())
-    obj = torch.load(buffer, map_location="cpu")
+    obj = torch.load(buffer, map_location="cpu", weights_only=False)
     return obj
 
 
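
The same weights_only=False treatment applies in _broadcast_object_slow, which pickles an object on the source rank, broadcasts its byte length, broadcasts the raw bytes, and unpickles on the receiving ranks. A simplified sketch of that pattern (assuming a gloo/CPU process group; the function name and structure are illustrative, not the library's exact implementation):

    import io
    import torch
    import torch.distributed as dist

    def broadcast_object_slow(obj, src=0):  # illustrative, simplified version
        if dist.get_rank() == src:
            buf = io.BytesIO()
            torch.save(obj, buf)                       # pickle the object
            data = torch.ByteTensor(list(buf.getbuffer()))
            length = torch.LongTensor([data.numel()])
        else:
            length = torch.LongTensor([0])
        dist.broadcast(length, src=src)                # tell every rank the size
        if dist.get_rank() != src:
            data = torch.ByteTensor(int(length.item()))
        dist.broadcast(data, src=src)                  # send the raw bytes
        if dist.get_rank() != src:
            obj = torch.load(io.BytesIO(data.cpu().numpy()),
                             map_location="cpu", weights_only=False)
        return obj
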
1 change: 1 addition & 0 deletions unicore_cli/train.py
@@ -12,6 +12,7 @@
 import logging
 import math
 import os
+import time
 import sys
 from typing import Dict, Optional, Any, List, Tuple, Callable
 
