Commit

new 2407
guolinke committed Feb 25, 2025
1 parent c679ff7 commit f838ff9
Showing 5 changed files with 16 additions and 11 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/docker_rdma_latest.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - main
+      - 2407
 
 jobs:
   docker:
@@ -56,8 +57,9 @@ jobs:
       name: Build and push with rdma
       uses: docker/build-push-action@v6
       with:
-        context: ./docker/rdma/
+        context: .
+        file: ./docker/rdma/Dockerfile
         push: true
         tags: |
-          dptechnology/unicore:latest-2407-pytorch2.4.0-cuda12.5-rdma
-          dp-ve-registry-cn-beijing.cr.volces.com/dplc/unicore:latest-2407-pytorch2.4.0-cuda12.5-rdma
+          dptechnology/unicore:2407-pytorch2.4.0-cuda12.5-rdma-tmp
+          dp-ve-registry-cn-beijing.cr.volces.com/dplc/unicore:2407-pytorch2.4.0-cuda12.5-rdma-tmp
14 changes: 8 additions & 6 deletions docker/rdma/Dockerfile
@@ -1,5 +1,9 @@
 FROM nvcr.io/nvidia/pytorch:24.07-py3
 
+WORKDIR /app
+# copy code to /app
+COPY . /app
+
 RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
     rm -rf /var/lib/apt/lists/* \
            /etc/apt/sources.list.d/cuda.list \
@@ -55,13 +59,11 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
 # # pytorch
 # # ------------------------------------------------------------------
 ENV TORCH_CUDA_ARCH_LIST "7.0;7.5;8.0;9.0"
-RUN pip3 install --no-cache-dir --upgrade sentry-sdk requests ninja typing packaging wandb rdkit tokenizers lmdb ml-collections tensorboardX && rm -rf ~/.cache/pip
+RUN pip3 install --no-cache-dir --upgrade sentry-sdk requests ninja typing packaging wandb rdkit ase tokenizers lmdb ml-collections tensorboardX && rm -rf ~/.cache/pip
 
-RUN cd /tmp && \
-    git clone https://github.com/dptech-corp/Uni-Core && \
-    cd Uni-Core && \
-    python setup.py install --enable-cuda-ext && \
-    rm -rf /tmp/* && rm -rf ~/.cache/pip
+# install unicore
+RUN python setup.py install --enable-cuda-ext && \
+    rm -rf /app/* && rm -rf ~/.cache/pip
 
 RUN pip3 install --no-cache-dir biopython timeout-decorator urllib3 tree dm-tree && rm -rf ~/.cache/pip
 
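
The new install step builds Uni-Core from the source tree copied into /app and passes --enable-cuda-ext to setup.py, with TORCH_CUDA_ARCH_LIST constraining which GPU architectures the kernels are compiled for. As a rough, hypothetical sketch of how a setup.py can consume such a flag to enable optional CUDA extensions (the package, extension, and source names below are placeholders, not Uni-Core's actual layout):

    # Hypothetical setup.py sketch: a custom --enable-cuda-ext flag toggles
    # CUDA extension builds. Names and paths are placeholders.
    import sys
    from setuptools import setup, find_packages

    ext_modules = []
    cmdclass = {}
    if "--enable-cuda-ext" in sys.argv:
        sys.argv.remove("--enable-cuda-ext")
        from torch.utils.cpp_extension import BuildExtension, CUDAExtension
        ext_modules.append(
            CUDAExtension(
                name="example_cuda_ops",           # placeholder extension name
                sources=["csrc/example_ops.cu"],   # placeholder source file
            )
        )
        cmdclass["build_ext"] = BuildExtension

    setup(
        name="example-package",  # placeholder package name
        packages=find_packages(),
        ext_modules=ext_modules,
        cmdclass=cmdclass,
    )
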
2 changes: 1 addition & 1 deletion unicore/checkpoint_utils.py
@@ -248,7 +248,7 @@ def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=True):
     """
     local_path = path
     with open(local_path, "rb") as f:
-        state = torch.load(f, map_location=torch.device("cpu"))
+        state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
 
     if "args" in state and state["args"] is not None and arg_overrides is not None:
         args = state["args"]
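
For context on the explicit weights_only=False: recent PyTorch releases (2.6 and later) default torch.load to weights_only=True, which rejects checkpoints that contain arbitrary pickled objects such as the "args" namespace handled in the hunk above. A minimal, illustrative sketch (the helper name is made up, not Uni-Core API) of loading a full checkpoint dictionary on such a PyTorch build:

    import torch

    def load_full_checkpoint(path):  # illustrative helper, not part of Uni-Core
        with open(path, "rb") as f:
            # weights_only=False restores arbitrary pickled objects (e.g. the
            # training args namespace); only use it on trusted checkpoint files.
            return torch.load(f, map_location=torch.device("cpu"), weights_only=False)
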
2 changes: 1 addition & 1 deletion unicore/distributed/utils.py
@@ -491,7 +491,7 @@ def _broadcast_object_slow(
     buffer = torch.ByteTensor(int(length.item())).to(dist_device)
     broadcast(buffer, src=src_rank, group=group)
     buffer = io.BytesIO(buffer.cpu().numpy())
-    obj = torch.load(buffer, map_location="cpu")
+    obj = torch.load(buffer, map_location="cpu", weights_only=False)
     return obj
 
 
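
The same weights_only=False treatment applies in _broadcast_object_slow, which pickles an object on the source rank, broadcasts its byte length, broadcasts the raw bytes, and unpickles on the receiving ranks. A simplified sketch of that pattern (assuming a gloo/CPU process group; the function name and structure are illustrative, not the library's exact implementation):

    import io
    import torch
    import torch.distributed as dist

    def broadcast_object_slow(obj, src=0):  # illustrative, simplified version
        if dist.get_rank() == src:
            buf = io.BytesIO()
            torch.save(obj, buf)                       # pickle the object
            data = torch.ByteTensor(list(buf.getbuffer()))
            length = torch.LongTensor([data.numel()])
        else:
            length = torch.LongTensor([0])
        dist.broadcast(length, src=src)                # tell every rank the size
        if dist.get_rank() != src:
            data = torch.ByteTensor(int(length.item()))
        dist.broadcast(data, src=src)                  # send the raw bytes
        if dist.get_rank() != src:
            obj = torch.load(io.BytesIO(data.cpu().numpy()),
                             map_location="cpu", weights_only=False)
        return obj
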
1 change: 1 addition & 0 deletions unicore_cli/train.py
@@ -12,6 +12,7 @@
 import logging
 import math
 import os
+import time
 import sys
 from typing import Dict, Optional, Any, List, Tuple, Callable
 
