From 6fcfafa28b02c87f0bdee5bdc1d2caee70707e39 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 27 Jun 2024 13:19:56 +0530 Subject: [PATCH] Update requirements, drop nan examples, fix nans in logging (#14) --- Dockerfile | 6 +++--- Dockerfile-notebook | 2 +- mlfoundry_utils.py | 10 +++++++++- requirements.txt | 6 +++--- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index e4fd816..58d0f0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ -# https://hub.docker.com/layers/winglian/axolotl/main-20240612-py3.11-cu121-2.3.0/images/sha256-798eed818fb11d24a640c0efbf27f65fbaebc1d9a5db210d585aa2a4328e93e1?context=explore -FROM --platform=linux/amd64 winglian/axolotl@sha256:aac52c92ab245793932a635e6dedf14a3a9fb009e40cdf16c10b715f1466afa8 +# https://hub.docker.com/layers/winglian/axolotl/main-20240626-py3.11-cu121-2.3.0/images/sha256-d157d1b80bfbbea689e9a4ea233d04bbc37f684f82e01d9dd6730dd0251e61fe?context=explore +FROM --platform=linux/amd64 winglian/axolotl@sha256:7945505e1651a474aa11ed4d70188ff5c5052e17f61bb5f60b956ad8f082328f USER root COPY requirements.txt /tmp/ RUN pip install -U pip wheel setuptools && \ @@ -9,7 +9,7 @@ RUN mkdir -p /packages && \ cd /packages && \ git clone https://github.com/truefoundry/axolotl && \ cd axolotl/ && \ - git checkout 5ba183d302ed1c91912555b76e423786acaccae8 + git checkout 99da242b9aee961acebaae99da8d615781f399e3 RUN cd /packages/axolotl/ && \ MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \ pip install --no-cache-dir -U -r /tmp/requirements.txt && \ diff --git a/Dockerfile-notebook b/Dockerfile-notebook index 71966c2..4b255a8 100644 --- a/Dockerfile-notebook +++ b/Dockerfile-notebook @@ -21,7 +21,7 @@ USER jovyan RUN cd /packages && \ git clone https://github.com/truefoundry/axolotl && \ cd axolotl/ && \ - git checkout 5ba183d302ed1c91912555b76e423786acaccae8 + git checkout 99da242b9aee961acebaae99da8d615781f399e3 RUN cd /packages/axolotl/ && \ MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \ pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt diff --git a/mlfoundry_utils.py b/mlfoundry_utils.py index 88c6f76..f08ece5 100644 --- a/mlfoundry_utils.py +++ b/mlfoundry_utils.py @@ -70,7 +70,9 @@ def log_model_to_mlfoundry( logger.warning("Python file in hf model cache in unknown path:", file_path) metadata.update({"huggingface_model_url": f"https://huggingface.co/{hf_hub_model_id}"}) - + metadata = { + k: v for k, v in metadata.items() if isinstance(v, (int, float, np.integer, np.floating)) and math.isfinite(v) + } run.log_model( name=model_name, model_file_or_folder=model_dir, @@ -171,6 +173,12 @@ def on_save(self, args, state, control, **kwargs): for log in state.log_history: if isinstance(log, dict) and log.get("step") == state.global_step: metadata = log.copy() + + metadata = { + k: v + for k, v in metadata.items() + if isinstance(v, (int, float, np.integer, np.floating)) and math.isfinite(v) + } self._run.log_artifact( name=self._checkpoint_artifact_name, artifact_paths=[(artifact_path,)], diff --git a/requirements.txt b/requirements.txt index 7e806c7..dc517d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ --extra-index-url https://download.pytorch.org/whl/cu121 cloud-files==4.15.2 -deepspeed @ git+https://github.com/truefoundry/DeepSpeed@1372f3d1937030f20283b6bfdb7209c55eb8a7bf +deepspeed @ git+https://github.com/microsoft/DeepSpeed@88b2ef71b3f2dfc42932cd2c097397f637ad77f4 pyarrow==15.0.0 rich>=13.0.0,<14 snowflake-connector-python[pandas]==3.7.0 torch==2.3.0+cu121 -truefoundry[ml]==0.2.4 -unsloth @ git+https://github.com/unslothai/unsloth@27fa021a7bb959a53667dd4e7cdb9598c207aa0d +truefoundry[ml]==0.2.8 +unsloth @ git+https://github.com/unslothai/unsloth@a558f22992813209ef9a369da8ef5163e9782258