Skip to content

Commit

Permalink
Stop everything on a NaN
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Nov 16, 2024
1 parent eac3b10 commit 2f1664f
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 2 deletions.
6 changes: 5 additions & 1 deletion pdelfin/beakerpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,10 @@ async def process_line(line):
nonlocal last_running_req, last_queue_req, can_release_automatically, last_semaphore_release
sglang_logger.info(line)

if "Detected errors during sampling" in line:
logger.error("Cannot continue, sampling errors detected, model is probably corrupt")
sys.exit(1)

match = re.search(r'#running-req: (\d+)', line)
if match:
last_running_req = int(match.group(1))
Expand Down Expand Up @@ -721,7 +725,7 @@ async def main():
# Beaker/job running stuff
parser.add_argument('--beaker', action='store_true', help='Submit this job to beaker instead of running locally')
parser.add_argument('--beaker_workspace', help='Beaker workspace to submit to', default='ai2/pdelfin')
parser.add_argument('--beaker_cluster', help='Beaker clusters you want to run on', default=["ai2/jupiter-cirrascale-2", "ai2/pluto-cirrascale", "ai2/saturn-cirrascale"])
parser.add_argument('--beaker_cluster', help='Beaker clusters you want to run on', default=["ai2/jupiter-cirrascale-2", "ai2/pluto-cirrascale", "ai2/saturn-cirrascale", "ai2/augusta-google-1"])
parser.add_argument('--beaker_gpus', type=int, default=1, help="Number of gpu replicas to run")
parser.add_argument('--beaker_priority', type=str, default="normal", help="Beaker priority level for the job")
args = parser.parse_args()
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "23"
_PATCH = "24"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
Expand Down

0 comments on commit 2f1664f

Please sign in to comment.