continue-chain.sh

#!/bin/bash

# Pick a file and run it.  This mostly implements a naming convention
# to simplify running several chains in parallel.  It expects that all
# of the jobs are going to be run in a single directory.  The script
# works fine with a single chain, but is intended to handle lots (and
# lots) of parallel chains.  The intended use cases are when you want
# to run many jobs in sequence without starting jobs by hand, or when
# you have access to a cluster that can run several parallel chains
# and you want to continue them as new jobs can be run.
#
#   continue-chain.sh [-N] [-D output-directory] basefile -- command
#
#   Option:
#
#     -N -- Start a new chain.  If the base file doesn't exist, then
#        :INPUT: will be set to "none".
#
#     -D <dir> -- Specify an output directory
#
#   Arguments:
#
#     basefile -- This is the name of a starting point for the chain.
#        It can be a file that has been generated by TSimpleMCMC, and
#        which contains the initial burn-in of the chain.  It is
#        typically going to be named something like [prefix].root
#        where the prefix is chosen by the user.  The file doesn't
#        need to exist.  If the file doesn't exist, then the ".root"
#        extension is optional.
#
#     command -- The command to be run.  This is everything after the
#        first double dash ("--") and should include all of the
#        appropriate options.  The output command will be built by
#        replacing ":INPUT:" with the intended input filename, and
#        ":OUTPUT:" with the intended output file name.
#
# This is usually run as part of a batch job (e.g. in a slurm script),
# but here is an example of how it might be run by hand.
#
# # Make three starting point files
#
# $ continue-chain.sh -N SimpleMCMC.root -- mcmc.exe 5 10000 :OUTPUT: :INPUT:
# $ continue-chain.sh -N SimpleMCMC.root -- mcmc.exe 5 10000 :OUTPUT: :INPUT:
# $ continue-chain.sh -N SimpleMCMC.root -- mcmc.exe 5 10000 :OUTPUT: :INPUT:
#
# Run three more times and continue the chains.  Notice the lack
# of "-N" option to continue-chain.sh, and that the command
# remains unchanged.
#
# $ for i in 1 2 3; do
# $   continue-chain.sh SimpleMCMC.root -- mcmc.exe 5 10000 :OUTPUT: :INPUT:
# $ done
#
# This produced output like
#
# $ ls
# SimpleMCMC_001_4800f386_4daeb417.closed.root
# SimpleMCMC_001_c72d6f54_6b8553dd.closed.root
# SimpleMCMC_001_6db1a3aa_c34e97cb.closed.root
# SimpleMCMC_002_4daeb417_abf3ef80.open.root
# SimpleMCMC_002_6b8553dd_d3e878a8.open.root
# SimpleMCMC_002_c34e97cb_ffd79c45.open.root
# $
#
# The important parts of the names are the "SimpleMCMC" which comes
# from the name of the input file on the command line.  It's the first
# input file for the chain.  The other important part is the first
# number "000" to "002" which gives the epoch of the file.  The 000
# epoch is just a symbolic link to the original input file
# (i.e. SimpleMCMC.root) when that file exists; when it does not, the
# 000 epoch is an empty placeholder that is automatically deleted.
#
# Depending on the number of steps in each file, and the input
# configuration, epoch 001 might be treated as burn-in and the other
# files (i.e. "*.closed.root" and "*.open.root") can be added together
# to form a posterior distribution.  The rest of the name is there to
# help with forensic debugging.
#
# The File naming convention is:
#     prefix_epoch_parent-md5_child-md5.suffix
#
#  Prefix -- based on what the user provides.  This is for different
#  types of chains.
#
#  Epoch -- The number of "epochs" between this file and the first
#  file of the particular chain.  This is 3 characters (decimal)
#
#  Parent -- The md5sum value for the parent file.  This is the file
#  before the current file in the chain.  This is 8 characters
#
#  Child -- The md5sum value for this file.  This is 8 characters
#
#  Suffix -- One of
#     .open.root    -- This is the last file in a chain and can be extended
#     .closed.root  -- This is not the last file in a chain. Don't extend it
#     .input.root   -- A file being used as input for a job.
#     .running.root -- An output file being generated by a job
#
# When started, this will rename one "[blah].open.root" to
# "[blah].closed.root" and link it to "[blah].input.root" and run the
# job with output to "[blah].running.root".  When the job finishes
# successfully "[blah].input.root" is removed and [blah].running.root
# is renamed to "[blah].open.root".  If the job fails,
# "[blah].closed.root" is renamed back to "[blah].open.root".
#

# Tri-state flag that decides whether this job starts a new chain:
#   maybe -- (default) start one only if no open chain file can be claimed
#   yes   -- a new chain is required (set by -N)
#   no    -- this job has already created its one new chain
MAKE_NEW_FILE="maybe"

#####################################
# Handle any input options.  getopt(1) normalizes the command line so
# that the options come first, followed by "--" and then the
# positional arguments (the base file and the command).
if ! TEMP=$(getopt -o 'ND:' -n "$0" -- "$@"); then
    echo "Error ..."
    exit 1
fi
eval set -- "$TEMP"
unset TEMP
while true; do
    case "$1" in
        '-N')
            # Force a new epoch 001 file to be started.
            echo Starting a new chain.
            MAKE_NEW_FILE=yes
            shift
            ;;
        '-D')
            # Resolve the output directory right away.  Stop if it
            # cannot be resolved: an empty DIRECTORY would otherwise be
            # silently replaced by the directory of the base file.
            if ! DIRECTORY=$(realpath -- "$2"); then
                echo "Cannot resolve output directory: $2"
                exit 1
            fi
            shift 2
            ;;
        '--')
            # End of the options.  What remains is the base file and
            # the command.
            shift
            break
            ;;
        *)
            # getopt should never produce anything else.  Stop instead
            # of looping forever on a word that is never shifted.
            echo "Unexpected option: $1"
            exit 1
            ;;
    esac
done

#####################################
# Get the base file name that provides the prefix.  It doesn't need to
# exist, but if it does, then it provides the starting point for the
# chain.
if [[ $# -lt 1 || -z $1 ]]; then
    echo "Usage: $0 [-N] [-D output-directory] basefile -- command"
    exit 1
fi
BASEFILE=$1
shift

BASENAME=$(basename -- "${BASEFILE}")

# Without an explicit output directory, the chain files are placed in
# the directory of the base file.
if [[ -z ${DIRECTORY} ]]; then
    DIRECTORY=$(realpath -- "$(dirname -- "${BASEFILE}")")
fi

# Require a searchable directory.  A bare "-x" test is also true for an
# executable regular file, and an unquoted empty value would slip
# through the test entirely.
if [[ ! -d ${DIRECTORY} || ! -x ${DIRECTORY} ]]; then
   echo Output directory does not exist.
   exit 1
fi

####################################
# Get the executable you want to run.  This is a good place to modify
# and hardcode which program is going to run (copy continue-chain.sh
# to a new name).
COMMAND="$*"

# Stop before any chain file is claimed when there is nothing to run.
if [[ -z ${COMMAND} ]]; then
    echo "Usage: $0 [-N] [-D output-directory] basefile -- command"
    exit 1
fi

####################################
# Build UNIQUE: 32 hexadecimal characters (an md5 digest) that
# identify this job.  The digest is taken over a UUID when
# /usr/bin/uuidgen is executable, and over the machine description
# plus the current time in nanoseconds otherwise.
if [[ ! -x /usr/bin/uuidgen ]]; then
    echo "Using nanoseconds since 1 Jan 1970 to make a unique string"
    UNIQUE=$(echo $(uname -a) $(date +%s%N) | md5sum)
else
    echo "Using UUID to make a unique string"
    UNIQUE=$(uuidgen | md5sum)
fi
# md5sum prints "<digest>  -"; keep only the 32 character digest.
UNIQUE=${UNIQUE:0:32}

###################################
# Find an input file to read.
#
# Each pass of the loop tries to claim one "*.open.root" file of this
# chain family.  When nothing can be claimed, a new epoch 000 file is
# created (only once) and the search is repeated.  On success ORIGINAL
# holds the claimed ".open.root" name and CLOSED holds the
# ".closed.root" name that the file has been renamed to.
ORIGINAL=missing
CLOSED=missing
INPUT=missing

# The chain prefix is the base name up to its first "_" with a trailing
# ".root" removed (e.g. "SimpleMCMC.root" becomes "SimpleMCMC").  The
# chain files are named "${PREFIX}_epoch_parent_child.suffix", so the
# search must use the same prefix that is used to create new files.
PREFIX=${BASENAME%%_*}
PREFIX=${DIRECTORY}/${PREFIX%.root}

while true; do
    # A forced new chain (MAKE_NEW_FILE is "yes") skips the search.
    if [[ ${MAKE_NEW_FILE} != yes ]]; then
        for file in "${PREFIX}"_*.open.root; do
            # Skip the literal pattern left behind by a glob without
            # matches, and anything that is not a usable file.
            [[ -f ${file} ]] || continue
            # Use mv to see if we can become the "owner" of the file.
            # This depends on linux mv being atomic for renaming files
            # within a single directory.  That's a slightly dangerous
            # assumption on distributed file systems, so this sleeps
            # for a moment after the move to try and let things
            # stabilize.
            TRY=${file%/*}/${UNIQUE}${file##*/}
            mv -- "${file}" "${TRY}" || true
            # Pause a moment
            sleep 1
            # Check if we got the file.  If we did, then move the
            # "open" input file to "closed" so it's marked as having
            # been used.
            if [[ -f ${TRY} ]]; then
                ORIGINAL=${file}
                CLOSED=${file%.open.root}.closed.root
                mv -- "${TRY}" "${CLOSED}"
                break
            fi
        done
    fi
    # The comparison with the initial value keeps an unrelated file
    # that happens to be named "missing" from being taken as the input.
    if [[ ${CLOSED} != missing && -f ${CLOSED} ]]; then
        # The input file was found, so all is good.
        break
    fi
    echo Check if we should make a new chain
    if [[ ${MAKE_NEW_FILE} != no ]]; then
        echo "No file to continue. Try to make a new chain."
        MAKE_NEW_FILE=yes
    fi
    # There isn't an existing open chain file.  Check if we can make a
    # new one or exit with an error.
    if [[ ${MAKE_NEW_FILE} != yes ]]; then
        echo You need to run this with -N first, or jobs need to finish.
        exit 1
    fi
    echo Making a new chain
    # Only one try at making a new chain.
    MAKE_NEW_FILE=no
    # Create an "open file" with epoch zero and try again.  The name
    # carries eight zeros in place of a parent checksum and the first
    # eight characters of UNIQUE in place of the child checksum.
    NEWFILE=${PREFIX}_000_00000000_${UNIQUE:0:8}.open.root
    if [[ -f ${BASEFILE} ]]; then
        # The basefile names an actual file, so make the "open" file a
        # symbolic link to it.
        if ! ln -s "$(realpath -- "${BASEFILE}")" "${NEWFILE}"; then
            echo "Unable to create ${NEWFILE}"
            exit 1
        fi
    else
        # The basefile doesn't exist so it is just providing a prefix.
        # Create an empty "open" file and flag it as fake.
        if ! touch -- "${NEWFILE}"; then
            echo "Unable to create ${NEWFILE}"
            exit 1
        fi
        FAKEFILE="yes"
        BASEFILE=""
    fi
    # Don't loop quickly.  A new "open" chain has been created by
    # either linking to the basefile, or creating a new fake chain
    # using touch.  Sleep before looking for the open chain to process
    # in case this is running on a distributed file system which might
    # take a "few milliseconds" to exchange the updated state.
    sleep 1
done

# A sanity check: the claimed input file has to exist.
if [[ ! -f ${CLOSED} ]]; then
    echo "The input file ${CLOSED} is missing!?!"
    exit 1
fi

################################################
# Parse the fields in the input file name.  The file is named
# "prefix_epoch_parent-md5_child-md5.closed.root".  Only the file name
# is split at the underscores so that a "_" in the directory part of
# the path cannot shift the fields.  The last variable of the read
# collects any unexpected extra fields.
IFS=_ read -r PREFIX OLDEPOCH GRANDPARENT PARENT _ <<< "${CLOSED##*/}"
# PREFIX keeps the directory since it is used to build the output names.
PREFIX=$(dirname -- "${CLOSED}")/${PREFIX}
# Drop the suffix (everything from the first dot) from the last field.
PARENT=${PARENT%%.*}

# Check that the input file is properly named!  The parsed fields have
# to reproduce the name of the closed file, and the epoch has to be a
# decimal number because it is incremented later.
CHECK=${PREFIX}_${OLDEPOCH}_${GRANDPARENT}_${PARENT}
if [[ ! ${OLDEPOCH} =~ ^[0-9]+$ || ! -f ${CHECK}.closed.root ]]; then
    echo Problem with input file: ${CLOSED}
    echo Parsed fields: ${CHECK}
    echo From original file: ${ORIGINAL}
    exit 1
fi

################################################
# Generate the location of the temporary input file by swapping the
# ".closed.root" suffix for ".input.root".  Only the suffix is touched.
INPUT=${CLOSED%.closed.root}.input.root

# Link the final (closed) location to the temporary input name
if [[ -f ${CLOSED} ]]; then
    ln -s "${CLOSED}" "${INPUT}"
fi

# Check if the input file is a real file and remove it if it isn't.
# FAKEFILE only says that this job created an empty placeholder.  The
# file that was claimed afterwards can be a different one, so the
# claimed file is removed only when it is empty as well.  A file with
# content is kept and used as the input.
if [[ ${FAKEFILE} == yes ]]; then
    if [[ -s ${CLOSED} ]]; then
        echo "Input file ${CLOSED} is not empty, so it is used as input."
    else
        rm -- "${CLOSED}"
        rm -- "${INPUT}"
        # The job is told that there is no input file.
        INPUT="none"
    fi
fi

# Generate the new epoch for the file: the old epoch plus one, as three
# zero padded decimal digits.  The leading zeros of the old epoch are
# removed first so that the value is not read as an octal number.
NEWEPOCH=${OLDEPOCH%%[!0]*}
NEWEPOCH=${OLDEPOCH#"${NEWEPOCH}"}
printf -v NEWEPOCH '%3.3d' $(( ${NEWEPOCH} + 1 ))

# Make the temporary name for the running file.  The checksum of the
# output is not known yet, so "XXXXXXXX" takes its place.
RUNNING="${PREFIX}_${NEWEPOCH}_${PARENT}_XXXXXXXX.running.root"

# Print a command template with the placeholders filled in.
#   $1 -- the template
#   $2 -- the text that replaces every ":INPUT:"
#   $3 -- the text that replaces every ":OUTPUT:"
# The replacement is literal text: characters like "%", "&" or "\" in
# a file name are copied unchanged.
expand_placeholders() {
    local line=$1
    line=${line//:INPUT:/"$2"}
    line=${line//:OUTPUT:/"$3"}
    printf '%s\n' "${line}"
}

# Report what is going to be run and build the final command line.
echo "#####################################################"
echo "Command:         " ${COMMAND}
echo "Original file:   " ${ORIGINAL}
echo "Temporary input: " ${INPUT}
echo "Temporary output:" ${RUNNING}
COMMAND_LINE=$(expand_placeholders "${COMMAND}" "${INPUT}" "${RUNNING}")
echo "Command Line:    " ${COMMAND_LINE}
echo "Start time:      " $(date)
echo "#####################################################"

#################################################
# Run the job!  Don't let it fail: a failing job must not leave a
# partial output file behind.  COMMAND_LINE is deliberately expanded
# without quotes so that it is split into the command and its
# arguments.
if time ${COMMAND_LINE}; then
    :
elif [[ -f ${RUNNING} ]]; then
    rm -- "${RUNNING}"
fi

echo "#####################################################"
echo "End time:        " $(date)

# Check if the output file was generated.  If there was no output,
# then reopen the "closed" input file.
if [[ ! -f ${RUNNING} ]]; then
    echo Missing output file: ${RUNNING}
    echo Original file: ${ORIGINAL}
    # Remove the temporary input link.  INPUT is the literal word
    # "none" when the chain started without an input file, and that
    # must not remove a file named "none" in the working directory.
    if [[ ${INPUT} != none ]] && [[ -f ${INPUT} || -L ${INPUT} ]]; then
        rm -- "${INPUT}"
    fi
    # Move the closed file back to the original file name (but don't
    # overwrite the original if it exists).
    if [[ ! -f ${ORIGINAL} && -f ${CLOSED} ]]; then
        mv -- "${CLOSED}" "${ORIGINAL}"
    fi
    exit 1
fi

# Find the checksum value for the new file (the first eight characters
# of its md5sum).
CHILD=$(md5sum -- "${RUNNING}" | cut -c 1-8)

# Make the final name for the file that was just run.
OPEN=${PREFIX}_${NEWEPOCH}_${PARENT}_${CHILD}.open.root
echo "New file:        " ${OPEN}

# Move the temporary output file to its final file name.  The result
# is remembered so that the input link is removed in either case.
mv -- "${RUNNING}" "${OPEN}"
MOVE_STATUS=$?

# Remove the temporary input file.  INPUT is the literal word "none"
# when the chain started without an input file, and that must not
# remove a file named "none" in the working directory.
if [[ ${INPUT} != none ]] && [[ -f ${INPUT} || -L ${INPUT} ]]; then
    rm -- "${INPUT}"
fi

# Report a failed rename instead of finishing as if all went well.
if [[ ${MOVE_STATUS} -ne 0 ]]; then
    echo "Unable to rename ${RUNNING} to ${OPEN}"
    exit 1
fi