-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcontinue-chain.sh
executable file
·331 lines (306 loc) · 11.4 KB
/
continue-chain.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
#!/bin/bash
# Pick a file and run it. This mostly implements a naming convention
# to simplify running several chains in parallel. It expects that all
# of the jobs are going to be run in a single directory. The script
# works fine with a single chain, but is intended to handle lots (and
# lots) of parallel chains. The intended use cases are when you want
# to run many jobs in sequence without starting jobs by hand, or when
# you have access to a cluster that can run several parallel chains and
# you want to continue them as new jobs can be run.
#
# continue-chain.sh [-N] [-D output-directory] basefile -- command
#
# Option:
#
# -N -- Start a new chain. If the base file doesn't exist, then
# :INPUT: will be set to "none".
#
# -D <dir> -- Specify an output directory
#
# Arguments:
#
# basefile -- This is the name of a starting point for the chain.
# It can be a file that has been generated by TSimpleMCMC, and
# which contains the initial burn-in of the chain. It is
# typically going to be named something like [prefix].root
# where the prefix is chosen by the user. The file doesn't
# need to exist. If the file doesn't exist, then the ".root"
# extension is optional.
#
# command -- The command to be run. This is everything after the
# first double dash ("--") and should include all of the
# appropriate options. The output command will be built by
# replacing ":INPUT:" with the intended input file name, and
# ":OUTPUT:" with the intended output file name.
#
# This is usually run as part of a batch job (e.g. in a slurm script),
# but here is an example of how it might be run by hand.
#
# # Make three starting point files
#
# $ continue-chain.sh -N SimpleMCMC.root -- mcmc.exe 5 10000 :OUTPUT: :INPUT:
# $ continue-chain.sh -N SimpleMCMC.root -- mcmc.exe 5 10000 :OUTPUT: :INPUT:
# $ continue-chain.sh -N SimpleMCMC.root -- mcmc.exe 5 10000 :OUTPUT: :INPUT:
#
# Run three more times and continue the chains. Notice the lack
# of the "-N" option to continue-chain.sh, and that the command
# remains unchanged.
#
# $ for i in 1 2 3; do
# $ continue-chain.sh SimpleMCMC.root -- mcmc.exe 5 10000 :OUTPUT: :INPUT:
# $ done
#
# This produced output like
#
# $ ls
# SimpleMCMC_001_4800f386_4daeb417.closed.root
# SimpleMCMC_001_c72d6f54_6b8553dd.closed.root
# SimpleMCMC_001_6db1a3aa_c34e97cb.closed.root
# SimpleMCMC_002_4daeb417_abf3ef80.open.root
# SimpleMCMC_002_6b8553dd_d3e878a8.open.root
# SimpleMCMC_002_c34e97cb_ffd79c45.open.root
# $
#
# The important part of the names is the "SimpleMCMC" which comes
# from the name of the input file on the command line. It's the first
# input file for the chain. The other important part is the first
# number "000" to "002" which gives the epoch of the file. The 000
# epoch is just a symbolic link to the original input file
# (i.e. SimpleMCMC.root), and is automatically deleted.
#
# Depending on the number of steps in each file, and the input
# configuration, epoch 001 might be treated as burn-in and the other
# files (i.e. "*.closed.root" and "*.open.root") can be added together
# to form a posterior distribution. The rest of the name is there to
# help with forensic debugging.
#
# The File naming convention is:
# prefix_epoch_parent-md5_child-md5.suffix
#
# Prefix -- based on what the user provides. This is for different
# types of chains.
#
# Epoch -- The number of "epochs" between this file and the first
# file of the particular chain. This is 3 characters (decimal)
#
# Parent -- The md5sum value for the parent file. This is the file
# before the current file in the chain. This is 8 characters
#
# Child -- The md5sum value for this file. This is 8 characters
#
# Suffix -- One of
# .open.root -- This is the last file in a chain and can be extended
# .closed.root -- This is not the last file in a chain. Don't extend it
# .input.root -- A file being used as input for a job.
# .running.root -- An output file being generated by a job
#
# When started, this will rename one "[blah].open.root" to
# "[blah].closed.root" and link it to "[blah].input.root" and run the
# job with output to "[blah].running.root". When the job finishes
# successfully "[blah].input.root" is removed and [blah].running.root
# is renamed to "[blah].open.root". If the job fails,
# "[blah].closed.root" is renamed back to "[blah].open.root".
#
# Tri-state flag controlling creation of a new chain.  It starts as
# "maybe", becomes "yes" when -N is given, and the input search further
# down sets it to "no" once one attempt at creating a chain has been made.
MAKE_NEW_FILE="maybe"
#####################################
# Handle any input options
#
# getopt(1) normalizes the command line into "[options] -- [arguments]",
# which also removes the user's own "--" in front of the command.
if ! TEMP=$(getopt -o 'ND:' -n "$0" -- "$@"); then
  echo "Error ..."
  exit 1
fi
# getopt emits a shell-quoted string, so it has to be re-parsed with eval
# to restore the positional parameters.
eval set -- "$TEMP"
unset TEMP
while true; do
  case "$1" in
    # Force a new epoch 001 file to be started.
    '-N')
      echo Starting a new chain.
      MAKE_NEW_FILE=yes
      shift
      ;;
    # Explicit output directory.  Stop here if it cannot be resolved:
    # an empty DIRECTORY would silently fall back to the directory of
    # the base file.
    '-D')
      shift
      if ! DIRECTORY=$(realpath -- "$1"); then
        echo "Output directory does not exist." >&2
        exit 1
      fi
      shift
      ;;
    # End of options; what remains is the base file and the command.
    '--')
      shift
      break
      ;;
    # getopt should never emit anything else; bail out instead of
    # spinning forever in this loop.
    *)
      echo "Unexpected option: $1" >&2
      exit 1
      ;;
  esac
done
#####################################
# Get the base file name that provides the prefix. It doesn't need to
# exist, but if it does, then it provides the starting point for the
# chain.
# The base file argument is mandatory.  Without it every name derived
# below would be empty, so stop with a usage message.
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "Usage: $0 [-N] [-D output-directory] basefile -- command" >&2
  exit 1
fi
BASEFILE=$1
shift
BASENAME=$(basename -- "${BASEFILE}")
# Without -D, the chain lives next to the base file.
if [[ -z "${DIRECTORY:-}" ]]; then
  DIRECTORY=$(realpath -- "$(dirname -- "${BASEFILE}")")
fi
# The output location must be a directory that can be searched.  (-x on
# its own would also accept a regular executable file.)
if [[ ! -d "${DIRECTORY}" || ! -x "${DIRECTORY}" ]]; then
  echo Output directory does not exist.
  exit 1
fi
####################################
# Get the executable you want to run. This is a good place to modify
# and hardcode which program is going to run (copy continue-chain.sh
# to a new name).
#
# Everything left on the command line is joined into one string; the
# ":INPUT:" and ":OUTPUT:" markers are substituted later.
COMMAND="$*"
####################################
# Make a (random) string that is unique to this job
# UNIQUE is a 32 character hex string (an md5 digest) used to build file
# names that only this job knows about.
#
# Probe for uuidgen the same way it is invoked (through PATH) so that an
# installation outside of /usr/bin is also used.
if command -v uuidgen > /dev/null 2>&1; then
  echo Using UUID to make a unique string
  UNIQUE=$(uuidgen | md5sum | cut -c 1-32)
else
  echo Using nanoseconds since 1 Jan 1970 to make a unique string
  # The process id is mixed in as well because "date +%N" is not
  # supported on every platform.
  UNIQUE=$(echo "$(uname -a) $(date +%s%N) $$" | md5sum | cut -c 1-32)
fi
###################################
# Find an input file to read.
ORIGINAL=missing
CLOSED=missing
INPUT=missing
# The chain prefix is the base name up to the first "_", with a trailing
# ".root" removed.  The same prefix is used both to search for open
# chain files and to create the epoch 000 file, so a chain started from
# "name.root" is found again by the search below.
CHAIN_NAME=${BASENAME%%_*}
CHAIN_NAME=${CHAIN_NAME%.root}
while true; do
  for file in "${DIRECTORY}/${CHAIN_NAME}"_*.open.root; do
    # When a new chain is being forced, don't pick up an existing one.
    if [[ "${MAKE_NEW_FILE}" == "yes" ]]; then
      break
    fi
    # An unmatched glob is left as the literal pattern: nothing to claim.
    if [[ ! -e "${file}" && ! -L "${file}" ]]; then
      continue
    fi
    # Use mv to see if we can become the "owner" of the file.
    # This depends on linux mv being atomic for renaming files
    # within a single directory.  That's a slightly dangerous
    # assumption on distributed file systems, so this sleeps for a
    # moment after the move to try and let things stabilize.
    TRY="${file%/*}/${UNIQUE}${file##*/}"
    # Losing the race to another job is expected, so a failure here
    # is not an error.
    mv "${file}" "${TRY}" || true
    # Pause a moment
    sleep 1
    # Check if we got the file.  If we did, then move the "open"
    # input file to "closed" so it's marked as having been used.
    if [[ -f "${TRY}" ]]; then
      ORIGINAL=${file}
      CLOSED=${file%.open.root}.closed.root
      mv "${TRY}" "${CLOSED}"
      break
    fi
  done
  if [[ -f "${CLOSED}" ]]; then
    # The input file was found, so all is good.
    break
  fi
  echo Check if we should make a new chain
  # Anything other than "no" (i.e. "maybe" or "yes") permits one
  # attempt at creating a chain.
  if [[ "${MAKE_NEW_FILE}" != "no" ]]; then
    echo No file to continue. Try to make a new chain.
    MAKE_NEW_FILE=yes
  fi
  # There isn't an existing open chain file.  Check if we can make a
  # new one or exit with an error.
  if [[ "${MAKE_NEW_FILE}" != "yes" ]]; then
    echo You need to run this with -N first, or jobs need to finish.
    exit 1
  fi
  echo Making a new chain
  # Only one try at making a new chain.
  MAKE_NEW_FILE=no
  # Link the base file into an "open file" with epoch zero and try
  # again.  The base should be named "${PREFIX}[.root]".  If the
  # name contains "_" everything before the first one will become
  # the prefix.
  PREFIX=${DIRECTORY}/${CHAIN_NAME}
  NEWFILE=${PREFIX}_000_00000000_${UNIQUE:0:8}.open.root
  if [[ -f "${BASEFILE}" ]]; then
    # The basefile names an actual file, so link it to the "open" file.
    ln -s "$(realpath -- "${BASEFILE}")" "${NEWFILE}"
  else
    # The basefile doesn't exist so it is just providing a prefix.
    # Create an empty "open" file and flag it as fake.
    touch "${NEWFILE}"
    FAKEFILE="yes"
    BASEFILE=""
  fi
  # Don't loop quickly.  A new "open" chain has been created by
  # either linking to the basefile, or creating a new fake chain
  # using touch.  Sleep before looking for the open chain to process
  # in case this is running on a distributed file system which might
  # take a "few milliseconds" to exchange the updated state.
  sleep 1
done
# A sanity check.
if [[ ! -f "${CLOSED}" ]]; then
  echo "The input file ${CLOSED} is missing!?!"
  exit 1
fi
################################################
# Parse the fields in the input file name
# Split only the file name at "_" so that underscores in the directory
# part of the path cannot shift the fields.  The expected layout is
#   <prefix>_<epoch>_<parent-md5>_<child-md5>.closed.root
CLOSED_DIR=$(dirname -- "${CLOSED}")
CLOSED_NAME=$(basename -- "${CLOSED}")
IFS=_ read -r NAME_PREFIX OLDEPOCH GRANDPARENT PARENT _ <<< "${CLOSED_NAME}"
# PREFIX keeps the directory because the output names are built from it.
PREFIX=${CLOSED_DIR}/${NAME_PREFIX}
# Drop the suffix (everything from the first ".") from the last field.
PARENT=${PARENT%%.*}
# Check that the input file is properly named: reassembling the parsed
# fields must give back the name of the closed file.
CHECK=${PREFIX}_${OLDEPOCH}_${GRANDPARENT}_${PARENT}
if [[ ! -f "${CHECK}.closed.root" ]]; then
  echo "Problem with input file: ${CLOSED}"
  echo "Parsed fields: ${CHECK}"
  echo "From original file: ${ORIGINAL}"
  exit 1
fi
################################################
# Generate the location of the temporary input file.
# Swap the ".closed.root" suffix for ".input.root".  Only a suffix at the
# end of the name is replaced.
INPUT=${CLOSED%.closed.root}.input.root
# Link the final (closed) location to the temporary input name
if [[ -f "${CLOSED}" ]]; then
  ln -s -- "${CLOSED}" "${INPUT}"
fi
# Check if the input file is a real file and remove it if it isn't.
# A fake file is the empty placeholder created with touch when the base
# file does not exist; the job is then told that its input is "none".
# NOTE(review): FAKEFILE records that this job created a placeholder, not
# that the file claimed above is that placeholder -- confirm that another
# chain's open file can never be claimed on this path.
if [[ "${FAKEFILE:-}" == "yes" ]]; then
  rm -- "${CLOSED}"
  rm -- "${INPUT}"
  INPUT="none"
fi
# Generate the new epoch for the file.  The "10#" prefix forces decimal
# so that zero padded values such as "008" are not read as octal.
printf -v NEWEPOCH '%3.3d' "$((10#${OLDEPOCH:-0} + 1))"
# Make the temporary name for the running file
RUNNING=${PREFIX}_${NEWEPOCH}_${PARENT}_XXXXXXXX.running.root
echo "#####################################################"
echo "Command: " "${COMMAND}"
echo "Original file: " "${ORIGINAL}"
echo "Temporary input: " "${INPUT}"
echo "Temporary output:" "${RUNNING}"
# Fill in the ":INPUT:" and ":OUTPUT:" markers.  Plain string replacement
# is used so that characters in the file names (spaces, "%", "&", "\")
# are never interpreted as part of a sed expression.
COMMAND_LINE=${COMMAND//:INPUT:/"${INPUT}"}
COMMAND_LINE=${COMMAND_LINE//:OUTPUT:/"${RUNNING}"}
echo "Command Line: " "${COMMAND_LINE}"
echo "Start time: " $(date)
echo "#####################################################"
#################################################
# Run the job!  Don't let it fail
#
# The command line is deliberately left unquoted so that it is split
# into the program and its arguments.  If the job fails, any partial
# output is removed so that it is not mistaken for a finished file.
time ${COMMAND_LINE} || {
  if [[ -f "${RUNNING}" ]]; then
    rm -- "${RUNNING}"
  fi
}
echo "#####################################################"
echo "End time: " $(date)
# Check if the output file was generated. If there was no output,
# then reopen the "closed" input file.
if [[ ! -f "${RUNNING}" ]]; then
  echo "Missing output file: ${RUNNING}"
  echo "Original file: ${ORIGINAL}"
  # Drop the temporary input link (also when it is left dangling).
  if [[ -f "${INPUT}" || -L "${INPUT}" ]]; then
    rm -- "${INPUT}"
  fi
  # Move the closed file back to the original file name (but don't
  # overwrite original if it exists) so the chain can be continued by
  # a later job.
  if [[ ! -f "${ORIGINAL}" && -f "${CLOSED}" ]]; then
    mv -- "${CLOSED}" "${ORIGINAL}"
  fi
  exit 1
fi
# Find the checksum value for the new file.  Only the first 8 characters
# of the md5sum are used in the file name.
CHILD=$(md5sum -- "${RUNNING}" | cut -c 1-8)
# Make the final name for the file that was just run.
OPEN=${PREFIX}_${NEWEPOCH}_${PARENT}_${CHILD}.open.root
echo "New file: " "${OPEN}"
# Move the temporary output file to its final file name.  A failure is
# remembered so that the job does not report success while the chain
# has no open file.
RENAME_STATUS=0
mv -- "${RUNNING}" "${OPEN}" || RENAME_STATUS=$?
# Remove the temporary input file
if [[ -f "${INPUT}" || -L "${INPUT}" ]]; then
  rm -- "${INPUT}"
fi
if [[ "${RENAME_STATUS}" -ne 0 ]]; then
  echo "Failed to rename ${RUNNING} to ${OPEN}" >&2
  exit 1
fi