From c49e4ff1e13cccd039315bd6700af56ce641042c Mon Sep 17 00:00:00 2001 From: Francesco Tabaro Date: Tue, 12 Mar 2024 17:40:31 +0100 Subject: [PATCH] fix: switch to wget and write some logging when downloading genome annotations --- workflow/scripts/download-gtf.sh | 40 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/workflow/scripts/download-gtf.sh b/workflow/scripts/download-gtf.sh index 9e11109..28ff9a6 100644 --- a/workflow/scripts/download-gtf.sh +++ b/workflow/scripts/download-gtf.sh @@ -4,31 +4,35 @@ set -e URL="${snakemake_params[url]}" +mkdir -p $(dirname ${snakemake_log}) +touch ${snakemake_log} + if [[ $URL == *.gz ]]; then TMP=$(mktemp -u --suffix .gz) + echo "$URL refers to gzipped file. Temp file: $TMP" >> ${snakemake_log} else TMP=$(mktemp -u) + echo "$URL does not refer to gzipped file. Temp file: $TMP" >> ${snakemake_log} fi -echo "Downloading to $TMP" | tee -a ${snakemake_log} +echo "Downloading to $TMP" >> ${snakemake_log} OUTPUT=${snakemake_output} mkdir -pv $(dirname $OUTPUT) -# wget -O $TMP "$URL" -curl "$URL" \ --H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0' \ --H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' \ --H 'Accept-Language: en-US,en;q=0.5' \ --H 'Accept-Encoding: gzip, deflate' \ --H 'Connection: keep-alive' \ --H 'Upgrade-Insecure-Requests: 1' \ --H 'DNT: 1' \ --H 'Sec-GPC: 1' \ --H 'Pragma: no-cache' \ --H 'Cache-Control: no-cache' \ ---silent \ ---output $TMP +wget "$URL" \ +--user-agent=' Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0' \ +--header='Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' \ +--header='Accept-Language: en-US,en;q=0.5' \ +--header='Accept-Encoding: gzip, deflate' \ +--header='Connection: keep-alive' \ +--header='Upgrade-Insecure-Requests: 1' \ +--header='DNT: 1' \ +--header='Sec-GPC: 1' \ +--header='Pragma: no-cache' \ +--header='Cache-Control: no-cache' \ +--quiet \ +--output-document="$TMP" sleep $(( $RANDOM % 10 + 2 )) @@ -37,10 +41,10 @@ if [[ $URL == *.gz ]] && [[ ! $OUTPUT == *.gz ]]; then sleep $(( $RANDOM % 10 + 2 )) fi -if grep -v '#' "${TMP%.gz}" | head -n 1 | grep -q '^chr' | tee -a ${snakemake_log}; then - echo "Mv'ing to $OUTPUT" | tee -a ${snakemake_log} +if grep -v '#' "${TMP%.gz}" | head -n 1 | grep -q '^chr' >> ${snakemake_log}; then + echo "Mv'ing to $OUTPUT" >> ${snakemake_log} mv $TMP $OUTPUT else - echo "Adding \"chr\" to first column, then move to $OUTPUT" | tee -a ${snakemake_log} + echo "Adding \"chr\" to first column, then move to $OUTPUT" >> ${snakemake_log} awk -F "\t" -v OFS="\t" '!/^#/{print "chr"$0}/#/{print}' ${TMP%.gz} > $OUTPUT fi