bacpipe

#!/bin/bash 

set -eo pipefail

## This is the master script for the bacterial RNA-seq processing pipeline. 
## It reports the master log, determines if it's OK to proceed, and calls all the sub-scripts. 
## Computationally challenging tasks are parallelized; Parallel wrappers 
## are named mbc_prun_*.sh. These in turn call onto individual scripts. 

GRN='\033[1;32m'
GRN2='\033[0;32m'
UL='\033[4;34m'
NC='\033[0m' # No Color

if [[ $# < 2 && $1 != "clean" && $1 != "refclean" ]] 
then
  echo
  echo "Version:"
  echo "  v0.8.0, Alexander Predeus (predeus@gmail.com), 2020" 
  echo "Synopsis:"
  echo "  Process multistrain bacterial RNA-seq data using pre-created reference."
  echo "Usage:" 
  printf "  ${GRN}bacpipe${NC} ${GRN2}<working_directory> <config> [-p CPUs] [-k key_strain]${NC}\n"
  echo 
  echo "Positional arguments:"
  echo "  <working_dir>  Directory containing sub-directories named fastqs, study_strains, and ref_strains"
  echo "  <config>       Tab-separated file listing each sample and strain tag," 
  echo "                 as well as reference strain tags (in case of multi-strain processing)" 
  echo "Options:"
  echo "  -p [X]         Number of cores for parallel execution (default '4')"
  echo "  -k [X]         (multi-strain only) Key strain: use strain's locus tag when no gene name is assigned"
  echo  
  printf "See ${UL}github.com/apredeus/multi-bacpipe${NC} for more information.\n"
  echo 
  exit 1
elif [[ $1 == "clean" ]]
then 
  echo "Cleaning up the working directory..."
  rm -rf bams exp_tables FastQC featureCounts stats strand tdfs_and_bws
  exit 1
elif [[ $1 == "refclean" ]]
then 
  echo "Cleaning up previously generated reference files..."
  rm -rf roary orthologs.tsv 
  RS=`ls ref_strains/ | grep -v "\.fa" | grep -v "\.gff"`
  SS=`ls study_strains/ | grep -v "\.fa" | grep -v "\.bed"`
  for i in $RS
  do 
    rm -rf ref_strains/$i
  done 
  for i in $SS
  do
    rm -rf study_strains/$i
  done
  exit 1
fi

echo "=================================================================================="
echo "=================================================================================="
echo "===                                                                            ==="
echo "===                             Welcome to BACPIPE!                            ==="
echo "===      Version 0.7.0 - with multi-strain and simple reference support        ==="
echo "===          2018-20 (c)  Alexander Predeus, Jay Hinton Lab, Liverpool         ==="
echo "===  For more information, visit https://github.com/apredeus/multi-bacpipe     ==="
echo "===                        Publication in preparation.                         ==="
echo "===                                                                            ==="
echo "=================================================================================="
echo "=================================================================================="
echo
echo

WDIR=""
CONFIG="" 
CPUS=""
KEYSTR=""

! getopt --test > /dev/null
if [[ ${PIPESTATUS[0]} -ne 4 ]]; then
    >&2 echo "ERROR: \"getopt --test\" failed in this environment"
    >&2 echo "Please make sure you have the most up-to-date version of getopt!" 
    exit 1
fi

! PARSED=$(getopt --options=p:k: --name "$0" -- "$@")
if [[ ${PIPESTATUS[0]} -ne 0 ]]; then
  exit 2
fi

# read getopt’s output this way to handle the quoting right:
eval set -- "$PARSED"

while true; do
    case "$1" in
        -k)
            KEYSTR="$2" 
            shift 2
            ;;
        -p)
            CPUS="$2"
            shift 2
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "Programming error"
            exit 3
            ;;
    esac
done

WDIR=$1
CONFIG=$2

SDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

if [[ $WDIR == "" || $CONFIG == "" ]]
then
  >&2 echo "ERROR: You must specify a non-empty working directory and appropriate config file!"
  exit 1  
fi  

WDIR=`readlink -f $WDIR`
CONFIG=`readlink -f $CONFIG`
REFDIR=$WDIR/study_strains

if [[ $CPUS == "" ]]
then 
  echo "==> Parallel jobs will be ran on 4 cores (default)"
  CPUS=4
else 
  echo "==> Parallel jobs will be ran on $CPUS cores"
fi
  
MULTI=`grep -c "^Reference" $CONFIG || true`
if [[ $MULTI != 0 ]] 
then 
  echo "==> Initiating bacpipe run using MULTI-STRAIN workflow!"  
  echo "==> Following variables were set:"
  echo
  echo "      WDIR: $WDIR"
  echo "      SDIR: $SDIR"
  echo "    CONFIG: $CONFIG"
  echo "      CPUS: $CPUS"
  echo "    KEYSTR: $KEYSTR"
  echo
else 
  echo "==> Initiating bacpipe run using SINGLE STRAIN workflow!"  
  echo "==> Following variables were set:"
  echo
  echo "      WDIR: $WDIR"
  echo "      SDIR: $SDIR"
  echo "    CONFIG: $CONFIG"
  echo "      CPUS: $CPUS"
  echo
fi

##########################################################################################
###                                                                                    ###
###         all the parameters are set, all sanity checks done, let's roll!            ###
###                                                                                    ### 
##########################################################################################

cd $WDIR 

echo 
echo "=================================================================================="
echo

echo "==> ["`date +%H:%M:%S`"] Step 0: Checking configuration file and reference availability."
echo 
cd $WDIR
$SDIR/script/check_config.sh $SDIR $WDIR $CONFIG
echo 
echo "=================================================================================="
echo

echo "==> ["`date +%H:%M:%S`"] Step 1: Running FastQC."
echo
cd $WDIR/fastqs
$SDIR/script/parallel_fastqc.sh $WDIR $CPUS
echo 
echo "=================================================================================="
echo

echo "==> ["`date +%H:%M:%S`"] Step 2: Running STAR alignment."
echo
cd $WDIR/fastqs 
$SDIR/script/parallel_star_align.sh $SDIR $WDIR $REFDIR $CONFIG $CPUS
echo 
echo "=================================================================================="
echo

echo "==> ["`date +%H:%M:%S`"] Step 3: Making TDF and strand-specific bigWig files." 
echo
cd $WDIR/bams 
$SDIR/script/parallel_calculate_coverage.sh $SDIR $WDIR $REFDIR $CONFIG $CPUS
echo 
echo "=================================================================================="
echo

echo "==> ["`date +%H:%M:%S`"] Step 4: Running featureCounts on all possible strand settings."
echo 
cd $WDIR/bams 
$SDIR/script/parallel_strand_quant.sh $SDIR $WDIR $REFDIR $CONFIG $CPUS
echo 
echo "=================================================================================="
echo

echo "==> ["`date +%H:%M:%S`"] Step 5: Calculating strandedness and other statistics."
echo 
cd $WDIR/fastqs 
$SDIR/script/parallel_calculate_stats.sh $SDIR $WDIR $CONFIG $CPUS
echo 
echo "=================================================================================="
echo

cd $WDIR/stats
cat *.strand | awk 'BEGIN {min=100;max=0} {sum+=$16; if($16>max) \
{max=$16}; if($16<min) {min=$16};} END {print "Average percent of \
reads matching the coding strand: "sum/NR", lowest: "min", highest: "max}'

STRAND=`cat *.strand | awk '{sum+=$16} END {x=sum/NR; if (x<10) \
{print "RF"} else if (x>90) {print "FR"} else if (x>45 && x<55) \
{print "NONE"} else {print "WARN"}}'` 

if [[ $STRAND == "WARN" ]]
then
  echo "WARNING: something is very much off with the strand-specificity of your RNA-seq!"
  echo "         This means that average strand-specificity of the dataset does not fall into"
  echo "         one of the following intervals: [0%,10%), (45%,55%), or (90%,100%]."
  echo "         This is very irregular; please investigate why is this happening using contents"
  echo "         of <wdir>/strand and <wdir>/stats directories." 
  ## in case of weird things happening, treat as non-strand-specific
  STRAND="NONE"
else 
  echo "The strandedness of your experiment was determined to be $STRAND"
fi
cd $WDIR

echo 
echo "=================================================================================="
echo

echo "==> ["`date +%H:%M:%S`"] Step 6: Selecting the appropriate quantification approach."
echo 
cd $WDIR/bams 
$SDIR/script/parallel_fcount_quant.sh $SDIR $WDIR $REFDIR $CONFIG $CPUS $STRAND
echo 
echo "=================================================================================="
echo

echo "==> ["`date +%H:%M:%S`"] Step 7: Making final expression tables."
echo 
cd $WDIR/featureCounts 
$SDIR/script/make_tables.sh $SDIR $WDIR $CONFIG $KEYSTR
echo 
echo "=================================================================================="
echo

## TODO: make GFF file using all the available utils 
## TODO: multiQC with appropriately formatted json!

echo "==> ["`date +%H:%M:%S`"] ALL YOUR BASES ARE BELONG TO US!!!"