From 43a86188759b6435465c856ad9b3e64433043de3 Mon Sep 17 00:00:00 2001 From: Francesco Tabaro Date: Tue, 18 Jul 2023 15:51:51 +0200 Subject: [PATCH] lint --- env/pandas.yml | 4 + workflow/Snakefile | 137 +---------- workflow/include/common.smk | 288 +++++++++++++++++++++++ workflow/include/coverage_tRNA.smk | 28 +-- workflow/include/deseq2.smk | 72 +++--- workflow/include/download-references.smk | 10 +- workflow/include/fastqc.smk | 48 +--- workflow/include/make_bw.smk | 6 +- workflow/include/picard_markdup.smk | 22 +- workflow/include/salmonTE.smk | 92 ++------ workflow/include/star.smk | 25 +- workflow/include/starTE.smk | 23 +- workflow/include/trim_single.smk | 57 +---- workflow/scripts/edit_condition_file.py | 40 ++++ 14 files changed, 432 insertions(+), 420 deletions(-) create mode 100644 env/pandas.yml create mode 100644 workflow/include/common.smk create mode 100644 workflow/scripts/edit_condition_file.py diff --git a/env/pandas.yml b/env/pandas.yml new file mode 100644 index 0000000..973e071 --- /dev/null +++ b/env/pandas.yml @@ -0,0 +1,4 @@ +channels: + - anaconda +dependencies: + - pandas diff --git a/workflow/Snakefile b/workflow/Snakefile index 61d178a..1608129 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -29,129 +29,11 @@ container: "docker://condaforge/mambaforge:latest" configfile: "config/config.yaml" +############################# +## INCLUDE COMMON FUNCTIONS +############################# -############ -## HELPERS -############ - - -def giga_to_byte(g): - g = g - 2 # leave some memory to other processes - return g * (1024**3) - - -def get_filename(link, decompress=False, stem=False): - # sometimes we use this function to get filenames of paths instead of links. - # convert back Path to str. - if isinstance(link, Path): - link = str(link) - # parse - p = urlparse(link) - - # cast to Path and get name attribute - if stem: - basename = Path(p.path).stem - else: - basename = Path(p.path).name - - # remove .gz - if decompress and str(basename).endswith(".gz"): - basename = basename.replace(".gz", "") - - return basename - - -def get_samples(wildcards, samples): - if wildcards.serie in samples["single"]: - s = samples["single"][wildcards.serie] - else: - s = samples["paired"][wildcards.serie] - return s - -def get_bw(wildcards): - """Builds bigwig paths for rule all""" - o = [] - for lib in library_names_single + library_names_paired: - if lib in samples["single"].keys(): - s = samples["single"][lib] - else: - s = samples["paired"][lib] - o += expand(star_folder.joinpath("{serie}", "{sample}.bw"), serie=lib, sample=s) - return o - - -def get_star_input(wildcards): - """Builds input paths for STAR alignment testing if a library is single-end or paired-end""" - if wildcards.serie in library_names_single: - for ext in supported_extensions: - infile = trim_reads_folder.joinpath( - wildcards.serie, f"{wildcards.sample}.{ext}" - ) - if os.path.exists(infile): - break - else: - for ext in supported_extensions: - infile = [ - trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}_1.{ext}"), - trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}_2.{ext}"), - ] - if all([os.path.exists(f) for f in infile]): - break - return infile - - -def get_params(wildcards, key): - """Returns the value of a specific key for the current serie""" - params = "" - for lib in config["sequencing_libraries"]: - if lib["name"] == wildcards.serie: - params = lib[key] - return params - - -def get_sample_sheet(wildcards): - """Returns path to sample sheet for current serie""" - return get_params(wildcards, "sample_sheet") - - -def get_fastq(wildcards): - if wildcards.serie in library_names_single: - for ext in supported_extensions: - candidate = raw_reads_folder.joinpath( - wildcards.serie, f"{wildcards.sample}.{ext}" - ) - if os.path.exists(candidate): - return candidate - raise ValueError( - f"Could not find FastQ file. Check your naming. Supported extensions: {supported_extensions}" - ) - return "" - - -def get_fastq_paired(wildcards): - if wildcards.serie in library_names_paired: - for ext in supported_extensions: - for suffix in supported_suffixes: - candidate1 = raw_reads_folder.joinpath( - wildcards.serie, f"{wildcards.sample}{suffix[0]}.{ext}" - ) - candidate2 = raw_reads_folder.joinpath( - wildcards.serie, f"{wildcards.sample}{suffix[1]}.{ext}" - ) - if os.path.exists(candidate1) and os.path.exists(candidate2): - return {"m1": candidate1, "m2": candidate2} - raise ValueError( - f"Could not find FastQ files. Check your naming.\nPaired-end suffixed: {supported_suffixes}.\nSupported extensions: {supported_extensions}" - ) - return "" - - -def mkdir(p: Path, verbose=False): - if not p.exists(): - p.mkdir(parents=True, exist_ok=True) - if verbose: - print("Created {}".format(p)) - +include: "include/common.smk" ####################### ## DEFINE VARIABLES @@ -208,9 +90,6 @@ gtf_path = references_folder.joinpath( rmsk_path = Path(config["genome"]["rmsk_path"]) config["genome"]["rmsk_link"] = None -# rmsk_path = references_folder.joinpath( -# get_filename(config["genome"]["rmsk_link"], decompress=False) -# ) rmsk_bed = Path(str(rmsk_path).replace("gtf", "bed")) gaf_path = references_folder.joinpath( @@ -284,9 +163,10 @@ onstart: print_json(json.dumps(samples)) -############ -## RULES -############ + +############# +## WORKFLOW +############# wildcard_constraints: @@ -295,6 +175,7 @@ wildcard_constraints: method="multihit|random", + include: "include/download-references.smk" include: "include/fastqc.smk" include: "include/trim_single.smk" diff --git a/workflow/include/common.smk b/workflow/include/common.smk new file mode 100644 index 0000000..843136e --- /dev/null +++ b/workflow/include/common.smk @@ -0,0 +1,288 @@ +def giga_to_byte(g): + g = g - 2 # leave some memory to other processes + return g * (1024**3) + + +def get_filename(link, decompress=False, stem=False): + # sometimes we use this function to get filenames of paths instead of links. + # convert back Path to str. + if isinstance(link, Path): + link = str(link) + # parse + p = urlparse(link) + + # cast to Path and get name attribute + if stem: + basename = Path(p.path).stem + else: + basename = Path(p.path).name + + # remove .gz + if decompress and str(basename).endswith(".gz"): + basename = basename.replace(".gz", "") + + return basename + + +def get_samples(wildcards, samples): + if wildcards.serie in samples["single"]: + s = samples["single"][wildcards.serie] + else: + s = samples["paired"][wildcards.serie] + return s + + +def get_bw(wildcards): + """Builds bigwig paths for rule all""" + o = [] + for lib in library_names_single + library_names_paired: + if lib in samples["single"].keys(): + s = samples["single"][lib] + else: + s = samples["paired"][lib] + o += expand(star_folder.joinpath("{serie}", "{sample}.bw"), serie=lib, sample=s) + return o + + +def get_star_input(wildcards): + """Builds input paths for STAR alignment testing if a library is single-end or paired-end""" + if wildcards.serie in library_names_single: + for ext in supported_extensions: + infile = trim_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}.{ext}" + ) + if os.path.exists(infile): + break + else: + for ext in supported_extensions: + infile = [ + trim_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}_1.{ext}" + ), + trim_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}_2.{ext}" + ), + ] + if all([os.path.exists(f) for f in infile]): + break + return infile + + +def get_params(wildcards, key): + """Returns the value of a specific key for the current serie""" + params = "" + for lib in config["sequencing_libraries"]: + if lib["name"] == wildcards.serie: + params = lib[key] + return params + + +def get_sample_sheet(wildcards): + """Returns path to sample sheet for current serie""" + return get_params(wildcards, "sample_sheet") + + +def get_fastq(wildcards): + if wildcards.serie in library_names_single: + for ext in supported_extensions: + candidate = raw_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}.{ext}" + ) + if os.path.exists(candidate): + return candidate + raise ValueError( + f"Could not find FastQ file. Check your naming. Supported extensions: {supported_extensions}" + ) + return "" + + +def get_fastq_paired(wildcards): + if wildcards.serie in library_names_paired: + for ext in supported_extensions: + for suffix in supported_suffixes: + candidate1 = raw_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}{suffix[0]}.{ext}" + ) + candidate2 = raw_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}{suffix[1]}.{ext}" + ) + if os.path.exists(candidate1) and os.path.exists(candidate2): + return {"m1": candidate1, "m2": candidate2} + raise ValueError( + f"Could not find FastQ files. Check your naming.\nPaired-end suffixed: {supported_suffixes}.\nSupported extensions: {supported_extensions}" + ) + return "" + + +def mkdir(p: Path, verbose=False): + if not p.exists(): + p.mkdir(parents=True, exist_ok=True) + if verbose: + print("Created {}".format(p)) + + +def get_tRNA_annotation_file(wildcards): + checkpoint_output = checkpoints.download_gtRNAdb.get(**wildcards).output[0] + bed_filename = glob_wildcards(os.path.join(checkpoint_output, "{x}.bed")).x[0] + return tRNA_annotation_dir.joinpath(f"{bed_filename}.bed") + + +def get_trna_coverage(wildcards): + return expand( + trna_coverage_folder.joinpath("{{serie}}", "{sample}.bed"), + sample=get_samples(wildcards, samples), + ) + + +def get_deseq2_test(wildcards): + deseq2_params = get_params(wildcards, "deseq2") + return deseq2_params["test"] + + +def get_deseq2_variable(wildcards): + deseq2_params = get_params(wildcards, "deseq2") + return deseq2_params["variable"] + + +def get_markdup_bam(wildcards): + return expand( + markdup_folder.joinpath("{{serie}}/{sample}.markdup.bam"), + sample=get_samples(wildcards, samples), + ) + + +def get_markdup_fastqc(wildcards): + return expand( + fastqc_markdup_folder.joinpath("{{serie}}", "{sample}.markdup_fastqc.html"), + sample=get_samples(wildcards, samples), + ) + + +def get_salmonTE_quant_input(wildcards): + if wildcards.serie in library_names_single: + s = samples["single"][wildcards.serie] + else: + s = [ + f"{s}{mate}" + for s in samples["paired"][wildcards.serie] + for mate in ["_1", "_2"] + ] + for extension in supported_extensions: + candidates = expand( + raw_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}.{extension}"), + sample=s, + ) + if all([os.path.exists(f) for f in candidates]): + break + + return candidates + + +def set_salmonTE_genome(): + genome_label = config["genome"]["label"][:2] + salmon_label = "" + if genome_label == "mm": + salmon_label = "mm" + elif genome_label == "hg": + salmon_label = "hs" + elif genome_label == "dr": + salmon_label = "dr" + elif genome_label == "dm": + salmon_label = "dm" + else: + raise ValueError(f'Unsupported genome label: {config["genome"]["label"]}') + return salmon_label + + +def get_star_stats(wildcards): + return expand( + star_folder.joinpath("{{serie}}/{sample}.Log.final.out"), + sample=get_samples(wildcards, samples), + ) + + +def get_star_fastqc(wildcards): + return expand( + fastqc_star_folder.joinpath( + "{{serie}}", "{sample}.Aligned.sortedByCoord.out_fastqc.html" + ), + sample=get_samples(wildcards, samples), + ) + + +def get_trimmed_fastq(wildcards): + if wildcards.serie in library_names_single: + return trim_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}.fastq.gz" + ) + elif wildcards.serie in library_names_paired: + return [ + trim_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}_1.fastq.gz" + ), + trim_reads_folder.joinpath( + wildcards.serie, f"{wildcards.sample}_2.fastq.gz" + ), + ] + else: + raise ValueError(f"Could not determine protocol type for {wildcards.serie}") + + +def get_trimmomatic_stats(wildcards): + ret = expand( + trim_reads_folder.joinpath(wildcards.serie, "{sample}.stats.txt"), + sample=get_samples(wildcards, samples), + ) + return ret + + +def get_trimmed_fastqc(wildcards): + s = wildcards.serie + if s in library_names_paired: + ret = [ *expand( + fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_1_fastqc.html"), + sample=get_samples(wildcards, samples), + ), *expand( + fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_2_fastqc.html"), + sample=get_samples(wildcards, samples), + )] + else: + ret = expand( + fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_fastqc.html"), + sample=get_samples(wildcards, samples), + ) + return ret + + +def get_fastqc(wildcards): + s = get_samples(wildcards, samples) + if wildcards.serie in library_names_single: + ret = expand( + fastqc_raw_folder.joinpath(wildcards.serie, "{sample}_fastqc.html"), + sample=s, + ) + else: + for m in supported_suffixes: + for ext in supported_extensions: + m1 = expand( + os.path.join( + raw_reads_folder, wildcards.serie, f"{wildcards.sample}{m[0]}.{ext}" + ), + sample=s, + ) + m2 = expand( + os.path.join( + raw_reads_folder, wildcards.serie, f"{wildcards.sample}{m[1]}.{ext}" + ), + sample=s, + ) + + if all([os.path.exists(p) for p in m1 + m2]): + ret = [ + *expand(fastqc_raw_folder.joinpath(wildcards.serie, f"{wildcards.sample}{m[0]}_fastqc.html" + ), + sample=s), + *expand(fastqc_raw_folder.joinpath(wildcards.serie, f"{wildcards.sample}{m[1]}_fastqc.html"), + sample=s)] + + return ret diff --git a/workflow/include/coverage_tRNA.smk b/workflow/include/coverage_tRNA.smk index 97479b8..f736339 100644 --- a/workflow/include/coverage_tRNA.smk +++ b/workflow/include/coverage_tRNA.smk @@ -15,21 +15,16 @@ rule mk_genome_tsv: log_folder.joinpath("mk_genome_tsv/{serie}/{sample}.log"), shell: """ - samtools view -H {input[0]} | grep SQ | cut -f 2,3 | sed -r 's/(SN|LN)://g' | tr " " "\t" > {output} + samtools view -H {input} | grep SQ | cut -f 2,3 | sed -r 's/(SN|LN)://g' | tr " " "\t" > {output} """ -def get_tRNA_annotation_file(wildcards): - checkpoint_output = checkpoints.download_gtRNAdb.get(**wildcards).output[0] - bed_filename = glob_wildcards(os.path.join(checkpoint_output, "{x}.bed")).x[0] - return tRNA_annotation_dir.joinpath(f"{bed_filename}.bed") - rule coverage_trna: input: - star_folder.joinpath("{serie}", "{sample}.Aligned.sortedByCoord.out.bam"), - star_folder.joinpath("{serie}", "{sample}.Aligned.sortedByCoord.out.bam.bai"), - star_folder.joinpath("{serie}", "{sample}.genome"), - get_tRNA_annotation_file, + bam=star_folder.joinpath("{serie}", "{sample}.Aligned.sortedByCoord.out.bam"), + bai=star_folder.joinpath("{serie}", "{sample}.Aligned.sortedByCoord.out.bam.bai"), + genome=star_folder.joinpath("{serie}", "{sample}.genome"), + annotation=get_tRNA_annotation_file, output: trna_coverage_folder.joinpath("{serie}", "{sample}.bed"), conda: @@ -40,21 +35,14 @@ rule coverage_trna: """ set -x SORTED=$(mktemp) - JOINED=$(join -o 1.1 <(awk '{{print $1}}' {input[3]} | sort -u) <(awk '{{print $1}}' {input[2]} | sort -u)) - awk -v j="$JOINED" '$1~j' {input[3]} | grep -v 'chrUn' | bedtools sort -faidx {input[2]} -i - > $SORTED + JOINED=$(join -o 1.1 <(awk '{{print $1}}' {input.genome} | sort -u) <(awk '{{print $1}}' {input.bai} | sort -u)) + awk -v j="$JOINED" '$1~j' {input.genome} | grep -v 'chrUn' | bedtools sort -faidx {input.bai} -i - > $SORTED - bedtools coverage -g {input[2]} -sorted -a $SORTED -b {input[0]} | sort -k1,1 -k2,2n > {output} + bedtools coverage -g {input.genome} -sorted -a $SORTED -b {input.bam} | sort -k1,1 -k2,2n > {output} rm $SORTED """ -def get_trna_coverage(wildcards): - return expand( - trna_coverage_folder.joinpath("{{serie}}", "{sample}.bed"), - sample=get_samples(wildcards, samples), - ) - - rule build_trna_coverage_matrix: input: get_trna_coverage, diff --git a/workflow/include/deseq2.smk b/workflow/include/deseq2.smk index fc79da7..3261f15 100644 --- a/workflow/include/deseq2.smk +++ b/workflow/include/deseq2.smk @@ -11,16 +11,6 @@ rule subset_gtf: "../scripts/subset_gtf_v1.R" -def get_deseq2_test(wildcards): - deseq2_params = get_params(wildcards, "deseq2") - return deseq2_params["test"] - - -def get_deseq2_variable(wildcards): - deseq2_params = get_params(wildcards, "deseq2") - return deseq2_params["variable"] - - rule deseq2: input: star_flag=star_folder.joinpath("{serie}.done"), @@ -51,34 +41,34 @@ rule deseq2: "../scripts/deseq2_v1.R" -rule deseq2_report: - input: - analysis_folder.joinpath("deseq2-{serie}.done"), - gaf_path, - ann_path=str(rdata_folder.joinpath("deseq2/ann.rds")), - output: - touch(analysis_folder.joinpath("deseq2-report-{serie}.done")), - # notebooks_folder.joinpath("{serie}/%s.html"%get_filename(deseq2_notebook_input_path, stem=True)) - params: - dds_path=str(rdata_folder.joinpath("deseq2/{serie}/dds.rds")), - # ann_path = str(rdata_folder.joinpath("deseq2/ann.rds")), - notebook_path=deseq2_notebook_input_path, - # root_path = deseq2_working_directory, - output_dir=notebooks_folder, - annotation_type=config["genome"]["annotation_type"], - conda: - "../../env/R.yml" - log: - log_folder.joinpath("R/{serie}/deseq2_report.log"), - shell: - """ - set -x - if [ -f {params.dds_path} ]; then - OUTPUT_DIR="{params.output_dir}/{wildcards.serie}" - R --no-echo --vanilla \ - -e "rmarkdown::render('{params.notebook_path}', output_dir = '$OUTPUT_DIR', output_format = 'html_document', params = list(output_dir = '$OUTPUT_DIR', ann_path = '{input.ann_path}', dds_path = '{params.dds_path}', annotation_type = '{params.annotation_type}', gaf_path = '{input[1]}'))" |& \ - tee {log} - else - exit 0 - fi - """ +# rule deseq2_report: +# input: +# analysis_folder.joinpath("deseq2-{serie}.done"), +# gaf_path, +# ann_path=str(rdata_folder.joinpath("deseq2/ann.rds")), +# output: +# touch(analysis_folder.joinpath("deseq2-report-{serie}.done")), +# # notebooks_folder.joinpath("{serie}/%s.html"%get_filename(deseq2_notebook_input_path, stem=True)) +# params: +# dds_path=str(rdata_folder.joinpath("deseq2/{serie}/dds.rds")), +# # ann_path = str(rdata_folder.joinpath("deseq2/ann.rds")), +# notebook_path=deseq2_notebook_input_path, +# # root_path = deseq2_working_directory, +# output_dir=notebooks_folder, +# annotation_type=config["genome"]["annotation_type"], +# conda: +# "../../env/R.yml" +# log: +# log_folder.joinpath("R/{serie}/deseq2_report.log"), +# shell: +# """ +# set -x +# if [ -f {params.dds_path} ]; then +# OUTPUT_DIR="{params.output_dir}/{wildcards.serie}" +# R --no-echo --vanilla \ +# -e "rmarkdown::render('{params.notebook_path}', output_dir = '$OUTPUT_DIR', output_format = 'html_document', params = list(output_dir = '$OUTPUT_DIR', ann_path = '{input.ann_path}', dds_path = '{params.dds_path}', annotation_type = '{params.annotation_type}', gaf_path = '{input[1]}'))" |& \ +# tee {log} +# else +# exit 0 +# fi +# """ diff --git a/workflow/include/download-references.smk b/workflow/include/download-references.smk index af8901a..7c452e6 100644 --- a/workflow/include/download-references.smk +++ b/workflow/include/download-references.smk @@ -34,7 +34,6 @@ rule download_genome_annotation_file: "../scripts/download-gtf.sh" - rule download_repeatmasker_annotation_file: output: rmsk_path, @@ -60,7 +59,9 @@ checkpoint download_gtRNAdb: output: directory(tRNA_annotation_dir), params: - url=config["genome"]["gtrnadb_url"] + url=config["genome"]["gtrnadb_url"], + log: + log_folder.joinpath("download/gtrnadb.log"), conda: "../../env/wget.yml" shell: @@ -68,10 +69,11 @@ checkpoint download_gtRNAdb: mkdir -p {output} cd {output} F=$(basename {params.url}) - wget -q {params.url} - tar xf $F + wget -q {params.url} |& tee {log} + tar xvf $F |& tee -a {log} """ + rule download_gaf_file: output: gaf_path, diff --git a/workflow/include/fastqc.smk b/workflow/include/fastqc.smk index b19149b..354d7d9 100644 --- a/workflow/include/fastqc.smk +++ b/workflow/include/fastqc.smk @@ -2,7 +2,7 @@ rule fastqc_raw: input: - get_fastq + get_fastq, output: fastqc_raw_folder.joinpath("{serie}", "{sample}_fastqc.zip"), fastqc_raw_folder.joinpath("{serie}", "{sample}_fastqc.html"), @@ -19,9 +19,10 @@ rule fastqc_raw: fastqc -t {threads} -noextract -o {params.fastqc_folder} {input} """ + use rule fastqc_raw as fastqc_raw_pe with: input: - unpack(get_fastq_paired) + unpack(get_fastq_paired), output: fastqc_raw_folder.joinpath("{serie}", "{sample}_1_fastqc.zip"), fastqc_raw_folder.joinpath("{serie}", "{sample}_1_fastqc.html"), @@ -33,49 +34,6 @@ use rule fastqc_raw as fastqc_raw_pe with: log_folder.joinpath("fastqc/{serie}/{sample}.log"), -# rule fastqc_raw_pe: -# input: -# unpack(get_fastq_paired) -# output: -# fastqc_raw_folder.joinpath("{serie}", "{sample}_1_sequence.fq.gz_fastqc.zip"), -# fastqc_raw_folder.joinpath("{serie}", "{sample}_1_sequence.fq.gz_fastqc.html"), -# fastqc_raw_folder.joinpath("{serie}", "{sample}_2_sequence.fq.gz_fastqc.zip"), -# fastqc_raw_folder.joinpath("{serie}", "{sample}_2_sequence.fq.gz_fastqc.html"), -# params: -# fastqc_folder=fastqc_raw_folder, -# threads: 2 -# conda: -# "../../env/qc.yml" -# log: -# log_folder.joinpath("fastqc/{serie}/{sample}.log"), -# shell: -# """ -# set -x -# fastqc -t {threads} -noextract -o {params.fastqc_folder}/{wildcards.serie} {input.m1} {input.m2} -# """ - - -def get_fastqc(wildcards): - s = get_samples(wildcards, samples) - if wildcards.serie in library_names_single: - ret = expand( - fastqc_raw_folder.joinpath(wildcards.serie, "{sample}_fastqc.html"), - sample=s, - ) - else: - for m in supported_suffixes: - for ext in supported_extensions: - m1 = expand(os.path.join(raw_reads_folder, wildcards.serie, "{sample}" + f"{m[0]}.{ext}"), sample = s) - m2 = expand(os.path.join(raw_reads_folder, wildcards.serie, "{sample}" + f"{m[1]}.{ext}"), sample = s) - - if all([os.path.exists(p) for p in m1 + m2]): - ret = expand( - fastqc_raw_folder.joinpath(wildcards.serie, "{sample}" + f"{m[0]}_fastqc.html"), sample=s - ) + expand( - fastqc_raw_folder.joinpath(wildcards.serie, "{sample}" + f"{m[1]}_fastqc.html"), sample=s - ) - - return ret rule multiqc_raw: diff --git a/workflow/include/make_bw.smk b/workflow/include/make_bw.smk index a1ed0da..3b9c842 100644 --- a/workflow/include/make_bw.smk +++ b/workflow/include/make_bw.smk @@ -1,7 +1,7 @@ rule coverage: input: - star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam"), - star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam.bai"), + bam=star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam"), + bai=star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam.bai"), output: star_folder.joinpath("{serie}", "{sample}.bw"), conda: @@ -14,7 +14,7 @@ rule coverage: shell: """ - bamCoverage -b {input[0]} \ + bamCoverage -b {input.bam} \ -o {output} -of bigwig \ -p {threads} \ --effectiveGenomeSize 2652783500 \ diff --git a/workflow/include/picard_markdup.smk b/workflow/include/picard_markdup.smk index 4523c0f..fea12de 100644 --- a/workflow/include/picard_markdup.smk +++ b/workflow/include/picard_markdup.smk @@ -4,8 +4,8 @@ rule picard_markdup: input: star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam"), output: - markdup_folder.joinpath("{serie}/{sample}.markdup.bam"), - markdup_folder.joinpath("{serie}/{sample}.markdup.stats.txt"), + bam=markdup_folder.joinpath("{serie}/{sample}.markdup.bam"), + stats=markdup_folder.joinpath("{serie}/{sample}.markdup.stats.txt"), log: log_folder.joinpath("picard/{serie}/{sample}.log"), threads: 2 @@ -17,8 +17,8 @@ rule picard_markdup: picard MarkDuplicates \ I={input} \ - O={output[0]} \ - M={output[1]} |& \ + O={output.bam} \ + M={output.stats} |& \ tee {log} """ @@ -49,20 +49,6 @@ rule fastqc_markdup: """ -def get_markdup_bam(wildcards): - return expand( - markdup_folder.joinpath("{{serie}}/{sample}.markdup.bam"), - sample=get_samples(wildcards, samples), - ) - - -def get_markdup_fastqc(wildcards): - return expand( - fastqc_markdup_folder.joinpath("{{serie}}", "{sample}.markdup_fastqc.html"), - sample=get_samples(wildcards, samples), - ) - - rule multiqc_markdup: input: get_markdup_bam, diff --git a/workflow/include/salmonTE.smk b/workflow/include/salmonTE.smk index f32ad34..b5c24c7 100644 --- a/workflow/include/salmonTE.smk +++ b/workflow/include/salmonTE.smk @@ -9,86 +9,24 @@ rule edit_condition_file: output: touch(data_folder.joinpath("salmonTE/quant/{serie}/edit_condition.done")), params: - variable=lambda wildcards: get_deseq2_variable(wildcards) + variable=lambda wildcards: get_deseq2_variable(wildcards), log: log_folder.joinpath("salmonTE/{serie}/edit_condition.log"), - run: - import pandas as pd - from pathlib import Path + conda: + "../../env/pandas.yml" + script: + "../scripts/edit_condition_file.py" - serie_folder = Path(input[0]) - sample_sheet_path = Path(input[1]) - - salmon_condition_sheet = pd.read_csv(serie_folder.joinpath("condition.csv")) - salmon_condition_sheet = salmon_condition_sheet.set_index("SampleID") - - sample_sheet = pd.read_csv(sample_sheet_path) - - if "filename" in sample_sheet.columns: - sample_sheet = sample_sheet.set_index("filename") - else: - if all(sample_sheet.filename_1.apply(lambda x: x.endswith("_sequence"))): - sample_sheet["tmp"] = sample_sheet.filename_1.apply( - lambda x: x.split("_sequence")[0] - ) - sample_sheet = sample_sheet.set_index("tmp") - else: - sample_sheet = sample_sheet.set_index("name") - - sample_sheet = sample_sheet.reindex(index=salmon_condition_sheet.index) - - joined = sample_sheet.join(salmon_condition_sheet) - - levels = joined[params.variable].tolist() - levels = list(set(levels)) - control_level = levels[0] - - joined["condition"] = joined.apply( - lambda row: "control" if row[params.variable] == control_level else "treatment", axis=1 - ) - joined = joined.reset_index() - - out = joined[["SampleID", "condition"]] - out.to_csv(serie_folder.joinpath("condition.csv"), index=False) - - -def get_salmonTE_quant_input(wildcards): - if wildcards.serie in library_names_single: - s = samples["single"][wildcards.serie] - else: - s = [ f"{s}{mate}" for s in samples["paired"][wildcards.serie] for mate in ["_1", "_2"] ] - for extension in supported_extensions: - candidates = expand(raw_reads_folder.joinpath(wildcards.serie, "{sample}" + f".{extension}"), sample = s) - if all([os.path.exists(f) for f in candidates]): - break - - return candidates - -def set_salmonTE_genome(): - genome_label = config["genome"]["label"][:2] - salmon_label = "" - if genome_label == "mm": - salmon_label = "mm" - elif genome_label == "hg": - salmon_label = "hs" - elif genome_label == "dr": - salmon_label = "dr" - elif genome_label == "dm": - salmon_label = "dm" - else: - raise ValueError(f'Unsupported genome label: {config["genome"]["label"]}') - return salmon_label checkpoint salmonTE_quant: input: - get_salmonTE_quant_input + get_salmonTE_quant_input, output: - directory(salmonTE_folder.joinpath("quant/{serie}")), - salmonTE_folder.joinpath("quant/{serie}/EXPR.csv"), - salmonTE_folder.joinpath("quant/{serie}/MAPPING_INFO.csv"), - salmonTE_folder.joinpath("quant/{serie}/clades.csv"), - salmonTE_folder.joinpath("quant/{serie}/condition.csv"), - + outfolder=directory(salmonTE_folder.joinpath("quant/{serie}")), + expr=salmonTE_folder.joinpath("quant/{serie}/EXPR.csv"), + mapping_info=salmonTE_folder.joinpath("quant/{serie}/MAPPING_INFO.csv"), + clades=salmonTE_folder.joinpath("quant/{serie}/clades.csv"), + condition=salmonTE_folder.joinpath("quant/{serie}/condition.csv"), params: # 13/10/2020 available references: hs mm dr dm # https://github.com/LiuzLab/SalmonTE#running-the-quant-mode-to-collect-te-expressions @@ -115,7 +53,7 @@ checkpoint salmonTE_quant: python /opt/SalmonTE/SalmonTE.py quant \ --reference={params.reference_genome} \ - --outpath={output[0]} \ + --outpath={output.outfolder} \ --num_threads={threads} $I |& \ tee {log} """ @@ -123,8 +61,8 @@ checkpoint salmonTE_quant: rule salmonTE_test: input: - salmonTE_folder.joinpath("quant/{serie}"), - salmonTE_folder.joinpath("quant/{serie}/edit_condition.done"), + infolder=salmonTE_folder.joinpath("quant/{serie}"), + condition_file=salmonTE_folder.joinpath("quant/{serie}/edit_condition.done"), output: directory(salmonTE_folder.joinpath("de_analysis/{serie}")), log: @@ -134,7 +72,7 @@ rule salmonTE_test: shell: """ python /opt/SalmonTE/SalmonTE.py test \ - --inpath={input[0]} \ + --inpath={input.infolder} \ --outpath={output} \ --tabletype=csv \ --figtype=png \ diff --git a/workflow/include/star.smk b/workflow/include/star.smk index f2f145a..f818ad2 100644 --- a/workflow/include/star.smk +++ b/workflow/include/star.smk @@ -32,7 +32,7 @@ rule star_genome_preparation: rule star: input: - get_star_input, + bam=get_star_input, star_index_folder=references_folder.joinpath("STAR"), genome_annotation_file=gtf_path, output: @@ -60,12 +60,6 @@ rule star: set -x TMP_FOLDER=$(mktemp -u -p {params.tmp_folder}) - if [ {params.libtype} == "SINGLE" ]; then - INPUTARG="{input[0]}" - else - INPUTARG="{input[0]} {input[1]}" - fi - STAR --quantMode TranscriptomeSAM GeneCounts \ --outTmpDir $TMP_FOLDER \ --outSAMtype BAM SortedByCoordinate \ @@ -76,7 +70,7 @@ rule star: --genomeDir {input.star_index_folder} \ --readFilesCommand zcat \ --outFileNamePrefix {params.alignments_folder}/{wildcards.serie}/{wildcards.sample}. \ - --readFilesIn $INPUTARG \ + --readFilesIn {input.bam} \ --limitBAMsortRAM {params.mem_mb} \ --genomeLoad NoSharedMemory \ --outSAMunmapped Within \ @@ -124,21 +118,6 @@ rule fastqc_star: """ -def get_star_stats(wildcards): - return expand( - star_folder.joinpath("{{serie}}/{sample}.Log.final.out"), - sample=get_samples(wildcards, samples), - ) - - -def get_star_fastqc(wildcards): - return expand( - fastqc_star_folder.joinpath( - "{{serie}}", "{sample}.Aligned.sortedByCoord.out_fastqc.html" - ), - sample=get_samples(wildcards, samples), - ) - rule verify_star: input: lambda wildcards: expand( diff --git a/workflow/include/starTE.smk b/workflow/include/starTE.smk index 15e06f8..9b2075b 100644 --- a/workflow/include/starTE.smk +++ b/workflow/include/starTE.smk @@ -1,6 +1,6 @@ rule starTE_random: input: - get_star_input, + bam=get_star_input, star_index_folder=references_folder.joinpath("STAR"), output: starTE_folder.joinpath("{serie}/random/{sample}.Aligned.out.bam"), @@ -23,12 +23,6 @@ rule starTE_random: echo 'tmp: $TMP_FOLDER' sleep 10 - if [ {params.libtype} == "SINGLE" ]; then - INPUTARG="{input[0]}" - else - INPUTARG="{input[0]} {input[1]}" - fi - STAR \ --outSAMtype BAM Unsorted \ --runMode alignReads \ @@ -53,7 +47,7 @@ rule starTE_random: --runThreadN {threads} \ --genomeDir {input.star_index_folder} \ --outFileNamePrefix {params.alignments_folder}/{wildcards.serie}/random/{wildcards.sample}. \ - --readFilesIn $INPUTARG \ + --readFilesIn {input.bam} \ --limitBAMsortRAM {params.mem_mb} \ --outBAMcompression -1 |& \ tee {log} @@ -61,6 +55,7 @@ rule starTE_random: [[ -d $TMP_FOLDER ]] && rm -r $TMP_FOLDER || exit 0 """ + rule featureCounts_random: input: bam=lambda wildcards: expand( @@ -84,6 +79,7 @@ rule featureCounts_random: featureCounts -M -F GTF -T {threads} -s 0 -a {input.annotation} -o {output} {input.bam} """ + rule starTE_multihit: input: get_star_input, @@ -108,13 +104,7 @@ rule starTE_multihit: TMP_FOLDER=$(mktemp -u -p {params.tmp_folder}) echo 'tmp: $TMP_FOLDER' sleep 10 - - if [ {params.libtype} == "SINGLE" ]; then - INPUTARG="{input[0]}" - else - INPUTARG="{input[0]} {input[1]}" - fi - + STAR \ --outSAMtype BAM Unsorted \ --runMode alignReads \ @@ -137,7 +127,7 @@ rule starTE_multihit: --genomeDir {input.star_index_folder} \ --readFilesCommand zcat \ --outFileNamePrefix {params.alignments_folder}/{wildcards.serie}/multihit/{wildcards.sample}. \ - --readFilesIn $INPUTARG \ + --readFilesIn {input.bam} \ --limitBAMsortRAM {params.mem_mb} \ --genomeLoad NoSharedMemory \ --outBAMcompression -1 |& \ @@ -146,6 +136,7 @@ rule starTE_multihit: [[ -d $TMP_FOLDER ]] && rm -r $TMP_FOLDER || exit 0 """ + rule featureCounts_multihit: input: bam=lambda wildcards: expand( diff --git a/workflow/include/trim_single.smk b/workflow/include/trim_single.smk index b20ba79..dc3e2e3 100644 --- a/workflow/include/trim_single.smk +++ b/workflow/include/trim_single.smk @@ -1,10 +1,13 @@ ruleorder: trimmomatic_pe > trimmomatic_se + rule trimmomatic_pe: wildcard_constraints: - serie="|".join(library_names_paired if len(library_names_paired) > 0 else ["none"]), + serie="|".join( + library_names_paired if len(library_names_paired) > 0 else ["none"] + ), input: - unpack(get_fastq_paired) + unpack(get_fastq_paired), output: paired1=trim_reads_folder.joinpath("{serie}", "{sample}_1.fastq.gz"), paired2=trim_reads_folder.joinpath("{serie}", "{sample}_2.fastq.gz"), @@ -30,11 +33,14 @@ rule trimmomatic_pe: {params} |& tee {output.stats} """ + rule trimmomatic_se: wildcard_constraints: - serie="|".join(library_names_single if len(library_names_single) > 0 else ["none"]), + serie="|".join( + library_names_single if len(library_names_single) > 0 else ["none"] + ), input: - get_fastq + get_fastq, output: fastq=trim_reads_folder.joinpath("{serie}", "{sample}.fastq.gz"), summary=trim_reads_folder.joinpath("{serie}", "{sample}.summary.txt"), @@ -57,25 +63,11 @@ rule trimmomatic_se: """ - - -def get_trimmed_fastq(wildcards): - if wildcards.serie in library_names_single: - return trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}.fastq.gz") - elif wildcards.serie in library_names_paired: - return [ - trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}_1.fastq.gz"), - trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}_2.fastq.gz") - ] - else: - raise ValueError(f"Could not determine protocol type for {wildcards.serie}") - - -rule fastqc_trim: +rule fastqc_trim: wildcard_constraints: serie="|".join(library_names_single), input: - get_trimmed_fastq + get_trimmed_fastq, output: fastqc_trim_folder.joinpath("{serie}", "{sample}_fastqc.zip"), fastqc_trim_folder.joinpath("{serie}", "{sample}_fastqc.html"), @@ -120,31 +112,6 @@ rule fastqc_trim_pe: """ -def get_trimmomatic_stats(wildcards): - ret = expand( - trim_reads_folder.joinpath(wildcards.serie, "{sample}.stats.txt"), - sample=get_samples(wildcards, samples), - ) - return ret - -def get_trimmed_fastqc(wildcards): - s = wildcards.serie - if s in library_names_paired: - ret = expand( - fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_1_fastqc.html"), - sample=get_samples(wildcards, samples), - ) + expand( - fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_2_fastqc.html"), - sample=get_samples(wildcards, samples), - ) - else: - ret = expand( - fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_fastqc.html"), - sample=get_samples(wildcards, samples), - ) - return ret - - rule multiqc_trim: input: get_trimmomatic_stats, diff --git a/workflow/scripts/edit_condition_file.py b/workflow/scripts/edit_condition_file.py new file mode 100644 index 0000000..b5c7bc2 --- /dev/null +++ b/workflow/scripts/edit_condition_file.py @@ -0,0 +1,40 @@ +import pandas as pd +from pathlib import Path + +serie_folder = Path(snakemake.input[0]) +sample_sheet_path = Path(snakemake.input[1]) + +salmon_condition_sheet = pd.read_csv(serie_folder.joinpath("condition.csv")) +salmon_condition_sheet = salmon_condition_sheet.set_index("SampleID") + +sample_sheet = pd.read_csv(sample_sheet_path) + +if "filename" in sample_sheet.columns: + sample_sheet = sample_sheet.set_index("filename") +else: + if all(sample_sheet.filename_1.apply(lambda x: x.endswith("_sequence"))): + sample_sheet["tmp"] = sample_sheet.filename_1.apply( + lambda x: x.split("_sequence")[0] + ) + sample_sheet = sample_sheet.set_index("tmp") + else: + sample_sheet = sample_sheet.set_index("name") + +sample_sheet = sample_sheet.reindex(index=salmon_condition_sheet.index) + +joined = sample_sheet.join(salmon_condition_sheet) + +levels = joined[params.variable].tolist() +levels = list(set(levels)) +control_level = levels[0] + +joined["condition"] = joined.apply( + lambda row: "control" + if row[params.variable] == control_level + else "treatment", + axis=1, +) +joined = joined.reset_index() + +out = joined[["SampleID", "condition"]] +out.to_csv(serie_folder.joinpath("condition.csv"), index=False)