From 43a86188759b6435465c856ad9b3e64433043de3 Mon Sep 17 00:00:00 2001
From: Francesco Tabaro <francesco.tabaro@embl.it>
Date: Tue, 18 Jul 2023 15:51:51 +0200
Subject: [PATCH] lint

---
 env/pandas.yml                           |   4 +
 workflow/Snakefile                       | 137 +----------
 workflow/include/common.smk              | 288 +++++++++++++++++++++++
 workflow/include/coverage_tRNA.smk       |  28 +--
 workflow/include/deseq2.smk              |  72 +++---
 workflow/include/download-references.smk |  10 +-
 workflow/include/fastqc.smk              |  48 +---
 workflow/include/make_bw.smk             |   6 +-
 workflow/include/picard_markdup.smk      |  22 +-
 workflow/include/salmonTE.smk            |  92 ++------
 workflow/include/star.smk                |  25 +-
 workflow/include/starTE.smk              |  23 +-
 workflow/include/trim_single.smk         |  57 +----
 workflow/scripts/edit_condition_file.py  |  40 ++++
 14 files changed, 432 insertions(+), 420 deletions(-)
 create mode 100644 env/pandas.yml
 create mode 100644 workflow/include/common.smk
 create mode 100644 workflow/scripts/edit_condition_file.py

diff --git a/env/pandas.yml b/env/pandas.yml
new file mode 100644
index 0000000..973e071
--- /dev/null
+++ b/env/pandas.yml
@@ -0,0 +1,4 @@
+channels:
+  - anaconda
+dependencies:
+  - pandas
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 61d178a..1608129 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -29,129 +29,11 @@ container: "docker://condaforge/mambaforge:latest"
 
 configfile: "config/config.yaml"
 
+#############################
+## INCLUDE COMMON FUNCTIONS
+#############################
 
-############
-## HELPERS
-############
-
-
-def giga_to_byte(g):
-    g = g - 2  # leave some memory to other processes
-    return g * (1024**3)
-
-
-def get_filename(link, decompress=False, stem=False):
-    # sometimes we use this function to get filenames of paths instead of links.
-    # convert back Path to str.
-    if isinstance(link, Path):
-        link = str(link)
-    # parse
-    p = urlparse(link)
-
-    # cast to Path and get name attribute
-    if stem:
-        basename = Path(p.path).stem
-    else:
-        basename = Path(p.path).name
-
-    # remove .gz
-    if decompress and str(basename).endswith(".gz"):
-        basename = basename.replace(".gz", "")
-
-    return basename
-
-
-def get_samples(wildcards, samples):
-    if wildcards.serie in samples["single"]:
-        s = samples["single"][wildcards.serie]
-    else:
-        s = samples["paired"][wildcards.serie]
-    return s
-
-def get_bw(wildcards):
-    """Builds bigwig paths for rule all"""
-    o = []
-    for lib in library_names_single + library_names_paired:
-        if lib in samples["single"].keys():
-            s = samples["single"][lib]
-        else:
-            s = samples["paired"][lib]
-        o += expand(star_folder.joinpath("{serie}", "{sample}.bw"), serie=lib, sample=s)
-    return o
-
-
-def get_star_input(wildcards):
-    """Builds input paths for STAR alignment testing if a library is single-end or paired-end"""
-    if wildcards.serie in library_names_single:
-        for ext in supported_extensions:
-            infile = trim_reads_folder.joinpath(
-                wildcards.serie, f"{wildcards.sample}.{ext}"
-            )
-            if os.path.exists(infile):
-                break
-    else:
-        for ext in supported_extensions:
-            infile = [
-                trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}_1.{ext}"),
-                trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}_2.{ext}"),
-            ]
-            if all([os.path.exists(f) for f in infile]):
-                break
-    return infile
-
-
-def get_params(wildcards, key):
-    """Returns the value of a specific key for the current serie"""
-    params = ""
-    for lib in config["sequencing_libraries"]:
-        if lib["name"] == wildcards.serie:
-            params = lib[key]
-    return params
-
-
-def get_sample_sheet(wildcards):
-    """Returns path to sample sheet for current serie"""
-    return get_params(wildcards, "sample_sheet")
-
-
-def get_fastq(wildcards):
-    if wildcards.serie in library_names_single:
-        for ext in supported_extensions:
-            candidate = raw_reads_folder.joinpath(
-                wildcards.serie, f"{wildcards.sample}.{ext}"
-            )
-            if os.path.exists(candidate):
-                return candidate
-        raise ValueError(
-            f"Could not find FastQ file. Check your naming. Supported extensions: {supported_extensions}"
-        )
-    return ""
-
-
-def get_fastq_paired(wildcards):
-    if wildcards.serie in library_names_paired:
-        for ext in supported_extensions:
-            for suffix in supported_suffixes:
-                candidate1 = raw_reads_folder.joinpath(
-                    wildcards.serie, f"{wildcards.sample}{suffix[0]}.{ext}"
-                )
-                candidate2 = raw_reads_folder.joinpath(
-                    wildcards.serie, f"{wildcards.sample}{suffix[1]}.{ext}"
-                )
-                if os.path.exists(candidate1) and os.path.exists(candidate2):
-                    return {"m1": candidate1, "m2": candidate2}
-        raise ValueError(
-            f"Could not find FastQ files. Check your naming.\nPaired-end suffixed: {supported_suffixes}.\nSupported extensions: {supported_extensions}"
-        )
-    return ""
-
-
-def mkdir(p: Path, verbose=False):
-    if not p.exists():
-        p.mkdir(parents=True, exist_ok=True)
-        if verbose:
-            print("Created {}".format(p))
-
+include: "include/common.smk"
 
 #######################
 ## DEFINE VARIABLES
@@ -208,9 +90,6 @@ gtf_path = references_folder.joinpath(
 
 rmsk_path = Path(config["genome"]["rmsk_path"])
 config["genome"]["rmsk_link"] = None
-# rmsk_path = references_folder.joinpath(
-#     get_filename(config["genome"]["rmsk_link"], decompress=False)
-# )
 rmsk_bed = Path(str(rmsk_path).replace("gtf", "bed"))
 
 gaf_path = references_folder.joinpath(
@@ -284,9 +163,10 @@ onstart:
 
     print_json(json.dumps(samples))
 
-############
-## RULES
-############
+
+#############
+## WORKFLOW
+#############
 
 
 wildcard_constraints:
@@ -295,6 +175,7 @@ wildcard_constraints:
     method="multihit|random",
 
 
+
 include: "include/download-references.smk"
 include: "include/fastqc.smk"
 include: "include/trim_single.smk"
diff --git a/workflow/include/common.smk b/workflow/include/common.smk
new file mode 100644
index 0000000..843136e
--- /dev/null
+++ b/workflow/include/common.smk
@@ -0,0 +1,288 @@
+def giga_to_byte(g):
+    g = g - 2  # leave some memory to other processes
+    return g * (1024**3)
+
+
+def get_filename(link, decompress=False, stem=False):
+    # sometimes we use this function to get filenames of paths instead of links.
+    # convert back Path to str.
+    if isinstance(link, Path):
+        link = str(link)
+    # parse
+    p = urlparse(link)
+
+    # cast to Path and get name attribute
+    if stem:
+        basename = Path(p.path).stem
+    else:
+        basename = Path(p.path).name
+
+    # remove .gz
+    if decompress and str(basename).endswith(".gz"):
+        basename = basename.replace(".gz", "")
+
+    return basename
+
+
+def get_samples(wildcards, samples):
+    if wildcards.serie in samples["single"]:
+        s = samples["single"][wildcards.serie]
+    else:
+        s = samples["paired"][wildcards.serie]
+    return s
+
+
+def get_bw(wildcards):
+    """Builds bigwig paths for rule all"""
+    o = []
+    for lib in library_names_single + library_names_paired:
+        if lib in samples["single"].keys():
+            s = samples["single"][lib]
+        else:
+            s = samples["paired"][lib]
+        o += expand(star_folder.joinpath("{serie}", "{sample}.bw"), serie=lib, sample=s)
+    return o
+
+
+def get_star_input(wildcards):
+    """Builds input paths for STAR alignment testing if a library is single-end or paired-end"""
+    if wildcards.serie in library_names_single:
+        for ext in supported_extensions:
+            infile = trim_reads_folder.joinpath(
+                wildcards.serie, f"{wildcards.sample}.{ext}"
+            )
+            if os.path.exists(infile):
+                break
+    else:
+        for ext in supported_extensions:
+            infile = [
+                trim_reads_folder.joinpath(
+                    wildcards.serie, f"{wildcards.sample}_1.{ext}"
+                ),
+                trim_reads_folder.joinpath(
+                    wildcards.serie, f"{wildcards.sample}_2.{ext}"
+                ),
+            ]
+            if all([os.path.exists(f) for f in infile]):
+                break
+    return infile
+
+
+def get_params(wildcards, key):
+    """Returns the value of a specific key for the current serie"""
+    params = ""
+    for lib in config["sequencing_libraries"]:
+        if lib["name"] == wildcards.serie:
+            params = lib[key]
+    return params
+
+
+def get_sample_sheet(wildcards):
+    """Returns path to sample sheet for current serie"""
+    return get_params(wildcards, "sample_sheet")
+
+
+def get_fastq(wildcards):
+    if wildcards.serie in library_names_single:
+        for ext in supported_extensions:
+            candidate = raw_reads_folder.joinpath(
+                wildcards.serie, f"{wildcards.sample}.{ext}"
+            )
+            if os.path.exists(candidate):
+                return candidate
+        raise ValueError(
+            f"Could not find FastQ file. Check your naming. Supported extensions: {supported_extensions}"
+        )
+    return ""
+
+
+def get_fastq_paired(wildcards):
+    if wildcards.serie in library_names_paired:
+        for ext in supported_extensions:
+            for suffix in supported_suffixes:
+                candidate1 = raw_reads_folder.joinpath(
+                    wildcards.serie, f"{wildcards.sample}{suffix[0]}.{ext}"
+                )
+                candidate2 = raw_reads_folder.joinpath(
+                    wildcards.serie, f"{wildcards.sample}{suffix[1]}.{ext}"
+                )
+                if os.path.exists(candidate1) and os.path.exists(candidate2):
+                    return {"m1": candidate1, "m2": candidate2}
+        raise ValueError(
+            f"Could not find FastQ files. Check your naming.\nPaired-end suffixed: {supported_suffixes}.\nSupported extensions: {supported_extensions}"
+        )
+    return ""
+
+
+def mkdir(p: Path, verbose=False):
+    if not p.exists():
+        p.mkdir(parents=True, exist_ok=True)
+        if verbose:
+            print("Created {}".format(p))
+
+
+def get_tRNA_annotation_file(wildcards):
+    checkpoint_output = checkpoints.download_gtRNAdb.get(**wildcards).output[0]
+    bed_filename = glob_wildcards(os.path.join(checkpoint_output, "{x}.bed")).x[0]
+    return tRNA_annotation_dir.joinpath(f"{bed_filename}.bed")
+
+
+def get_trna_coverage(wildcards):
+    return expand(
+        trna_coverage_folder.joinpath("{{serie}}", "{sample}.bed"),
+        sample=get_samples(wildcards, samples),
+    )
+
+
+def get_deseq2_test(wildcards):
+    deseq2_params = get_params(wildcards, "deseq2")
+    return deseq2_params["test"]
+
+
+def get_deseq2_variable(wildcards):
+    deseq2_params = get_params(wildcards, "deseq2")
+    return deseq2_params["variable"]
+
+
+def get_markdup_bam(wildcards):
+    return expand(
+        markdup_folder.joinpath("{{serie}}/{sample}.markdup.bam"),
+        sample=get_samples(wildcards, samples),
+    )
+
+
+def get_markdup_fastqc(wildcards):
+    return expand(
+        fastqc_markdup_folder.joinpath("{{serie}}", "{sample}.markdup_fastqc.html"),
+        sample=get_samples(wildcards, samples),
+    )
+
+
+def get_salmonTE_quant_input(wildcards):
+    if wildcards.serie in library_names_single:
+        s = samples["single"][wildcards.serie]
+    else:
+        s = [
+            f"{s}{mate}"
+            for s in samples["paired"][wildcards.serie]
+            for mate in ["_1", "_2"]
+        ]
+    for extension in supported_extensions:
+        candidates = expand(
+            raw_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}.{extension}"),
+            sample=s,
+        )
+        if all([os.path.exists(f) for f in candidates]):
+            break
+
+    return candidates
+
+
+def set_salmonTE_genome():
+    genome_label = config["genome"]["label"][:2]
+    salmon_label = ""
+    if genome_label == "mm":
+        salmon_label = "mm"
+    elif genome_label == "hg":
+        salmon_label = "hs"
+    elif genome_label == "dr":
+        salmon_label = "dr"
+    elif genome_label == "dm":
+        salmon_label = "dm"
+    else:
+        raise ValueError(f'Unsupported genome label: {config["genome"]["label"]}')
+    return salmon_label
+
+
+def get_star_stats(wildcards):
+    return expand(
+        star_folder.joinpath("{{serie}}/{sample}.Log.final.out"),
+        sample=get_samples(wildcards, samples),
+    )
+
+
+def get_star_fastqc(wildcards):
+    return expand(
+        fastqc_star_folder.joinpath(
+            "{{serie}}", "{sample}.Aligned.sortedByCoord.out_fastqc.html"
+        ),
+        sample=get_samples(wildcards, samples),
+    )
+
+
+def get_trimmed_fastq(wildcards):
+    if wildcards.serie in library_names_single:
+        return trim_reads_folder.joinpath(
+            wildcards.serie, f"{wildcards.sample}.fastq.gz"
+        )
+    elif wildcards.serie in library_names_paired:
+        return [
+            trim_reads_folder.joinpath(
+                wildcards.serie, f"{wildcards.sample}_1.fastq.gz"
+            ),
+            trim_reads_folder.joinpath(
+                wildcards.serie, f"{wildcards.sample}_2.fastq.gz"
+            ),
+        ]
+    else:
+        raise ValueError(f"Could not determine protocol type for {wildcards.serie}")
+
+
+def get_trimmomatic_stats(wildcards):
+    ret = expand(
+        trim_reads_folder.joinpath(wildcards.serie, "{sample}.stats.txt"),
+        sample=get_samples(wildcards, samples),
+    )
+    return ret
+
+
+def get_trimmed_fastqc(wildcards):
+    s = wildcards.serie
+    if s in library_names_paired:
+        ret = [ *expand(
+            fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_1_fastqc.html"),
+            sample=get_samples(wildcards, samples),
+        ), *expand(
+            fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_2_fastqc.html"),
+            sample=get_samples(wildcards, samples),
+        )]
+    else:
+        ret = expand(
+            fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_fastqc.html"),
+            sample=get_samples(wildcards, samples),
+        )
+    return ret
+
+
+def get_fastqc(wildcards):
+    s = get_samples(wildcards, samples)
+    if wildcards.serie in library_names_single:
+        ret = expand(
+            fastqc_raw_folder.joinpath(wildcards.serie, "{sample}_fastqc.html"),
+            sample=s,
+        )
+    else:
+        for m in supported_suffixes:
+            for ext in supported_extensions:
+                m1 = expand(
+                    os.path.join(
+                        raw_reads_folder, wildcards.serie, f"{wildcards.sample}{m[0]}.{ext}"
+                    ),
+                    sample=s,
+                )
+                m2 = expand(
+                    os.path.join(
+                        raw_reads_folder, wildcards.serie, f"{wildcards.sample}{m[1]}.{ext}"
+                    ),
+                    sample=s,
+                )
+
+                if all([os.path.exists(p) for p in m1 + m2]):
+                    ret = [
+                        *expand(fastqc_raw_folder.joinpath(wildcards.serie, f"{wildcards.sample}{m[0]}_fastqc.html"
+                        ),
+                        sample=s), 
+                        *expand(fastqc_raw_folder.joinpath(wildcards.serie, f"{wildcards.sample}{m[1]}_fastqc.html"),
+                        sample=s)]
+
+    return ret
diff --git a/workflow/include/coverage_tRNA.smk b/workflow/include/coverage_tRNA.smk
index 97479b8..f736339 100644
--- a/workflow/include/coverage_tRNA.smk
+++ b/workflow/include/coverage_tRNA.smk
@@ -15,21 +15,16 @@ rule mk_genome_tsv:
         log_folder.joinpath("mk_genome_tsv/{serie}/{sample}.log"),
     shell:
         """
-        samtools view -H {input[0]} | grep SQ | cut -f 2,3 | sed -r 's/(SN|LN)://g' | tr " " "\t" > {output}
+        samtools view -H {input} | grep SQ | cut -f 2,3 | sed -r 's/(SN|LN)://g' | tr " " "\t" > {output}
         """
 
-def get_tRNA_annotation_file(wildcards):
-    checkpoint_output = checkpoints.download_gtRNAdb.get(**wildcards).output[0]
-    bed_filename = glob_wildcards(os.path.join(checkpoint_output, "{x}.bed")).x[0]
-    return tRNA_annotation_dir.joinpath(f"{bed_filename}.bed")
-
 
 rule coverage_trna:
     input:
-        star_folder.joinpath("{serie}", "{sample}.Aligned.sortedByCoord.out.bam"),
-        star_folder.joinpath("{serie}", "{sample}.Aligned.sortedByCoord.out.bam.bai"),
-        star_folder.joinpath("{serie}", "{sample}.genome"),
-        get_tRNA_annotation_file,
+        bam=star_folder.joinpath("{serie}", "{sample}.Aligned.sortedByCoord.out.bam"),
+        bai=star_folder.joinpath("{serie}", "{sample}.Aligned.sortedByCoord.out.bam.bai"),
+        genome=star_folder.joinpath("{serie}", "{sample}.genome"),
+        annotation=get_tRNA_annotation_file,
     output:
         trna_coverage_folder.joinpath("{serie}", "{sample}.bed"),
     conda:
@@ -40,21 +35,14 @@ rule coverage_trna:
         """
         set -x 
         SORTED=$(mktemp)
-        JOINED=$(join -o 1.1 <(awk '{{print $1}}' {input[3]} | sort -u) <(awk '{{print $1}}' {input[2]} | sort -u)) 
-        awk -v j="$JOINED" '$1~j' {input[3]} | grep -v 'chrUn' | bedtools sort -faidx {input[2]} -i - > $SORTED
+        JOINED=$(join -o 1.1 <(awk '{{print $1}}' {input.genome} | sort -u) <(awk '{{print $1}}' {input.bai} | sort -u)) 
+        awk -v j="$JOINED" '$1~j' {input.genome} | grep -v 'chrUn' | bedtools sort -faidx {input.bai} -i - > $SORTED
         
-        bedtools coverage -g {input[2]} -sorted -a $SORTED -b {input[0]} | sort -k1,1 -k2,2n > {output}
+        bedtools coverage -g {input.genome} -sorted -a $SORTED -b {input.bam} | sort -k1,1 -k2,2n > {output}
         rm $SORTED
         """
 
 
-def get_trna_coverage(wildcards):
-    return expand(
-        trna_coverage_folder.joinpath("{{serie}}", "{sample}.bed"),
-        sample=get_samples(wildcards, samples),
-    )
-
-
 rule build_trna_coverage_matrix:
     input:
         get_trna_coverage,
diff --git a/workflow/include/deseq2.smk b/workflow/include/deseq2.smk
index fc79da7..3261f15 100644
--- a/workflow/include/deseq2.smk
+++ b/workflow/include/deseq2.smk
@@ -11,16 +11,6 @@ rule subset_gtf:
         "../scripts/subset_gtf_v1.R"
 
 
-def get_deseq2_test(wildcards):
-    deseq2_params = get_params(wildcards, "deseq2")
-    return deseq2_params["test"]
-
-
-def get_deseq2_variable(wildcards):
-    deseq2_params = get_params(wildcards, "deseq2")
-    return deseq2_params["variable"]
-
-
 rule deseq2:
     input:
         star_flag=star_folder.joinpath("{serie}.done"),
@@ -51,34 +41,34 @@ rule deseq2:
         "../scripts/deseq2_v1.R"
 
 
-rule deseq2_report:
-    input:
-        analysis_folder.joinpath("deseq2-{serie}.done"),
-        gaf_path,
-        ann_path=str(rdata_folder.joinpath("deseq2/ann.rds")),
-    output:
-        touch(analysis_folder.joinpath("deseq2-report-{serie}.done")),
-        # notebooks_folder.joinpath("{serie}/%s.html"%get_filename(deseq2_notebook_input_path, stem=True))
-    params:
-        dds_path=str(rdata_folder.joinpath("deseq2/{serie}/dds.rds")),
-        # ann_path = str(rdata_folder.joinpath("deseq2/ann.rds")),
-        notebook_path=deseq2_notebook_input_path,
-        # root_path = deseq2_working_directory,
-        output_dir=notebooks_folder,
-        annotation_type=config["genome"]["annotation_type"],
-    conda:
-        "../../env/R.yml"
-    log:
-        log_folder.joinpath("R/{serie}/deseq2_report.log"),
-    shell:
-        """
-         set -x
-         if [ -f {params.dds_path} ]; then
-            OUTPUT_DIR="{params.output_dir}/{wildcards.serie}"
-            R --no-echo --vanilla \
-            -e "rmarkdown::render('{params.notebook_path}', output_dir = '$OUTPUT_DIR', output_format = 'html_document', params = list(output_dir = '$OUTPUT_DIR', ann_path = '{input.ann_path}', dds_path = '{params.dds_path}', annotation_type = '{params.annotation_type}', gaf_path = '{input[1]}'))" |& \
-            tee {log}
-        else
-            exit 0
-        fi
-         """
+# rule deseq2_report:
+#     input:
+#         analysis_folder.joinpath("deseq2-{serie}.done"),
+#         gaf_path,
+#         ann_path=str(rdata_folder.joinpath("deseq2/ann.rds")),
+#     output:
+#         touch(analysis_folder.joinpath("deseq2-report-{serie}.done")),
+#         # notebooks_folder.joinpath("{serie}/%s.html"%get_filename(deseq2_notebook_input_path, stem=True))
+#     params:
+#         dds_path=str(rdata_folder.joinpath("deseq2/{serie}/dds.rds")),
+#         # ann_path = str(rdata_folder.joinpath("deseq2/ann.rds")),
+#         notebook_path=deseq2_notebook_input_path,
+#         # root_path = deseq2_working_directory,
+#         output_dir=notebooks_folder,
+#         annotation_type=config["genome"]["annotation_type"],
+#     conda:
+#         "../../env/R.yml"
+#     log:
+#         log_folder.joinpath("R/{serie}/deseq2_report.log"),
+#     shell:
+#         """
+#          set -x
+#          if [ -f {params.dds_path} ]; then
+#             OUTPUT_DIR="{params.output_dir}/{wildcards.serie}"
+#             R --no-echo --vanilla \
+#             -e "rmarkdown::render('{params.notebook_path}', output_dir = '$OUTPUT_DIR', output_format = 'html_document', params = list(output_dir = '$OUTPUT_DIR', ann_path = '{input.ann_path}', dds_path = '{params.dds_path}', annotation_type = '{params.annotation_type}', gaf_path = '{input[1]}'))" |& \
+#             tee {log}
+#         else
+#             exit 0
+#         fi
+#          """
diff --git a/workflow/include/download-references.smk b/workflow/include/download-references.smk
index af8901a..7c452e6 100644
--- a/workflow/include/download-references.smk
+++ b/workflow/include/download-references.smk
@@ -34,7 +34,6 @@ rule download_genome_annotation_file:
         "../scripts/download-gtf.sh"
 
 
-
 rule download_repeatmasker_annotation_file:
     output:
         rmsk_path,
@@ -60,7 +59,9 @@ checkpoint download_gtRNAdb:
     output:
         directory(tRNA_annotation_dir),
     params:
-        url=config["genome"]["gtrnadb_url"]
+        url=config["genome"]["gtrnadb_url"],
+    log:
+        log_folder.joinpath("download/gtrnadb.log"),    
     conda:
         "../../env/wget.yml"
     shell:
@@ -68,10 +69,11 @@ checkpoint download_gtRNAdb:
         mkdir -p {output}
         cd {output}
         F=$(basename {params.url})
-        wget -q {params.url}
-        tar xf $F
+        wget -q {params.url} |& tee {log}
+        tar xvf $F |& tee -a {log}
         """
 
+
 rule download_gaf_file:
     output:
         gaf_path,
diff --git a/workflow/include/fastqc.smk b/workflow/include/fastqc.smk
index b19149b..354d7d9 100644
--- a/workflow/include/fastqc.smk
+++ b/workflow/include/fastqc.smk
@@ -2,7 +2,7 @@
 
 rule fastqc_raw:
     input:
-        get_fastq
+        get_fastq,
     output:
         fastqc_raw_folder.joinpath("{serie}", "{sample}_fastqc.zip"),
         fastqc_raw_folder.joinpath("{serie}", "{sample}_fastqc.html"),
@@ -19,9 +19,10 @@ rule fastqc_raw:
         fastqc -t {threads} -noextract -o {params.fastqc_folder} {input}
         """
 
+
 use rule fastqc_raw as fastqc_raw_pe with:
     input:
-        unpack(get_fastq_paired)
+        unpack(get_fastq_paired),
     output:
         fastqc_raw_folder.joinpath("{serie}", "{sample}_1_fastqc.zip"),
         fastqc_raw_folder.joinpath("{serie}", "{sample}_1_fastqc.html"),
@@ -33,49 +34,6 @@ use rule fastqc_raw as fastqc_raw_pe with:
         log_folder.joinpath("fastqc/{serie}/{sample}.log"),
 
 
-# rule fastqc_raw_pe:
-#     input:
-#         unpack(get_fastq_paired)
-#     output:
-#         fastqc_raw_folder.joinpath("{serie}", "{sample}_1_sequence.fq.gz_fastqc.zip"),
-#         fastqc_raw_folder.joinpath("{serie}", "{sample}_1_sequence.fq.gz_fastqc.html"),
-#         fastqc_raw_folder.joinpath("{serie}", "{sample}_2_sequence.fq.gz_fastqc.zip"),
-#         fastqc_raw_folder.joinpath("{serie}", "{sample}_2_sequence.fq.gz_fastqc.html"),
-#     params:
-#         fastqc_folder=fastqc_raw_folder,
-#     threads: 2
-#     conda:
-#         "../../env/qc.yml"
-#     log:
-#         log_folder.joinpath("fastqc/{serie}/{sample}.log"),
-#     shell:
-#         """
-#         set -x
-#         fastqc -t {threads} -noextract -o {params.fastqc_folder}/{wildcards.serie} {input.m1} {input.m2}
-#         """
-
-
-def get_fastqc(wildcards):
-    s = get_samples(wildcards, samples)
-    if wildcards.serie in library_names_single:
-        ret = expand(
-            fastqc_raw_folder.joinpath(wildcards.serie, "{sample}_fastqc.html"),
-            sample=s,
-        )
-    else:
-        for m in supported_suffixes:
-            for ext in supported_extensions:
-                m1 = expand(os.path.join(raw_reads_folder, wildcards.serie, "{sample}" + f"{m[0]}.{ext}"), sample = s)
-                m2 = expand(os.path.join(raw_reads_folder, wildcards.serie, "{sample}" + f"{m[1]}.{ext}"), sample = s)
-                
-                if all([os.path.exists(p) for p in m1 + m2]):
-                    ret = expand(
-                        fastqc_raw_folder.joinpath(wildcards.serie, "{sample}" + f"{m[0]}_fastqc.html"), sample=s
-                        ) + expand(
-                            fastqc_raw_folder.joinpath(wildcards.serie, "{sample}" + f"{m[1]}_fastqc.html"), sample=s
-                        )
-                                    
-    return ret
 
 
 rule multiqc_raw:
diff --git a/workflow/include/make_bw.smk b/workflow/include/make_bw.smk
index a1ed0da..3b9c842 100644
--- a/workflow/include/make_bw.smk
+++ b/workflow/include/make_bw.smk
@@ -1,7 +1,7 @@
 rule coverage:
     input:
-        star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam"),
-        star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam.bai"),
+        bam=star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam"),
+        bai=star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam.bai"),
     output:
         star_folder.joinpath("{serie}", "{sample}.bw"),
     conda:
@@ -14,7 +14,7 @@ rule coverage:
     shell:
         """
 
-        bamCoverage -b {input[0]} \
+        bamCoverage -b {input.bam} \
         -o {output} -of bigwig \
         -p {threads} \
         --effectiveGenomeSize 2652783500 \
diff --git a/workflow/include/picard_markdup.smk b/workflow/include/picard_markdup.smk
index 4523c0f..fea12de 100644
--- a/workflow/include/picard_markdup.smk
+++ b/workflow/include/picard_markdup.smk
@@ -4,8 +4,8 @@ rule picard_markdup:
     input:
         star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam"),
     output:
-        markdup_folder.joinpath("{serie}/{sample}.markdup.bam"),
-        markdup_folder.joinpath("{serie}/{sample}.markdup.stats.txt"),
+        bam=markdup_folder.joinpath("{serie}/{sample}.markdup.bam"),
+        stats=markdup_folder.joinpath("{serie}/{sample}.markdup.stats.txt"),
     log:
         log_folder.joinpath("picard/{serie}/{sample}.log"),
     threads: 2
@@ -17,8 +17,8 @@ rule picard_markdup:
 
         picard MarkDuplicates \
         I={input} \
-        O={output[0]} \
-        M={output[1]} |& \
+        O={output.bam} \
+        M={output.stats} |& \
         tee {log}
         """
 
@@ -49,20 +49,6 @@ rule fastqc_markdup:
         """
 
 
-def get_markdup_bam(wildcards):
-    return expand(
-        markdup_folder.joinpath("{{serie}}/{sample}.markdup.bam"),
-        sample=get_samples(wildcards, samples),
-    )
-
-
-def get_markdup_fastqc(wildcards):
-    return expand(
-        fastqc_markdup_folder.joinpath("{{serie}}", "{sample}.markdup_fastqc.html"),
-        sample=get_samples(wildcards, samples),
-    )
-
-
 rule multiqc_markdup:
     input:
         get_markdup_bam,
diff --git a/workflow/include/salmonTE.smk b/workflow/include/salmonTE.smk
index f32ad34..b5c24c7 100644
--- a/workflow/include/salmonTE.smk
+++ b/workflow/include/salmonTE.smk
@@ -9,86 +9,24 @@ rule edit_condition_file:
     output:
         touch(data_folder.joinpath("salmonTE/quant/{serie}/edit_condition.done")),
     params:
-        variable=lambda wildcards: get_deseq2_variable(wildcards)
+        variable=lambda wildcards: get_deseq2_variable(wildcards),
     log:
         log_folder.joinpath("salmonTE/{serie}/edit_condition.log"),
-    run:
-        import pandas as pd
-        from pathlib import Path
+    conda:
+        "../../env/pandas.yml"
+    script:
+        "../scripts/edit_condition_file.py"
 
-        serie_folder = Path(input[0])
-        sample_sheet_path = Path(input[1])
-
-        salmon_condition_sheet = pd.read_csv(serie_folder.joinpath("condition.csv"))
-        salmon_condition_sheet = salmon_condition_sheet.set_index("SampleID")
-
-        sample_sheet = pd.read_csv(sample_sheet_path)
-
-        if "filename" in sample_sheet.columns:
-            sample_sheet = sample_sheet.set_index("filename")
-        else:
-            if all(sample_sheet.filename_1.apply(lambda x: x.endswith("_sequence"))):
-                sample_sheet["tmp"] = sample_sheet.filename_1.apply(
-                    lambda x: x.split("_sequence")[0]
-                )
-                sample_sheet = sample_sheet.set_index("tmp")
-            else:
-                sample_sheet = sample_sheet.set_index("name")
-
-        sample_sheet = sample_sheet.reindex(index=salmon_condition_sheet.index)
-
-        joined = sample_sheet.join(salmon_condition_sheet)
-        
-        levels = joined[params.variable].tolist()
-        levels = list(set(levels))
-        control_level = levels[0]
-
-        joined["condition"] = joined.apply(
-            lambda row: "control" if row[params.variable] == control_level else "treatment", axis=1
-        )
-        joined = joined.reset_index()
-
-        out = joined[["SampleID", "condition"]]
-        out.to_csv(serie_folder.joinpath("condition.csv"), index=False)
-
-
-def get_salmonTE_quant_input(wildcards):
-    if wildcards.serie in library_names_single:
-        s = samples["single"][wildcards.serie]
-    else:
-        s = [ f"{s}{mate}" for s in samples["paired"][wildcards.serie] for mate in ["_1", "_2"] ]
-    for extension in supported_extensions:
-        candidates = expand(raw_reads_folder.joinpath(wildcards.serie, "{sample}" + f".{extension}"), sample = s)
-        if all([os.path.exists(f) for f in candidates]):
-            break
-
-    return candidates
-
-def set_salmonTE_genome():
-    genome_label = config["genome"]["label"][:2]
-    salmon_label = ""
-    if genome_label == "mm":
-        salmon_label = "mm"
-    elif genome_label == "hg":
-        salmon_label = "hs"
-    elif genome_label == "dr":
-        salmon_label = "dr"
-    elif genome_label == "dm":
-        salmon_label = "dm"
-    else:
-        raise ValueError(f'Unsupported genome label: {config["genome"]["label"]}')
-    return salmon_label
 
 checkpoint salmonTE_quant:
     input:
-        get_salmonTE_quant_input
+        get_salmonTE_quant_input,
     output:
-        directory(salmonTE_folder.joinpath("quant/{serie}")),
-        salmonTE_folder.joinpath("quant/{serie}/EXPR.csv"),
-        salmonTE_folder.joinpath("quant/{serie}/MAPPING_INFO.csv"),
-        salmonTE_folder.joinpath("quant/{serie}/clades.csv"),
-        salmonTE_folder.joinpath("quant/{serie}/condition.csv"),
-
+        outfolder=directory(salmonTE_folder.joinpath("quant/{serie}")),
+        expr=salmonTE_folder.joinpath("quant/{serie}/EXPR.csv"),
+        mapping_info=salmonTE_folder.joinpath("quant/{serie}/MAPPING_INFO.csv"),
+        clades=salmonTE_folder.joinpath("quant/{serie}/clades.csv"),
+        condition=salmonTE_folder.joinpath("quant/{serie}/condition.csv"),
     params:
         # 13/10/2020 available references: hs mm dr dm
         # https://github.com/LiuzLab/SalmonTE#running-the-quant-mode-to-collect-te-expressions
@@ -115,7 +53,7 @@ checkpoint salmonTE_quant:
 
         python /opt/SalmonTE/SalmonTE.py quant \
         --reference={params.reference_genome} \
-        --outpath={output[0]} \
+        --outpath={output.outfolder} \
         --num_threads={threads} $I |& \
         tee {log}
         """
@@ -123,8 +61,8 @@ checkpoint salmonTE_quant:
 
 rule salmonTE_test:
     input:
-        salmonTE_folder.joinpath("quant/{serie}"),
-        salmonTE_folder.joinpath("quant/{serie}/edit_condition.done"),
+        infolder=salmonTE_folder.joinpath("quant/{serie}"),
+        condition_file=salmonTE_folder.joinpath("quant/{serie}/edit_condition.done"),
     output:
         directory(salmonTE_folder.joinpath("de_analysis/{serie}")),
     log:
@@ -134,7 +72,7 @@ rule salmonTE_test:
     shell:
         """
         python /opt/SalmonTE/SalmonTE.py test \
-        --inpath={input[0]} \
+        --inpath={input.infolder} \
         --outpath={output} \
         --tabletype=csv \
         --figtype=png \
diff --git a/workflow/include/star.smk b/workflow/include/star.smk
index f2f145a..f818ad2 100644
--- a/workflow/include/star.smk
+++ b/workflow/include/star.smk
@@ -32,7 +32,7 @@ rule star_genome_preparation:
 
 rule star:
     input:
-        get_star_input,
+        bam=get_star_input,
         star_index_folder=references_folder.joinpath("STAR"),
         genome_annotation_file=gtf_path,
     output:
@@ -60,12 +60,6 @@ rule star:
          set -x
          TMP_FOLDER=$(mktemp -u -p {params.tmp_folder})
 
-         if [ {params.libtype} == "SINGLE" ]; then
-            INPUTARG="{input[0]}"
-         else
-            INPUTARG="{input[0]} {input[1]}"
-         fi
-
          STAR --quantMode TranscriptomeSAM GeneCounts \
          --outTmpDir $TMP_FOLDER \
          --outSAMtype BAM SortedByCoordinate \
@@ -76,7 +70,7 @@ rule star:
          --genomeDir {input.star_index_folder} \
          --readFilesCommand zcat \
          --outFileNamePrefix {params.alignments_folder}/{wildcards.serie}/{wildcards.sample}. \
-         --readFilesIn $INPUTARG \
+         --readFilesIn {input.bam} \
          --limitBAMsortRAM {params.mem_mb} \
          --genomeLoad NoSharedMemory \
          --outSAMunmapped Within \
@@ -124,21 +118,6 @@ rule fastqc_star:
         """
 
 
-def get_star_stats(wildcards):
-    return expand(
-        star_folder.joinpath("{{serie}}/{sample}.Log.final.out"),
-        sample=get_samples(wildcards, samples),
-    )
-
-
-def get_star_fastqc(wildcards):
-    return expand(
-        fastqc_star_folder.joinpath(
-            "{{serie}}", "{sample}.Aligned.sortedByCoord.out_fastqc.html"
-        ),
-        sample=get_samples(wildcards, samples),
-    )
-
 rule verify_star:
     input:
         lambda wildcards: expand(
diff --git a/workflow/include/starTE.smk b/workflow/include/starTE.smk
index 15e06f8..9b2075b 100644
--- a/workflow/include/starTE.smk
+++ b/workflow/include/starTE.smk
@@ -1,6 +1,6 @@
 rule starTE_random:
     input:
-        get_star_input,
+        bam=get_star_input,
         star_index_folder=references_folder.joinpath("STAR"),
     output:
         starTE_folder.joinpath("{serie}/random/{sample}.Aligned.out.bam"),
@@ -23,12 +23,6 @@ rule starTE_random:
          echo 'tmp: $TMP_FOLDER'
          sleep 10
          
-         if [ {params.libtype} == "SINGLE" ]; then
-            INPUTARG="{input[0]}"
-         else
-            INPUTARG="{input[0]} {input[1]}"
-         fi
-
          STAR \
             --outSAMtype BAM Unsorted \
             --runMode alignReads \
@@ -53,7 +47,7 @@ rule starTE_random:
             --runThreadN {threads} \
             --genomeDir {input.star_index_folder} \
             --outFileNamePrefix {params.alignments_folder}/{wildcards.serie}/random/{wildcards.sample}. \
-            --readFilesIn $INPUTARG \
+            --readFilesIn {input.bam} \
             --limitBAMsortRAM {params.mem_mb} \
             --outBAMcompression -1 |& \
          tee {log}
@@ -61,6 +55,7 @@ rule starTE_random:
          [[ -d $TMP_FOLDER ]] && rm -r $TMP_FOLDER || exit 0
          """
 
+
 rule featureCounts_random:
     input:
         bam=lambda wildcards: expand(
@@ -84,6 +79,7 @@ rule featureCounts_random:
          featureCounts -M -F GTF -T {threads} -s 0 -a {input.annotation} -o {output} {input.bam}
          """
 
+
 rule starTE_multihit:
     input:
         get_star_input,
@@ -108,13 +104,7 @@ rule starTE_multihit:
          TMP_FOLDER=$(mktemp -u -p {params.tmp_folder})
          echo 'tmp: $TMP_FOLDER'
          sleep 10
-         
-         if [ {params.libtype} == "SINGLE" ]; then
-            INPUTARG="{input[0]}"
-         else
-            INPUTARG="{input[0]} {input[1]}"
-         fi
-         
+                  
          STAR \
             --outSAMtype BAM Unsorted \
             --runMode alignReads \
@@ -137,7 +127,7 @@ rule starTE_multihit:
             --genomeDir {input.star_index_folder} \
             --readFilesCommand zcat \
             --outFileNamePrefix {params.alignments_folder}/{wildcards.serie}/multihit/{wildcards.sample}. \
-            --readFilesIn $INPUTARG \
+            --readFilesIn {input.bam} \
             --limitBAMsortRAM {params.mem_mb} \
             --genomeLoad NoSharedMemory \
             --outBAMcompression -1 |& \
@@ -146,6 +136,7 @@ rule starTE_multihit:
          [[ -d $TMP_FOLDER ]] && rm -r $TMP_FOLDER || exit 0
          """
 
+
 rule featureCounts_multihit:
     input:
         bam=lambda wildcards: expand(
diff --git a/workflow/include/trim_single.smk b/workflow/include/trim_single.smk
index b20ba79..dc3e2e3 100644
--- a/workflow/include/trim_single.smk
+++ b/workflow/include/trim_single.smk
@@ -1,10 +1,13 @@
 ruleorder: trimmomatic_pe > trimmomatic_se
 
+
 rule trimmomatic_pe:
     wildcard_constraints:
-        serie="|".join(library_names_paired if len(library_names_paired) > 0 else ["none"]),
+        serie="|".join(
+            library_names_paired if len(library_names_paired) > 0 else ["none"]
+        ),
     input:
-        unpack(get_fastq_paired)
+        unpack(get_fastq_paired),
     output:
         paired1=trim_reads_folder.joinpath("{serie}", "{sample}_1.fastq.gz"),
         paired2=trim_reads_folder.joinpath("{serie}", "{sample}_2.fastq.gz"),
@@ -30,11 +33,14 @@ rule trimmomatic_pe:
         {params} |& tee {output.stats}
         """
 
+
 rule trimmomatic_se:
     wildcard_constraints:
-        serie="|".join(library_names_single if len(library_names_single) > 0 else ["none"]),
+        serie="|".join(
+            library_names_single if len(library_names_single) > 0 else ["none"]
+        ),
     input:
-        get_fastq
+        get_fastq,
     output:
         fastq=trim_reads_folder.joinpath("{serie}", "{sample}.fastq.gz"),
         summary=trim_reads_folder.joinpath("{serie}", "{sample}.summary.txt"),
@@ -57,25 +63,11 @@ rule trimmomatic_se:
         """
 
 
-
-
-def get_trimmed_fastq(wildcards):
-    if wildcards.serie in library_names_single:
-        return trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}.fastq.gz")
-    elif wildcards.serie in library_names_paired:
-        return [
-            trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}_1.fastq.gz"),
-            trim_reads_folder.joinpath(wildcards.serie, f"{wildcards.sample}_2.fastq.gz")
-        ]
-    else:
-        raise ValueError(f"Could not determine protocol type for {wildcards.serie}")
-
-
-rule fastqc_trim:        
+rule fastqc_trim:
     wildcard_constraints:
         serie="|".join(library_names_single),
     input:
-        get_trimmed_fastq        
+        get_trimmed_fastq,
     output:
         fastqc_trim_folder.joinpath("{serie}", "{sample}_fastqc.zip"),
         fastqc_trim_folder.joinpath("{serie}", "{sample}_fastqc.html"),
@@ -120,31 +112,6 @@ rule fastqc_trim_pe:
         """
 
 
-def get_trimmomatic_stats(wildcards):
-    ret = expand(
-        trim_reads_folder.joinpath(wildcards.serie, "{sample}.stats.txt"),
-        sample=get_samples(wildcards, samples),
-    )
-    return ret
-
-def get_trimmed_fastqc(wildcards):
-    s = wildcards.serie
-    if s in library_names_paired:
-        ret = expand(
-            fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_1_fastqc.html"),
-            sample=get_samples(wildcards, samples),
-        ) + expand(
-            fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_2_fastqc.html"),
-            sample=get_samples(wildcards, samples),
-        )
-    else:
-        ret = expand(
-            fastqc_trim_folder.joinpath(wildcards.serie, "{sample}_fastqc.html"),
-            sample=get_samples(wildcards, samples),
-        )
-    return ret
-
-
 rule multiqc_trim:
     input:
         get_trimmomatic_stats,
diff --git a/workflow/scripts/edit_condition_file.py b/workflow/scripts/edit_condition_file.py
new file mode 100644
index 0000000..b5c7bc2
--- /dev/null
+++ b/workflow/scripts/edit_condition_file.py
@@ -0,0 +1,40 @@
+import pandas as pd
+from pathlib import Path
+
+serie_folder = Path(snakemake.input[0])
+sample_sheet_path = Path(snakemake.input[1])
+
+salmon_condition_sheet = pd.read_csv(serie_folder.joinpath("condition.csv"))
+salmon_condition_sheet = salmon_condition_sheet.set_index("SampleID")
+
+sample_sheet = pd.read_csv(sample_sheet_path)
+
+if "filename" in sample_sheet.columns:
+    sample_sheet = sample_sheet.set_index("filename")
+else:
+    if all(sample_sheet.filename_1.apply(lambda x: x.endswith("_sequence"))):
+        sample_sheet["tmp"] = sample_sheet.filename_1.apply(
+            lambda x: x.split("_sequence")[0]
+        )
+        sample_sheet = sample_sheet.set_index("tmp")
+    else:
+        sample_sheet = sample_sheet.set_index("name")
+
+sample_sheet = sample_sheet.reindex(index=salmon_condition_sheet.index)
+
+joined = sample_sheet.join(salmon_condition_sheet)
+
+levels = joined[params.variable].tolist()
+levels = list(set(levels))
+control_level = levels[0]
+
+joined["condition"] = joined.apply(
+    lambda row: "control"
+    if row[params.variable] == control_level
+    else "treatment",
+    axis=1,
+)
+joined = joined.reset_index()
+
+out = joined[["SampleID", "condition"]]
+out.to_csv(serie_folder.joinpath("condition.csv"), index=False)