-
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Snakefile-ref
163 lines (146 loc) · 8.14 KB
/
Snakefile-ref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import pandas as pd
# Import config, set working dir, load manifest and define default rules:
include: "Snakefile-base"
references = pd.read_csv("../ref.cfg", sep="\t").set_index(["refgroup", "refname"], drop=False)
GENOMES = True
rule finish_zymo:
input:
base="reticulated.base.ok", # This will ensure the base rules have finished first
#checkm=expand("checkm-{assembly}.txt", assembly=enumerate_assemblies(base_only=False, unroll=True, suffix=".fa")),
fastmer="fastmer-result.txt",
reads_mapped=enumerate_reads(suffix=".map.stat.summary.tsv", cols=["ont"], refgroup=True),
output: touch("reticulated.zymo.ok")
shell: 'echo "Reticulated successfully."'
def retic_get_ref(refgroup, genome):
r = references.loc[refgroup]
refname = r["refname"].get(genome, 'default')
refloc = r.loc[refname]["ref"] if r.loc[refname]["ref"] != '-' else None
if refloc and refloc.startswith("HTTP"):
refloc = "download/refs/%s/%s.fa" % (refgroup, refname)
return refloc
#from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
#HTTP = HTTPRemoteProvider()
rule download_ref:
#input: lambda w: HTTP.remote(references.loc[w.ref.replace(".fa", "")]["ref"].replace("HTTP:", ""), keep_local=True)
params:
remote=lambda w: references.loc[w.refgroup, w.ref]["ref"].replace("HTTP:", "")
output: "download/refs/{refgroup,[A-z0-9_-]*}/{ref}.fa"
run:
shell("wget {params.remote} -O {output}")
def get_refgroup_by_uuid(uuid):
return samples.loc[uuid]["refgroup"]
def get_ref_paths_by_group(refgroup):
return [retic_get_ref(refgroup, ref) for ref in references.loc[refgroup]["refname"].values.tolist()]
def get_ref_names_by_group(refgroup):
return references.loc[refgroup]["refname"].values.tolist() # ffs pandas
def get_ref_tids_by_group(refgroup):
return references.loc[refgroup]["ncbi_tid"].values.tolist()
#TODO WIll want to perhaps genericise this for multiple ref sets
rule merge_refs:
input: lambda w: get_ref_paths_by_group(w.refgroup)
output: "{refgroup,[A-z0-9_-]*}.super_ref.fa"
params:
refnames=lambda w: get_ref_names_by_group(w.refgroup)
shell: "REFS=({input}); NAMES=({params.refnames}); for ((i=0;i<${{#REFS[@]}};++i)); do sed \"s,^>,>${{NAMES[i]}}__,\" ${{REFS[i]}}; done > {output}"
rule map_reads:
input:
ref="{refgroup}.super_ref.fa",
reads="{path}"
params:
samplename=lambda w: get_samplename_from_readpath(w.path),
output:
"{path}.{refgroup,[A-z0-9_-]*}.map.stat"
threads: lambda w: n_threads(None, "minimap2_threads")
shell: "minimap2 -a -2 --sam-hit-only --secondary=no -Q -t {threads} -x map-ont {input.ref} {input.reads} | python ../scripts/zymo/nickloman/bamstats.py - | awk '{{print \"{params.samplename}\t\" \"{wildcards.refgroup}\t\" $0}}' | bond - ../reads.cfg --dheader samplename,refgroup,refname,contig,readid,alen --dropid | bond - ../ref.cfg --dcol refgroup,refname --mcol refgroup,refname --dropid > {output}"
rule map_reads_summary:
conda: "environments/r.yaml"
input: "{path}.{refgroup}.map.stat"
output: "{path}.{refgroup,[A-z0-9_-]*}.map.stat.sumr.tsv"
shell: "Rscript ../scripts/zymo/nickloman/summariseStats.R {input} {output}"
rule bond_map_reads_summary:
input: "{path}.{refgroup}.map.stat.sumr.tsv"
output: "{path}.{refgroup,[A-z0-9_-]*}.map.stat.summary.tsv"
shell: "bond {input} ../reads.cfg --dropid > {output}"
rule extract_kraken_contigs:
input:
assembly="{uuid}.{assembly}",
k2="{uuid}.{assembly}.k2",
params:
refnames=lambda w: get_ref_names_by_group(get_refgroup_by_uuid(w.uuid))
output:
directory("extracted_contigs/{uuid,[A-z0-9_-]*}.{assembly}/")
shell:
"python ../scripts/extract_contigs_with_kraken.py {input.k2} {input.assembly} {output}; python ../scripts/zymo/ensure_genomes.py {output} {params.refnames}"
#rule presetup_checkm:
# output: directory("checkm")
# shell: "mkdir checkm; cd checkm; wget https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz; tar xvf checkm_data_2015_01_16.tar.gz"
#
#rule setup_checkm:
# conda: "environments/checkm.yaml"
# input: directory("checkm")
# output: touch("checkm_setup.ok")
# shell: "echo 'checkm/' | checkm data setRoot 'checkm/'"
#
#def checkm_pick_taxon(w):
# refname = references["refname"].get(w.genome, 'default')
# return {
# "rank": references.loc[refname]["rank"],
# "genome": references.loc[refname]["taxon"]
# }
#
## This rule is a little annoying, we *should* be explicitly naming the input
## FASTA needed for the checkM bin as extracted by our kraken script.
## Unfortunately, as the extract_kraken_contigs rule output is defined as a dictionary
## we cannot explicitly name an input inside that directory without a ChildIOException.
## That rule has a python script that attempts to guarantee those files exist, so
## as long as the directory exists, we can be reasonably confident the FASTA do too.
## As gross as it sounds, the requisite `cp` will fail if it *really* doesn't exist.
## So this isn't a particularly unsafe workaround, just a gross one.
#rule checkm:
# input:
# ok="checkm_setup.ok",
# d=directory("extracted_contigs/{assembly}/")
# #fa="extracted_contigs/{assembly}/{genome}.fasta"
# output:
# working=directory("checkm-working/{assembly}/{genome}"),
# bin=directory("checkm-bins/{assembly}/{genome}/"),
# res="checkm-results/{assembly}/{genome}.tsv"
# params: p=checkm_pick_taxon
# conda: "environments/checkm.yaml"
# threads: 8
# shell: "cp extracted_contigs/{wildcards.assembly}/{wildcards.genome}.fasta {output.bin}; checkm taxonomy_wf -q -t {threads} -x fasta {params.p[rank]} {params.p[genome]:q} {output.bin} {output.working} > {output.res}"
#
#rule merge_checkm:
# input:
# logs=expand("checkm-results/{{assembly}}/{genome}.tsv", genome=GENOMES)
# output:
# "checkm-{assembly}.txt"
# shell:
# "cat {input.logs} > {output}"
rule jts_fastmer:
input:
reference=lambda w: retic_get_ref(get_refgroup_by_uuid(w.uuid), w.genome),
assembly="{uuid}.{assembly}",
output:
res="fastmer-results/{uuid,[A-z0-9_-]*}.{assembly}/{genome}.tsv",
bam=temp("{uuid,[A-z0-9_-]*}.{assembly}.{genome}.assembly_analysis.sorted.bam"),
bai=temp("{uuid,[A-z0-9_-]*}.{assembly}.{genome}.assembly_analysis.sorted.bam.bai"),
resources:
minlen=lambda w, attempt: [50000, 10000, 5000, 0][min(attempt,4) - 1] # Set the default minlen on the first attempt, then give up with 0 on attempt 4+
threads: 4
shell: "python ../scripts/zymo/jts/fastmer.py --sort-flags '%s' --sort-threads {threads} --temp-bam {output.bam} --reference {input.reference} --assembly {input.assembly} --min-mapping-quality 10 --min-alignment-length {resources.minlen} | bond - ../manifest.cfg --greedy --append suffix:{wildcards.assembly},refname:{wildcards.genome} > {output.res}" % (config["sort_flags"])
rule merge_fastmer:
input:
#logs=lambda w: expand("fastmer-results/{{uuid}}.{{assembly}}/{genome}.tsv", genome=[ref for ref in get_ref_names_by_group(get_refgroup_by_uuid(w.uuid)) if retic_get_ref(get_refgroup_by_uuid(w.uuid), ref)])
logs=lambda w: ["fastmer-results/{uuid}.{assembly}/%s.tsv" % g for g in [ref for ref in get_ref_names_by_group(get_refgroup_by_uuid(w.uuid)) if get_refgroup_by_uuid(w.uuid) != '-' and retic_get_ref(get_refgroup_by_uuid(w.uuid), ref)]]
output:
"fastmer-{uuid,[A-z0-9_-]*}.{assembly}.txt"
shell:
"cat {input.logs} | awk '!/^assembly_name/ || ++n < 2' > {output}"
rule super_merge_fastmer:
input:
logs=expand("fastmer-{uuid}.{assembly}.txt", zip, uuid=[x.split('.')[0] for x in enumerate_assemblies(base_only=False, unroll=True, suffix=".fa", refgroup=True)], assembly=[x.split('.', 1)[1] for x in enumerate_assemblies(base_only=False, unroll=True, suffix=".fa", refgroup=True)]) # what the fuck
output:
"fastmer-result.txt"
shell:
"cat {input.logs} | awk '!/^assembly_name/ || ++n < 2' | bond - ../reads.cfg --dcol samplename --dropid | bond - ../ref.cfg --dcol refgroup,refname --mcol refgroup,refname --dropid > {output}"