-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsort.smk
87 lines (76 loc) · 2.79 KB
/
sort.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
This part of the workflow handles sorting downloaded sequences and metadata
into a and b by aligning them to reference sequences.
It produces output files as
metadata = "data/{type}/metadata.tsv"
sequences = "data/{type}/sequences.fasta"
"""
rule sort:
input:
sequences = rules.transform.output.sequences
output:
"data/a/sequences.fasta",
"data/b/sequences.fasta"
shell:
'''
seqkit rmdup {input.sequences} | \
nextclade3 sort - --output-dir tmp
mv tmp/nextstrain/rsv/b/sequences.fasta data/b/sequences.fasta
mv tmp/nextstrain/rsv/a/sequences.fasta data/a/sequences.fasta
rm -r tmp
'''
rule metadata:
input:
metadata = rules.transform.output.metadata,
sequences = "data/{type}/sequences.fasta"
output:
metadata = "data/{type}/metadata_raw.tsv"
run:
import pandas as pd
from Bio import SeqIO
strains = [s.id for s in SeqIO.parse(input.sequences, 'fasta')]
d = pd.read_csv(input.metadata, sep='\t', index_col='accession').loc[strains].drop_duplicates()
d.to_csv(output.metadata, sep='\t')
rule nextclade_dataset:
output:
ref_a = "rsv-a_nextclade/reference.fasta",
ref_b = "rsv-b_nextclade/reference.fasta"
params:
dataset_a = "nextstrain/rsv/a/EPI_ISL_412866",
dataset_b = "nextstrain/rsv/b/EPI_ISL_1653999"
shell:
"""
nextclade3 dataset get -n {params.dataset_a} --output-dir rsv-a_nextclade
nextclade3 dataset get -n {params.dataset_b} --output-dir rsv-b_nextclade
"""
rule nextclade:
input:
sequences = "data/{type}/sequences.fasta",
ref = "rsv-a_nextclade/reference.fasta"
output:
nextclade = "data/{type}/nextclade.tsv"
params:
dataset = "rsv-{type}_nextclade",
output_columns = "clade qc.overallScore qc.overallStatus alignmentScore alignmentStart alignmentEnd coverage dynamic"
threads: 8
shell:
"""
nextclade3 run -D {params.dataset} -j {threads} \
--output-columns-selection {params.output_columns} \
--output-tsv {output.nextclade} \
{input.sequences}
"""
rule extend_metadata:
input:
nextclade = "data/{type}/nextclade.tsv",
metadata = "data/{type}/metadata_raw.tsv"
output:
metadata = "data/{type}/metadata.tsv"
shell:
"""
python3 bin/extend-metadata.py --metadata {input.metadata} \
--id-field accession \
--virus-type {wildcards.type} \
--nextclade {input.nextclade} \
--output {output.metadata}
"""