Skip to content

Commit

Permalink
Merge pull request #23 from metagenlab/nj/resistance
Browse files Browse the repository at this point in the history
Add antimicrobial resistance annotations
  • Loading branch information
njohner authored Dec 21, 2023
2 parents 29e0fa6 + 8f56624 commit 3c6e68f
Show file tree
Hide file tree
Showing 13 changed files with 374 additions and 158 deletions.
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@ asset*
.nextflow*
singularity/
work/
zdb/
zdb/results
zdb/assets
zdb/gunicorn/gunicorn.log
zdb/gunicorn/gunicorn.pid
zdb/nginx/*.log
zdb/nginx/nginx_curr.config
zdb/nginx/var
zdb_ref/
env
docs/_build/
56 changes: 54 additions & 2 deletions annotation_pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,8 @@ nr_seqs.splitFasta( by: 300, file: "chunk_" )
to_blast_swissprot
to_diamond_refseq
to_kofamscan
to_pfam_scan }
to_pfam_scan
to_amr_scan }


if(params.pfam) {
Expand Down Expand Up @@ -599,6 +600,29 @@ if(params.ko) {
}


if(params.amr) {

process execute_amrscan {
container "$params.ncbi_amr_container"
conda "$baseDir/conda/ncbi_amr.yaml"

input:
file(seq) from to_amr_scan

output:
file "amrfinder_results*.tab" into amr_table

script:
n = seq.name
"""
amrfinder --plus -p ${n} > amrfinder_results_${n}.tab
"""
}
} else {
Channel.value("void").set { amr_table }
}


process setup_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -887,7 +911,7 @@ process load_swissprot_hits_into_db {
file swissprot_db from Channel.fromPath("$params.swissprot_db/swissprot.fasta")

output:
file db into to_create_index
file db into to_load_amr_db

script:
if(params.blast_swissprot)
Expand All @@ -906,6 +930,34 @@ process load_swissprot_hits_into_db {
"""
}

process load_amr_into_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"

input:
file collected_amr_files from amr_table.collect()
file db from to_load_amr_db

output:
file db into to_create_index

script:
if(params.amr)
"""
#!/usr/bin/env python
import setup_chlamdb
kwargs = ${gen_python_args()}
amr_files = "${collected_amr_files}".split()
setup_chlamdb.load_amr(kwargs, amr_files, "$db")
"""
else
"""
echo \"Not supposed to load COG, passing\"
"""
}


process create_chlamdb_search_index {
container "$params.annotation_container"
Expand Down
88 changes: 56 additions & 32 deletions bin/setup_chlamdb.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
import os
import sys
import re

from metagenlab_libs import db_utils
from metagenlab_libs.chlamdb import search_bar
from metagenlab_libs import KO_module
import sys
from collections import defaultdict, namedtuple

import pandas as pd

from Bio import SeqIO
from Bio import AlignIO
from Bio import SeqUtils

from collections import namedtuple
from collections import defaultdict
from Bio import AlignIO, SeqIO, SeqUtils
from metagenlab_libs import KO_module, db_utils
from metagenlab_libs.chlamdb import search_bar
from zdb.database.database import DB


# assumes orthofinder named: OG000N
Expand Down Expand Up @@ -55,7 +49,7 @@ def is_plasmid(record):


def load_gbk(gbks, args, db_file):
db = db_utils.DB.load_db(db_file, args)
db = DB.load_db(db_file, args)
data = []

bioentry_plasmids = []
Expand Down Expand Up @@ -85,7 +79,7 @@ def load_gbk(gbks, args, db_file):


def load_orthofinder_results(orthofinder_output, args, db_file):
db = db_utils.DB.load_db(db_file, args)
db = DB.load_db(db_file, args)
hsh_prot_to_group = parse_orthofinder_output_file(orthofinder_output)
hsh_locus_to_feature_id = db.get_hsh_locus_to_seqfeature_id(only_CDS=True)
hits_to_load = [(hsh_locus_to_feature_id[locus], group)
Expand Down Expand Up @@ -150,7 +144,7 @@ def parse_record(record):


def load_refseq_matches_infos(args, lst_diamond_files, db_file):
db = db_utils.DB.load_db(db_file, args)
db = DB.load_db(db_file, args)
columns = ["str_hsh", "accession", "pident", "length", "mismatch", "gapopen",
"qstart", "qend", "sstart", "send", "evalue", "bitscore"]

Expand Down Expand Up @@ -231,7 +225,7 @@ def hsh_from_s(s):


def load_seq_hashes(args, nr_mapping, db_file):
db = db_utils.DB.load_db(db_file, args)
db = DB.load_db(db_file, args)
hsh_locus_to_id = db.get_hsh_locus_to_seqfeature_id(only_CDS=True)

to_load_hsh_to_seqid = {}
Expand All @@ -256,7 +250,7 @@ def load_seq_hashes(args, nr_mapping, db_file):


def load_alignments_results(args, identity_csvs, db_file):
db = db_utils.DB.load_db(db_file, args)
db = DB.load_db(db_file, args)
db.create_new_og_matrix()
locus_to_feature_id = db.get_hsh_locus_to_seqfeature_id(only_CDS=True)

Expand All @@ -277,7 +271,7 @@ def load_alignments_results(args, identity_csvs, db_file):


def load_cog(params, filelist, db_file, cdd_to_cog):
db = db_utils.DB.load_db(db_file, params)
db = DB.load_db(db_file, params)
hsh_cdd_to_cog = {}
for line in open(cdd_to_cog, "r"):
cog, cdd = line.split()
Expand All @@ -303,6 +297,36 @@ def load_cog(params, filelist, db_file, cdd_to_cog):
db.commit()


def amr_hit_to_db_entry(hit):
columns = ["Gene symbol",
"Sequence name",
"Scope",
"Element type",
"Element subtype",
"Class",
"Subclass",
"% Coverage of reference sequence",
"% Identity to reference sequence",
"Accession of closest sequence",
"Name of closest sequence",
"HMM id"]
entry = [hsh_from_s(hit["Protein identifier"][len("CRC-"):])]
entry.extend([hit[column] for column in columns])
return entry


def load_amr(params, filelist, db_file):
db = DB.load_db(db_file, params)

data = []
for chunk in filelist:
amr_hits = pd.read_csv(chunk, sep="\t", header=0)
data.extend(amr_hit_to_db_entry(hit) for i, hit in amr_hits.iterrows())
db.load_amr_hits(data)
db.set_status_in_config_table("AMR", 1)
db.commit()


# Note: the trees are stored in files with name formatted as:
# OGN_nr_hits_mafft.nwk. To retrieve the orthogroup, parse the filename
# and convert it to int.
Expand All @@ -313,7 +337,7 @@ def load_cog(params, filelist, db_file, cdd_to_cog):
def load_BBH_phylogenies(kwargs, lst_orthogroups, db_file):
import ete3

db = db_utils.DB.load_db(db_file, kwargs)
db = DB.load_db(db_file, kwargs)
data = []

for tree in lst_orthogroups:
Expand All @@ -331,7 +355,7 @@ def load_gene_phylogenies(kwargs, og_summary, lst_orthogroups, db_file):
"""
import ete3

db = db_utils.DB.load_db(db_file, kwargs)
db = DB.load_db(db_file, kwargs)
hsh_ogs = {}
for tree in lst_orthogroups:
t = ete3.Tree(tree)
Expand All @@ -353,7 +377,7 @@ def load_gene_phylogenies(kwargs, og_summary, lst_orthogroups, db_file):

def load_reference_phylogeny(kwargs, tree, db_file):
import ete3
db = db_utils.DB.load_db(db_file, kwargs)
db = DB.load_db(db_file, kwargs)

newick_file = open(tree, "r")
newick_string = newick_file.readline()
Expand Down Expand Up @@ -401,7 +425,7 @@ def get_gen_stats(gbk_list):


def load_genomes_info(kwargs, gbk_list, checkm_results, db_file):
db = db_utils.DB.load_db(db_file, kwargs)
db = DB.load_db(db_file, kwargs)
tab = pd.read_table(checkm_results)

hsh_taxid_to_gen_stats = get_gen_stats(gbk_list)
Expand Down Expand Up @@ -441,7 +465,7 @@ def parse_pfam_entry(file_iter):


def load_pfam(params, pfam_files, db, pfam_def_file):
db = db_utils.DB.load_db(db, params)
db = DB.load_db(db, params)

db.create_pfam_hits_table()
pfam_ids = set()
Expand Down Expand Up @@ -519,7 +543,7 @@ def parse_swissprot_id(to_parse):


def load_swissprot(params, blast_results, db_name, swissprot_fasta):
db = db_utils.DB.load_db(db_name, params)
db = DB.load_db(db_name, params)
hsh_swissprot_id = SwissProtIdCount()
db.create_swissprot_tables()

Expand Down Expand Up @@ -562,7 +586,7 @@ def simplify_ko(raw_ko):
# Several KO marked as significant can be assigned to the same locus
# only take the hit with the lowest evalue (the first in the list)
def load_KO(params, ko_files, db_name):
db = db_utils.DB.load_db(db_name, params)
db = DB.load_db(db_name, params)
data = []
for ko_file in ko_files:
curr_hsh = None
Expand Down Expand Up @@ -592,7 +616,7 @@ def load_KO(params, ko_files, db_name):


def load_module_completeness(params, db_name):
db = db_utils.DB.load_db(db_name, params)
db = DB.load_db(db_name, params)
all_genomes = db.get_genomes_description().description.to_dict()

db.create_module_completeness_table()
Expand Down Expand Up @@ -621,13 +645,13 @@ def load_module_completeness(params, db_name):
def format_cog(cog_n):
if pd.isna(cog_n):
return None
return f"COG{int(cog_n): 04}"
return f"COG{int(cog_n):04}"


def format_ko(ko_n):
if pd.isna(ko_n):
return None
return f"K{int(ko_n): 05}"
return f"K{int(ko_n):05}"


def format_og(og_n):
Expand All @@ -637,19 +661,19 @@ def format_og(og_n):
def format_pfam(pfam_n):
if pd.isna(pfam_n):
return None
return f"PF{int(pfam_n): 05}"
return f"PF{int(pfam_n):05}"


def format_module(mod):
return f"M{mod: 05d}"
return f"M{mod:05d}"


def format_pathway(pat):
return f"map{pat: 05d}"
return f"map{pat:05d}"


def setup_chlamdb_search_index(params, db_name, index_name):
db = db_utils.DB.load_db(db_name, params)
db = DB.load_db(db_name, params)
os.mkdir(index_name)

has_cog = params.get("cog", False)
Expand Down
5 changes: 5 additions & 0 deletions bin/zdb
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,10 @@ elif [[ "$1" == "run" ]]; then
args="${args} --core_missing=$n_missing"
shift
;;
--amr)
args="${args} --amr"
shift
;;
--cog)
args="${args} --cog"
shift
Expand Down Expand Up @@ -501,6 +505,7 @@ elif [[ "$1" == "run" ]]; then
echo " --conda: use conda environments instead of singularity"
echo " --name: run name (defaults to the name given by nextflow)"
echo " the latest completed run is also named latest"
echo " --amr: perform amr (antibiotic resistance) annotation"
echo " --cog: perform cog annotation"
echo " --ko: perform ko (metabolism) annotation"
echo " --pfam: peform PFAM domain annotation"
Expand Down
2 changes: 2 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ params.results_dir = "zdb/results"
params.cog = false
params.ko = false
params.pfam = false
params.amr = false

params.checkm_args = "domain Bacteria"

Expand Down Expand Up @@ -73,6 +74,7 @@ params.orthofinder_container = "quay.io/biocontainers/orthofinder:2.5.2--0"
params.diamond_container = "registry.hub.docker.com/buchfink/diamond:version2.0.13"
params.mafft_container = "quay.io/biocontainers/mafft:7.487--h779adbc_0"
params.fasttree_container = "quay.io/biocontainers/fasttree:2.1.8--h779adbc_6"
params.ncbi_amr_container = "registry.hub.docker.com/ncbi/amr:3.11.26-2023-09-26.1"


/////////////////////////////
Expand Down
Loading

0 comments on commit 3c6e68f

Please # to comment.