Skip to content

Commit

Permalink
Merge pull request #140 from phac-nml/covid19
Browse files Browse the repository at this point in the history
- added --min-kmer-frac for exclusion of low abundance kmers from subtype calling
- detailed report adds columns: `kmer_fraction`, `total_refposition_kmer_frequency`, `is_kmer_fraction_okay`
- increased default --max-kmer-freq for amplicon seq data compatibility
  • Loading branch information
peterk87 authored Mar 4, 2021
2 parents d20a00b + 15aa720 commit e3c2c82
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 11 deletions.
19 changes: 12 additions & 7 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
import attr
import pandas as pd

from . import program_desc, __version__, program_name
from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
from .subtype import Subtype
from .subtype_stats import subtype_counts
from .subtyper import \
from bio_hansel import program_desc, __version__, program_name
from bio_hansel.const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
from bio_hansel.subtype import Subtype
from bio_hansel.subtype_stats import subtype_counts
from bio_hansel.subtyper import \
subtype_contigs_samples, \
subtype_reads_samples
from .metadata import read_metadata_table, merge_results_with_metadata
from .utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir,
from bio_hansel.metadata import read_metadata_table, merge_results_with_metadata
from bio_hansel.utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir,
group_fastqs, collect_fasta_from_dir, init_subtyping_params, df_field_fillna)

SCRIPT_NAME = 'hansel'
Expand Down Expand Up @@ -81,6 +81,9 @@ def init_parser():
parser.add_argument('--min-kmer-freq',
type=int,
help='Min k-mer freq/coverage')
parser.add_argument('--min-kmer-frac',
type=float,
help='Proportion of k-mer required for detection (0.0 - 1)')
parser.add_argument('--max-kmer-freq',
type=int,
help='Max k-mer freq/coverage')
Expand Down Expand Up @@ -258,6 +261,8 @@ def main():
if output_kmer_results:
if len(dfs) > 0:
dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False)
#Error message is redundant accross each of the k-mers
dfall = dfall.drop(columns=['qc_message'])
dfall = df_field_fillna(dfall)
dfall.to_csv(output_kmer_results, **kwargs_for_pd_to_table)
logging.info('Kmer results written to "{}".'.format(output_kmer_results))
Expand Down
33 changes: 31 additions & 2 deletions bio_hansel/subtyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,32 @@ def parallel_query_reads(reads: List[Tuple[List[str], str]],
outputs = [x.get() for x in res]
return outputs

def get_kmer_fraction(row):
"""Calculate the percentage frequency of a given position
Args:
df: BioHansel k-mer frequence pandas df row
Returns:
- float of percentage abundance
"""
total_freq = row.total_refposition_kmer_frequency
return row.freq / total_freq if total_freq > 0 else 0.0

def calc_kmer_fraction(df):
"""Calculate the percentage each k-mer frequency represents for a given position
Args:
df: BioHansel k-mer frequence pandas df
Returns:
- pd.DataFrame with k-mers with kmer_fraction and total_refposition_kmer_frequency columns added
"""
position_frequencies = df[['refposition','freq']].groupby(['refposition']).sum().to_dict()
df['total_refposition_kmer_frequency'] = df.apply(lambda row: position_frequencies['freq'].get(row.refposition, 0), axis=1)
df['kmer_fraction'] = df.apply(get_kmer_fraction, axis=1)

return df


def subtype_reads(reads: Union[str, List[str]],
genome_name: str,
Expand Down Expand Up @@ -285,9 +311,12 @@ def subtype_reads(reads: Union[str, List[str]],
df['subtype'] = subtypes
df['is_pos_kmer'] = ~df.kmername.str.contains('negative')
df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq)
#apply a scaled approach for filtering of k-mers required for high coverage amplicon data
df = calc_kmer_fraction(df)
df['is_kmer_fraction_okay'] = df.kmer_fraction >= subtyping_params.min_kmer_frac
st.avg_kmer_coverage = df['freq'].mean()
st, df = process_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts)
st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params)
st, filtered_df = process_subtyping_results(st, df[(df.is_kmer_freq_okay & df.is_kmer_fraction_okay)], scheme_subtype_counts)
st.qc_status, st.qc_message = perform_quality_check(st, filtered_df, subtyping_params)
df['file_path'] = str(st.file_path)
df['sample'] = genome_name
df['scheme'] = scheme_name or scheme
Expand Down
5 changes: 3 additions & 2 deletions bio_hansel/subtyping_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ class SubtypingParams(object):
min_ambiguous_kmers = attr.ib(default=3, validator=attr.validators.instance_of(int))
max_perc_intermediate_kmers = attr.ib(default=0.05, validator=attr.validators.instance_of(float))
min_kmer_freq = attr.ib(default=8, validator=attr.validators.instance_of((float, int)))
max_kmer_freq = attr.ib(default=10000, validator=attr.validators.instance_of((float, int)))
min_kmer_frac = attr.ib(default=0.05, validator=attr.validators.instance_of(float))
max_kmer_freq = attr.ib(default=1000000, validator=attr.validators.instance_of((float, int)))
min_coverage_warning = attr.ib(default=20, validator=attr.validators.instance_of((float, int)))
max_degenerate_kmers = attr.ib(default=100000, validator=attr.validators.instance_of(int))
max_degenerate_kmers = attr.ib(default=10000000, validator=attr.validators.instance_of(int))

@max_perc_missing_kmers.validator
def _validate_max_perc_missing_kmers(self, attribute, value):
Expand Down
2 changes: 2 additions & 0 deletions bio_hansel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ def init_subtyping_params(args: Optional[Any] = None,
subtyping_params.min_coverage_warning = args.low_cov_warning
if args.min_kmer_freq:
subtyping_params.min_kmer_freq = args.min_kmer_freq
if args.min_kmer_frac:
subtyping_params.min_kmer_frac = args.min_kmer_frac
if args.max_kmer_freq:
subtyping_params.max_kmer_freq = args.max_kmer_freq
if args.max_degenerate_kmers:
Expand Down
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
'is_pos_kmer', 'sample', 'file_path', 'scheme', 'scheme_version', 'qc_status', 'qc_message']

exp_fastq_cols = ['kmername', 'refposition', 'subtype', 'seq', 'freq', 'is_pos_kmer', 'is_kmer_freq_okay',
'kmer_fraction','total_refposition_kmer_frequency','is_kmer_fraction_okay',
'sample', 'file_path', 'scheme', 'scheme_version', 'qc_status', 'qc_message']


Expand Down

0 comments on commit e3c2c82

Please # to comment.