diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 3c639811..19e555b1 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -214,12 +214,19 @@ def import_user_cdss(genome: dict, import_path: Path): else: contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t') if(feature_type.lower() == 'cds'): + attributes = attributes.lower().split(';') contig = contigs_by_id.get(contig_id, None) if(contig is None): log.error('user-provided CDS: No contig found for id=%s', contig_id) raise Exception(f'user-provided CDS: No contig found for id={contig_id}') user_cds = create_cds(contig, int(start), int(stop), strand, '', '') user_cds['source'] = bc.CDS_SOURCE_USER + if('pseudo=' in attributes or bc.INSDC_FEATURE_PSEUDOGENE in attributes): # skip pseudo genes + log.debug( + 'skip user-provided CDS: reason=pseudogene contig=%s, start=%i, stop=%i, strand=%s', + user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'] + ) + continue try: nt = bu.extract_feature_sequence(user_cds, contig) user_cds['nt'] = nt @@ -247,12 +254,30 @@ def import_user_cdss(genome: dict, import_path: Path): with xopen(str(import_path), threads=0) as fh_in: for record in SeqIO.parse(fh_in, 'genbank'): for feature in record.features: - if(feature.type.lower() == 'cds' and 'pseudo' not in feature.qualifiers and bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers): + if(feature.type.lower() == 'cds'): contig = contigs_by_id.get(record.id, None) if(contig is None): log.error('user-provided CDS: No contig found for id=%s', record.id) raise Exception(f'user-provided CDS: No contig found for id={record.id}') strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE + if('<' in str(feature.location.start) or '>' in str(feature.location.end)): + log.debug( + 'skip user-provided CDS: reason=partial, contig=%s, start=%s, stop=%s, strand=%s', + contig['id'], feature.location.start, feature.location.end, strand + ) + continue + elif('pseudo' in feature.qualifiers or bc.INSDC_FEATURE_PSEUDOGENE in feature.qualifiers): + log.debug( + 'skip user-provided CDS: reason=pseudo, contig=%s, start=%i, stop=%i, strand=%s', + contig['id'], feature.location.start, feature.location.end, strand + ) + continue + elif('ribosomal_slippage' in feature.qualifiers): + log.debug( + 'skip user-provided CDS: reason=ribosomal slippage, contig=%s, start=%i, stop=%i, strand=%s', + contig['id'], feature.location.start, feature.location.end, strand + ) + continue user_cds = create_cds(contig, feature.location.start + 1, feature.location.end, strand, '', '') user_cds['source'] = bc.CDS_SOURCE_USER try: