Skip to content

Commit

Permalink
Skip certain genes in user-provided regions #288 (#303)
Browse files Browse the repository at this point in the history
* skip certain genes in user-provided regions

Due to the manifold sources for non-standard triplet CDS, we decided to skip these CDS features in the user provided regions, as they cause massive manual efforts to fix these cases.

* fix unknown var name
  • Loading branch information
oschwengers authored Jul 17, 2024
1 parent f90c836 commit 5b894da
Showing 1 changed file with 26 additions and 1 deletion.
27 changes: 26 additions & 1 deletion bakta/features/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,12 +214,19 @@ def import_user_cdss(genome: dict, import_path: Path):
else:
contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t')
if(feature_type.lower() == 'cds'):
attributes = attributes.lower().split(';')
contig = contigs_by_id.get(contig_id, None)
if(contig is None):
log.error('user-provided CDS: No contig found for id=%s', contig_id)
raise Exception(f'user-provided CDS: No contig found for id={contig_id}')
user_cds = create_cds(contig, int(start), int(stop), strand, '', '')
user_cds['source'] = bc.CDS_SOURCE_USER
if('pseudo=' in attributes or bc.INSDC_FEATURE_PSEUDOGENE in attributes): # skip pseudo genes
log.debug(
'skip user-provided CDS: reason=pseudogene contig=%s, start=%i, stop=%i, strand=%s',
user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand']
)
continue
try:
nt = bu.extract_feature_sequence(user_cds, contig)
user_cds['nt'] = nt
Expand Down Expand Up @@ -247,12 +254,30 @@ def import_user_cdss(genome: dict, import_path: Path):
with xopen(str(import_path), threads=0) as fh_in:
for record in SeqIO.parse(fh_in, 'genbank'):
for feature in record.features:
if(feature.type.lower() == 'cds' and 'pseudo' not in feature.qualifiers and bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers):
if(feature.type.lower() == 'cds'):
contig = contigs_by_id.get(record.id, None)
if(contig is None):
log.error('user-provided CDS: No contig found for id=%s', record.id)
raise Exception(f'user-provided CDS: No contig found for id={record.id}')
strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
if('<' in str(feature.location.start) or '>' in str(feature.location.end)):
log.debug(
'skip user-provided CDS: reason=partial, contig=%s, start=%s, stop=%s, strand=%s',
contig['id'], feature.location.start, feature.location.end, strand
)
continue
elif('pseudo' in feature.qualifiers or bc.INSDC_FEATURE_PSEUDOGENE in feature.qualifiers):
log.debug(
'skip user-provided CDS: reason=pseudo, contig=%s, start=%i, stop=%i, strand=%s',
contig['id'], feature.location.start, feature.location.end, strand
)
continue
elif('ribosomal_slippage' in feature.qualifiers):
log.debug(
'skip user-provided CDS: reason=ribosomal slippage, contig=%s, start=%i, stop=%i, strand=%s',
contig['id'], feature.location.start, feature.location.end, strand
)
continue
user_cds = create_cds(contig, feature.location.start + 1, feature.location.end, strand, '', '')
user_cds['source'] = bc.CDS_SOURCE_USER
try:
Expand Down

0 comments on commit 5b894da

Please # to comment.