Skip to content

Commit acafc66

Browse files
committed
Add Chain.multiple_domains and Chain.batch(multiple_domains=True)
1 parent 55ace7f commit acafc66

File tree

5 files changed

+68
-11
lines changed

5 files changed

+68
-11
lines changed

abnumber/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.3.6'
1+
__version__ = '0.3.7'

abnumber/chain.py

+29-7
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1+
import warnings
12
from collections import OrderedDict
23
from typing import Union, List, Generator, Tuple
34
import pandas as pd
45

56
from abnumber.alignment import Alignment
67
from abnumber.common import _anarci_align, _validate_chain_type, SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS, \
78
is_integer, SCHEME_BORDERS, _get_unique_chains
8-
from abnumber.exceptions import ChainParseError
9+
from abnumber.exceptions import ChainParseError, MultipleDomainsChainParseError
910
import numpy as np
1011
from Bio import SeqIO
1112
from Bio.SeqRecord import SeqRecord
@@ -99,7 +100,8 @@ def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germ
99100
if not results:
100101
raise ChainParseError(f'Variable chain sequence not recognized: "{sequence}"')
101102
if len(results) > 1:
102-
raise ChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"')
103+
warnings.warn('Use Chain.multiple_domains(seq) to parse ScFvs and other sequences with multiple antibody domains')
104+
raise MultipleDomainsChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"')
103105
aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]
104106

105107
_validate_chain_type(chain_type)
@@ -183,14 +185,15 @@ def _init_from_dict(self, aa_dict, allowed_species):
183185
regions_list[region_idx][pos] = aa
184186

185187
@classmethod
186-
def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline=False, allowed_species=None):
188+
def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline=False, allowed_species=None, multiple_domains=False):
187189
"""Create multiple Chain objects from dict of sequences
188190
189191
:param seq_dict: Dictionary of sequence strings, keys are sequence identifiers
190192
:param scheme: Numbering scheme to align the sequences
191193
:param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default.
192194
:param assign_germline: Assign germline name using ANARCI based on best sequence identity
193195
:param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
196+
:param multiple_domains: Allow parsing multiple domains in a sequence - return dict name -> list of one or more Chain items
194197
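:return: tuple with (dict of Chain objects, dict of error strings); with ``multiple_domains=True`` each value of the first dict is a list of one or more Chain objects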
195198
"""
196199
assert isinstance(seq_dict, dict), f'Expected dictionary of sequences, got: {type(seq_dict).__name__}'
@@ -205,11 +208,11 @@ def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline
205208
for sequence, results, name in zip(seq_list, all_results, names):
206209
if not results:
207210
errors[name] = f'Variable chain sequence not recognized: "{sequence}"'
208-
elif len(results) > 1:
211+
elif len(results) > 1 and not multiple_domains:
212+
warnings.warn('Use multiple_domains=True to allow parsing ScFvs and other sequences with multiple antibody domains')
209213
errors[name] = f'Found {len(results)} antibody domains: "{sequence}"'
210214
else:
211-
aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]
212-
chains[name] = Chain(
215+
found_chains = [Chain(
213216
sequence=None,
214217
aa_dict=aa_dict,
215218
name=name,
@@ -220,9 +223,28 @@ def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline
220223
species=species,
221224
v_gene=v_gene,
222225
j_gene=j_gene
223-
)
226+
) for aa_dict, chain_type, tail, species, v_gene, j_gene in results]
227+
chains[name] = found_chains if multiple_domains else found_chains[0]
224228
return chains, errors
225229

230+
@classmethod
231+
def multiple_domains(cls, sequence: str, scheme: str, cdr_definition=None, name=None, assign_germline=False, allowed_species=None) -> 'Chain':
232+
"""Parse multi-domain sequence into a list of Chain objects
233+
234+
:param sequence: Unaligned string sequence
235+
:param scheme: Numbering scheme to align the sequences
236+
:param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default.
237+
:param name: Optional sequence identifier
238+
:param assign_germline: Assign germline name using ANARCI based on best sequence identity
239+
:param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
240+
:return: list of Chain objects, one for each antibody domain found in the sequence
241+
"""
242+
chains, errors = cls.batch({name: sequence}, scheme=scheme, cdr_definition=cdr_definition, assign_germline=assign_germline, allowed_species=allowed_species, multiple_domains=True)
243+
if error := errors.get(name):
244+
raise ChainParseError(error)
245+
return chains[name]
246+
247+
226248
def __repr__(self):
227249
return self.format()
228250

abnumber/common.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,15 @@ def _anarci_align(sequences, scheme, allowed_species, assign_germline=False) ->
3737
continue
3838
assert len(seq_numbered) == len(seq_ali), 'Unexpected ANARCI output'
3939
results = []
40-
for (positions, start, end), ali in zip(seq_numbered, seq_ali):
40+
for i, ((positions, start, end), ali) in enumerate(zip(seq_numbered, seq_ali)):
4141
chain_type = ali['chain_type']
4242
species = ali['species']
4343
v_gene = ali['germlines']['v_gene'][0][1] if assign_germline else None
4444
j_gene = ali['germlines']['j_gene'][0][1] if assign_germline else None
4545
aa_dict = {Position(chain_type=chain_type, number=num, letter=letter, scheme=scheme): aa
4646
for (num, letter), aa in positions if aa != '-'}
47-
tail = sequence[end+1:]
47+
next_start = None if i == len(seq_numbered) - 1 else seq_numbered[i+1][1]
48+
tail = sequence[end+1:next_start]
4849
results.append((aa_dict, chain_type, tail, species, v_gene, j_gene))
4950
all_results.append(results)
5051
return all_results

abnumber/exceptions.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
class ChainParseError(Exception):
2-
pass
2+
pass
3+
4+
class MultipleDomainsChainParseError(ChainParseError):
5+
pass

test/test_chain.py

+31
Original file line numberDiff line numberDiff line change
@@ -268,3 +268,34 @@ def test_batch():
268268
assert 'D' not in chains
269269
assert errors['D'] == 'Found 2 antibody domains: "EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSEVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSS"'
270270

271+
272+
def test_batch_multiple_domains():
273+
chains, errors = Chain.batch({
274+
'A': 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTVTVSS',
275+
'B': 'EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSS',
276+
'C': 'FOO',
277+
'D': 'EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSGGGGSQVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSS'
278+
}, scheme='imgt', multiple_domains=True)
279+
assert len(chains) == 3
280+
assert len(chains['A']) == 1
281+
assert chains['A'][0].raw[0] == 'Q'
282+
assert len(chains['B']) == 1
283+
assert chains['B'][0].raw[0] == 'E'
284+
assert 'C' not in chains
285+
assert errors['C'] == 'Variable chain sequence not recognized: "FOO"'
286+
assert len(chains['D']) == 2
287+
assert chains['D'][0].raw[0] == 'E'
288+
assert chains['D'][0].tail == 'GGGGS'
289+
assert chains['D'][1].raw[0] == 'Q'
290+
assert chains['D'][1].tail == 'S'
291+
292+
293+
def test_multiple_domains():
294+
vh = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTVTVSS'
295+
vl = 'ELVMTQSPSSLSASVGDRVNIACRASQGISSALAWYQQKPGKAPRLLIYDASNLESGVPSRFSGSGSGTDFTLTISSLQPEDFAIYYCQQFNSYPLTFGGGTKVEIK'
296+
chains = Chain.multiple_domains('MELVIS' + vh + 'GGGS' + vl + 'CCC', scheme='imgt')
297+
assert len(chains) == 2
298+
assert chains[0].seq == vh
299+
assert chains[0].tail == 'GGGS'
300+
assert chains[1].seq == vl
301+
assert chains[1].tail == 'CCC'

0 commit comments

Comments
 (0)