Skip to content

Commit a8c1bba

Browse files
committed
Add Chain.batch() for parsing multiple seqs at once
1 parent e5d8e2b commit a8c1bba

File tree

4 files changed

+92
-25
lines changed

4 files changed

+92
-25
lines changed

abnumber/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.3.2'
1+
__version__ = '0.3.3'

abnumber/chain.py

+48-5
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
from collections import OrderedDict
22
from typing import Union, List, Generator, Tuple
3-
from Bio import SeqIO
4-
from Bio.SeqRecord import SeqRecord
53
import pandas as pd
64

75
from abnumber.alignment import Alignment
86
from abnumber.common import _anarci_align, _validate_chain_type, SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS, \
97
is_integer, SCHEME_BORDERS, _get_unique_chains
108
from abnumber.exceptions import ChainParseError
119
import numpy as np
10+
from Bio import SeqIO
11+
from Bio.SeqRecord import SeqRecord
1212
from Bio.Seq import Seq
1313

1414
from abnumber.position import Position
@@ -83,6 +83,8 @@ def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germ
8383
else:
8484
if sequence is None:
8585
raise ChainParseError('Expected sequence, got None')
86+
if isinstance(sequence, list):
87+
raise ChainParseError('Expected string or Seq, got list. Please use Chain.batch() to parse multiple sequences')
8688
if not isinstance(sequence, str) and not isinstance(sequence, Seq):
8789
raise ChainParseError(f'Expected string or Seq, got {type(sequence)}: {sequence}')
8890
if '-' in sequence:
@@ -93,7 +95,9 @@ def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germ
9395
raise ChainParseError('Do not use tail= when providing sequence=, it will be inferred automatically')
9496
if isinstance(sequence, Seq):
9597
sequence = str(sequence)
96-
results = _anarci_align(sequence, scheme=scheme, allowed_species=allowed_species, assign_germline=assign_germline)
98+
results = _anarci_align([sequence], scheme=scheme, allowed_species=allowed_species, assign_germline=assign_germline)[0]
99+
if not results:
100+
raise ChainParseError(f'Variable chain sequence not recognized: "{sequence}"')
97101
if len(results) > 1:
98102
raise ChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"')
99103
aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]
@@ -157,10 +161,10 @@ def _init_from_dict(self, aa_dict, allowed_species):
157161
else:
158162
seq = ''.join(aa_dict[pos] for pos in sorted_positions)
159163
renumbered_aa_dict = _anarci_align(
160-
seq,
164+
[seq],
161165
scheme=self.cdr_definition if self.cdr_definition != 'north' else 'chothia',
162166
allowed_species=allowed_species
163-
)[0][0]
167+
)[0][0][0]
164168
cdr_definition_positions = [pos.number for pos in sorted(renumbered_aa_dict.keys())]
165169
combined_aa_dict = {}
166170
for orig_pos, cdr_definition_position in zip(sorted_positions, cdr_definition_positions):
@@ -178,6 +182,45 @@ def _init_from_dict(self, aa_dict, allowed_species):
178182
region_idx += 1
179183
regions_list[region_idx][pos] = aa
180184

185+
@classmethod
def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline=False, allowed_species=None):
    """Create multiple Chain objects from dict of sequences

    All sequences are aligned in a single ``_anarci_align`` call. Sequences that cannot be turned
    into exactly one chain do not raise; they are reported in the returned error dict instead.

    :param seq_dict: Dictionary of sequences (strings or ``Seq`` objects), keys are sequence identifiers
    :param scheme: Numbering scheme to align the sequences
    :param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default.
    :param assign_germline: Assign germline name using ANARCI based on best sequence identity
    :param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
    :return: tuple with (dict of Chain objects, dict of error strings), both keyed by the identifiers of ``seq_dict``
    """
    assert isinstance(seq_dict, dict), f'Expected dictionary of sequences, got: {type(seq_dict).__name__}'
    if not seq_dict:
        # Nothing to align, so do not call the aligner with an empty list
        return {}, {}
    names = list(seq_dict.keys())
    # The regular constructor accepts Seq objects and converts them to str; do the same here
    seq_list = [str(sequence) if isinstance(sequence, Seq) else sequence for sequence in seq_dict.values()]
    all_results = _anarci_align(seq_list, scheme=scheme, allowed_species=allowed_species, assign_germline=assign_germline)
    chains = {}
    errors = {}
    for name, sequence, results in zip(names, seq_list, all_results):
        if not results:
            errors[name] = f'Variable chain sequence not recognized: "{sequence}"'
        elif len(results) > 1:
            errors[name] = f'Found {len(results)} antibody domains: "{sequence}"'
        else:
            aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]
            # NOTE(review): allowed_species is not forwarded to the constructor, so any
            # re-alignment it performs for a differing cdr_definition presumably uses the
            # constructor default - confirm whether that is intended.
            # Use cls rather than Chain so that subclasses get instances of their own type
            chains[name] = cls(
                sequence=None,
                aa_dict=aa_dict,
                name=name,
                scheme=scheme,
                chain_type=chain_type,
                cdr_definition=cdr_definition,
                tail=tail,
                species=species,
                v_gene=v_gene,
                j_gene=j_gene
            )
    return chains, errors
223+
181224
def __repr__(self):
182225
return self.format()
183226

abnumber/common.py

+22-19
Original file line numberDiff line numberDiff line change
@@ -20,31 +20,34 @@ def _validate_chain_type(chain_type):
2020
f'Invalid chain type "{chain_type}", it should be "H" (heavy), "L" (lambda light chian) or "K" (kappa light chain)'
2121

2222

23-
def _anarci_align(sequences, scheme, allowed_species, assign_germline=False) -> List[List[Tuple]]:
    """Number a list of sequences with a single ANARCI call.

    :param sequences: List of sequence strings; whitespace is removed from each before alignment
    :param scheme: Numbering scheme passed to ANARCI and stored on every created Position
    :param allowed_species: Passed through to ANARCI as ``allowed_species``
    :param assign_germline: Passed through to ANARCI; when true, the first V and J gene hit is returned
    :return: One list per input sequence, in input order. Each inner list contains one
        ``(aa_dict, chain_type, tail, species, v_gene, j_gene)`` tuple per domain reported by ANARCI
        and is empty when ANARCI returned no numbering for that sequence.
    """
    # Imported locally rather than at module level (presumably to avoid a circular import - confirm)
    from abnumber.position import Position
    assert isinstance(sequences, list), f'Expected list of sequences, got: {type(sequences)}'
    # Strip whitespace once and keep the cleaned strings. ANARCI only sees the cleaned
    # sequences, so the start/end offsets it reports index into those, and the tail
    # must be sliced from the cleaned sequence rather than from the raw input.
    cleaned_sequences = [re.sub(WHITESPACE, '', sequence) for sequence in sequences]
    all_numbered, all_ali, all_hits = anarci(
        [(f'id{i}', sequence) for i, sequence in enumerate(cleaned_sequences)],
        scheme=scheme,
        allowed_species=allowed_species,
        assign_germline=assign_germline
    )
    all_results = []
    for sequence, seq_numbered, seq_ali in zip(cleaned_sequences, all_numbered, all_ali):
        if seq_numbered is None:
            # Variable chain sequence not recognized
            all_results.append([])
            continue
        assert len(seq_numbered) == len(seq_ali), 'Unexpected ANARCI output'
        results = []
        for (positions, start, end), ali in zip(seq_numbered, seq_ali):
            chain_type = ali['chain_type']
            species = ali['species']
            v_gene = ali['germlines']['v_gene'][0][1] if assign_germline else None
            j_gene = ali['germlines']['j_gene'][0][1] if assign_germline else None
            # Gap characters are dropped so that aa_dict only holds real residues
            aa_dict = {Position(chain_type=chain_type, number=num, letter=letter, scheme=scheme): aa
                       for (num, letter), aa in positions if aa != '-'}
            # Everything after the last numbered residue of this domain
            tail = sequence[end+1:]
            results.append((aa_dict, chain_type, tail, species, v_gene, j_gene))
        all_results.append(results)
    return all_results
4851

4952

5053
def _get_unique_chains(chains):

test/test_chain.py

+21
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,11 @@ def test_invalid_chain_raises_error(scheme):
121121
Chain('AAA', scheme=scheme)
122122

123123

124+
def test_multiple_chains_raises_error():
    """Chain() must reject a single sequence that holds more than one antibody domain."""
    multi_domain_sequence = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSQVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS'
    with pytest.raises(ChainParseError):
        Chain(multi_domain_sequence, scheme='imgt')
127+
128+
124129
def test_aho_without_cdr_definition_raises_error():
125130
with pytest.raises(ValueError):
126131
Chain('QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS', scheme='aho')
@@ -247,3 +252,19 @@ def test_nearest_j_region():
247252

248253
assert nearest_j[0].name == 'IGHJ6*01'
249254

255+
256+
def test_batch():
    """Chain.batch() returns parsed chains and per-name error messages side by side."""
    unrecognized = 'FOO'
    two_domains = 'EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSEVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSS'
    sequences = {
        'A': 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTVTVSS',
        'B': 'EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSS',
        'C': unrecognized,
        'D': two_domains,
    }

    chains, errors = Chain.batch(sequences, scheme='imgt')

    # Only the two single-domain sequences become chains
    assert len(chains) == 2
    assert chains['A'].raw[0] == 'Q'
    assert chains['B'].raw[0] == 'E'

    # The unparseable and the two-domain entries are reported as errors instead
    assert 'C' not in chains
    assert errors['C'] == f'Variable chain sequence not recognized: "{unrecognized}"'
    assert 'D' not in chains
    assert errors['D'] == f'Found 2 antibody domains: "{two_domains}"'
270+

0 commit comments

Comments
 (0)