1
1
from collections import OrderedDict
2
2
from typing import Union , List , Generator , Tuple
3
- from Bio import SeqIO
4
- from Bio .SeqRecord import SeqRecord
5
3
import pandas as pd
6
4
7
5
from abnumber .alignment import Alignment
8
6
from abnumber .common import _anarci_align , _validate_chain_type , SUPPORTED_SCHEMES , SUPPORTED_CDR_DEFINITIONS , \
9
7
is_integer , SCHEME_BORDERS , _get_unique_chains
10
8
from abnumber .exceptions import ChainParseError
11
9
import numpy as np
10
+ from Bio import SeqIO
11
+ from Bio .SeqRecord import SeqRecord
12
12
from Bio .Seq import Seq
13
13
14
14
from abnumber .position import Position
@@ -83,6 +83,8 @@ def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germ
83
83
else :
84
84
if sequence is None :
85
85
raise ChainParseError ('Expected sequence, got None' )
86
+ if isinstance (sequence , list ):
87
+ raise ChainParseError ('Expected string or Seq, got list. Please use Chain.batch() to parse multiple sequences' )
86
88
if not isinstance (sequence , str ) and not isinstance (sequence , Seq ):
87
89
raise ChainParseError (f'Expected string or Seq, got { type (sequence )} : { sequence } ' )
88
90
if '-' in sequence :
@@ -93,7 +95,9 @@ def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germ
93
95
raise ChainParseError ('Do not use tail= when providing sequence=, it will be inferred automatically' )
94
96
if isinstance (sequence , Seq ):
95
97
sequence = str (sequence )
96
- results = _anarci_align (sequence , scheme = scheme , allowed_species = allowed_species , assign_germline = assign_germline )
98
+ results = _anarci_align ([sequence ], scheme = scheme , allowed_species = allowed_species , assign_germline = assign_germline )[0 ]
99
+ if not results :
100
+ raise ChainParseError (f'Variable chain sequence not recognized: "{ sequence } "' )
97
101
if len (results ) > 1 :
98
102
raise ChainParseError (f'Found { len (results )} antibody domains in sequence: "{ sequence } "' )
99
103
aa_dict , chain_type , tail , species , v_gene , j_gene = results [0 ]
@@ -157,10 +161,10 @@ def _init_from_dict(self, aa_dict, allowed_species):
157
161
else :
158
162
seq = '' .join (aa_dict [pos ] for pos in sorted_positions )
159
163
renumbered_aa_dict = _anarci_align (
160
- seq ,
164
+ [ seq ] ,
161
165
scheme = self .cdr_definition if self .cdr_definition != 'north' else 'chothia' ,
162
166
allowed_species = allowed_species
163
- )[0 ][0 ]
167
+ )[0 ][0 ][ 0 ]
164
168
cdr_definition_positions = [pos .number for pos in sorted (renumbered_aa_dict .keys ())]
165
169
combined_aa_dict = {}
166
170
for orig_pos , cdr_definition_position in zip (sorted_positions , cdr_definition_positions ):
@@ -178,6 +182,45 @@ def _init_from_dict(self, aa_dict, allowed_species):
178
182
region_idx += 1
179
183
regions_list [region_idx ][pos ] = aa
180
184
185
@classmethod
def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline=False,
          allowed_species=None) -> Tuple[dict, dict]:
    """Create multiple Chain objects from dict of sequences

    All sequences are submitted to the aligner in a single ``_anarci_align`` call.
    Sequences that cannot be turned into exactly one chain are reported in the
    error dict instead of raising, so one bad sequence does not abort the batch.

    :param seq_dict: Dictionary of sequence strings, keys are sequence identifiers
    :param scheme: Numbering scheme to align the sequences
    :param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default.
    :param assign_germline: Assign germline name using ANARCI based on best sequence identity
    :param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
    :return: tuple with (dict of Chain objects, dict of error strings), both keyed by the identifiers of ``seq_dict``
    :raises ChainParseError: if ``seq_dict`` is not a dictionary
    """
    # Explicit check instead of ``assert`` so the validation survives ``python -O``.
    if not isinstance(seq_dict, dict):
        raise ChainParseError(f'Expected dictionary of sequences, got: {type(seq_dict).__name__}')

    chains = {}
    errors = {}
    if not seq_dict:
        # Nothing to align; skip the aligner call entirely.
        return chains, errors

    names = list(seq_dict)
    seq_list = list(seq_dict.values())
    all_results = _anarci_align(
        seq_list,
        scheme=scheme,
        allowed_species=allowed_species,
        assign_germline=assign_germline
    )

    # One result list per input sequence, consumed in the same order as ``seq_list``.
    for name, sequence, results in zip(names, seq_list, all_results):
        if not results:
            errors[name] = f'Variable chain sequence not recognized: "{sequence}"'
            continue
        if len(results) > 1:
            errors[name] = f'Found {len(results)} antibody domains: "{sequence}"'
            continue
        aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]
        # ``cls`` rather than a hard-coded class name so subclasses get their own type back.
        chains[name] = cls(
            sequence=None,
            aa_dict=aa_dict,
            name=name,
            scheme=scheme,
            chain_type=chain_type,
            cdr_definition=cdr_definition,
            tail=tail,
            species=species,
            v_gene=v_gene,
            j_gene=j_gene
        )
    return chains, errors
223
+
181
224
def __repr__(self):
    """Return the text produced by ``self.format()`` as this object's representation."""
    formatted = self.format()
    return formatted
183
226
0 commit comments