11
11
##===----------------------------------------------------------------------===##
12
12
13
13
import re
14
+ import sys
15
+ import codecs
14
16
15
17
class UnicodeProperty (object ):
16
18
"""Abstract base class for Unicode properties."""
@@ -64,11 +66,11 @@ def __init__(self, grapheme_break_property_file_name):
64
66
# values to symbolic values.
65
67
self .symbolic_values = \
66
68
[ None ] * (max (self .numeric_value_table .values ()) + 1 )
67
- for k ,v in self .numeric_value_table .iteritems ():
69
+ for k ,v in self .numeric_value_table .items ():
68
70
self .symbolic_values [v ] = k
69
71
70
72
# Load the data file.
71
- with open (grapheme_break_property_file_name , 'rb ' ) as f :
73
+ with codecs . open (grapheme_break_property_file_name , encoding = sys . getfilesystemencoding (), errors = 'strict ' ) as f :
72
74
for line in f :
73
75
# Strip comments.
74
76
line = re .sub ('#.*' , '' , line )
@@ -329,7 +331,10 @@ def map_index(idx):
329
331
else :
330
332
return idx
331
333
332
- return map (map_index , indexes )
334
+ # NOTE: Python 2's `map` function returns a list, whereas Python 3's
335
+ # `map` function returns an iterator. To work around this, the
336
+ # result of the `map` is explicitly converted to a `list`.
337
+ return list (map (map_index , indexes ))
333
338
334
339
# If self.BMP_data contains identical data blocks, keep the first one,
335
340
# remove duplicates and change the indexes in self.BMP_lookup to point to
@@ -514,9 +519,9 @@ def _convert_line(line):
514
519
515
520
# Match a list of code points.
516
521
for token in line .split (" " ):
517
- if token == "÷" :
522
+ if token == u "÷" :
518
523
boundaries += [ curr_bytes ]
519
- elif token == "×" :
524
+ elif token == u "×" :
520
525
pass
521
526
else :
522
527
code_point = int (token , 16 )
@@ -529,21 +534,21 @@ def _convert_line(line):
529
534
# and test separately that we handle ill-formed UTF-8 sequences.
530
535
if code_point >= 0xd800 and code_point <= 0xdfff :
531
536
code_point = 0x200b
532
- code_point = ('\U%(cp)08x' % { 'cp' : code_point }).decode ('unicode_escape' )
533
- as_UTF8_bytes = code_point .encode ('utf8' )
534
- as_UTF8_escaped = '' .join (['\\ x%(byte)02x' % { 'byte' : ord ( byte ) } for byte in as_UTF8_bytes ])
537
+ code_point = (b '\U%(cp)08x' % { b 'cp' : code_point }).decode ('unicode_escape' , 'strict ' )
538
+ as_UTF8_bytes = bytearray ( code_point .encode ('utf8' , 'strict' ) )
539
+ as_UTF8_escaped = '' .join (['\\ x%(byte)02x' % { 'byte' : byte } for byte in as_UTF8_bytes ])
535
540
test += as_UTF8_escaped
536
541
curr_bytes += len (as_UTF8_bytes )
537
542
538
543
return (test , boundaries )
539
544
540
545
# Self-test.
541
- assert (_convert_line ('÷ 0903 × 0308 ÷ AC01 ÷ # abc' ) == ('\\ xe0\\ xa4\\ x83\\ xcc\\ x88\\ xea\\ xb0\\ x81' , [ 0 , 5 , 8 ]))
542
- assert (_convert_line ('÷ D800 ÷ # abc' ) == ('\\ xe2\\ x80\\ x8b' , [ 0 , 3 ]))
546
+ assert (_convert_line (u '÷ 0903 × 0308 ÷ AC01 ÷ # abc' ) == ('\\ xe0\\ xa4\\ x83\\ xcc\\ x88\\ xea\\ xb0\\ x81' , [ 0 , 5 , 8 ]))
547
+ assert (_convert_line (u '÷ D800 ÷ # abc' ) == ('\\ xe2\\ x80\\ x8b' , [ 0 , 3 ]))
543
548
544
549
result = []
545
550
546
- with open (grapheme_break_test_file_name , 'rb ' ) as f :
551
+ with codecs . open (grapheme_break_test_file_name , encoding = sys . getfilesystemencoding (), errors = 'strict ' ) as f :
547
552
for line in f :
548
553
test = _convert_line (line )
549
554
if test :
0 commit comments