1
- #!/usr/bin/env python2
1
+ #!/usr/bin/env python3
2
2
3
3
# This file builds a mapping of utility tables for handling UTR#46 IDNA
4
4
# processing. You may find the rules here:
5
5
# <http://www.unicode.org/reports/tr46/index.html>.
6
6
7
+ from __future__ import division
8
+ from past .builtins import cmp
9
+ from future import standard_library
10
+
11
+ standard_library .install_aliases ()
12
+ from builtins import chr
13
+ from builtins import str
14
+ from builtins import map
15
+ from builtins import range
16
+ from past .utils import old_div
17
+ from builtins import object
18
+ from functools import cmp_to_key
7
19
import json
8
20
import re
9
21
import sys
10
- import urllib2
22
+ import urllib . request , urllib . error , urllib . parse
11
23
from functools import reduce
12
24
import struct
25
+ from pprint import pprint
13
26
14
27
# NUM_UCHAR is the number of Unicode code points (U+0000 through U+10FFFF).
15
28
NUM_UCHAR = 0x10FFFF + 1
16
29
17
30
18
31
def download_unicode (version ):
19
- idna_tables = "http://www.unicode.org/Public/idna/" + version
20
- infd = urllib2 .urlopen (idna_tables + "/IdnaTestV2.txt" )
21
- # with open("test/test-idna2.js", "w") as outfd:
22
- # build_test_code(infd, outfd)
23
- with open ("test/IdnaTest.txt" , "w" ) as outfd :
24
- for line in infd :
25
- outfd .write (line )
26
- infd .close ()
27
- infd = urllib2 .urlopen (idna_tables + "/IdnaMappingTable.txt" )
28
- dgc = urllib2 .urlopen (
29
- "http://www.unicode.org/Public/"
30
- + version
31
- + "/ucd/extracted/DerivedGeneralCategory.txt"
32
+ print ("Resource Files from www.unicode.org ..." )
33
+ uribase = "http://www.unicode.org/Public/"
34
+ idna_tables = uribase + "idna/" + version
35
+ print ("... " + idna_tables + "/IdnaTestV2.txt" )
36
+ urllib .request .urlretrieve (idna_tables + "/IdnaTestV2.txt" , "test/IdnaTest.txt" )
37
+ infd = urllib .request .urlopen (idna_tables + "/IdnaMappingTable.txt" )
38
+ dgc = urllib .request .urlopen (
39
+ uribase + version + "/ucd/extracted/DerivedGeneralCategory.txt"
32
40
)
41
+ print ("... " + idna_tables + "/IdnaMappingTable.txt" )
42
+ print ("... " + uribase + version + "/ucd/extracted/DerivedGeneralCategory.txt\n " )
33
43
with open ("idna-map.js" , "w" ) as outfd :
34
44
build_unicode_map (infd , outfd , dgc )
35
45
infd .close ()
@@ -41,7 +51,9 @@ def parse_unicode_data_file(fd):
41
51
of columns, where the first column is either a single element or a range of
42
52
characters. In this case, the range implied by start and end are
43
53
inclusive."""
54
+ # data = fd.read() # .decode("utf-8")
44
55
for line in fd :
56
+ line = line .decode ("utf-8" )
45
57
pos = line .find ("#" )
46
58
if pos >= 0 :
47
59
line = line [:pos ]
@@ -50,7 +62,7 @@ def parse_unicode_data_file(fd):
50
62
continue
51
63
parts = [p .strip () for p in line .split (";" )]
52
64
53
- stend = map ( lambda x : int (x , 16 ), parts [0 ].split (".." ))
65
+ stend = [ int (x , 16 ) for x in parts [0 ].split (".." )]
54
66
if len (stend ) == 1 :
55
67
start = end = stend [0 ]
56
68
else :
@@ -65,7 +77,7 @@ def utf16len(string):
65
77
66
78
def unichar(i):
    """Return the text string for the Unicode code point ``i``.

    ``chr`` is tried first. If it rejects the value with ``ValueError``,
    the integer is packed as a native-order C ``int`` and those four bytes
    are decoded as UTF-32 instead.

    NOTE(review): the fallback presumably exists for interpreters whose
    ``chr``/``unichr`` cannot produce code points above U+FFFF (e.g. narrow
    Python 2 builds) -- confirm before removing it.
    """
    try:
        return chr(i)
    except ValueError:
        # struct's "i" format and the BOM-less "utf-32" codec both use the
        # platform's native byte order, so the two steps agree.
        return struct.pack("i", i).decode("utf-32")
71
83
@@ -76,9 +88,7 @@ def __init__(self, parts):
76
88
self .rule = parts [0 ]
77
89
# If there are two parts, the second part is the mapping in question.
78
90
if len (parts ) > 1 and parts [1 ]:
79
- self .chars = "" .join (
80
- map (lambda u : unichar (int (u , 16 )), parts [1 ].split (" " ))
81
- )
91
+ self .chars = "" .join ([unichar (int (u , 16 )) for u in parts [1 ].split (" " )])
82
92
else :
83
93
self .chars = ""
84
94
@@ -97,7 +107,7 @@ def build_map_string(self, string):
97
107
self .index = utf16len (string )
98
108
string = string + self .chars
99
109
else :
100
- self .index = utf16len (string [0 : self .index ])
110
+ self .index = utf16len (string [0 : self .index ])
101
111
return string
102
112
103
113
def build_int (self ):
@@ -124,26 +134,32 @@ def build_int(self):
124
134
125
135
126
136
def build_unicode_map (idnaMapTable , out , derivedGeneralCategory ):
137
+ print ("Build Unicode Map" )
127
138
unicharMap = [0 ] * NUM_UCHAR
128
139
vals = []
140
+ print ("... parse unicode data file (IdnaMappingTable.txt)" )
129
141
for start , end , parts in parse_unicode_data_file (idnaMapTable ):
130
142
for ch in range (start , end + 1 ):
131
143
value = MappedValue (parts )
132
144
vals .append (value )
133
145
unicharMap [ch ] = value
134
146
135
147
# Note which characters have the combining mark property.
148
+ print ("... parse unicode data file (DerivedGeneralCategory.txt)" )
136
149
for start , end , parts in parse_unicode_data_file (derivedGeneralCategory ):
137
150
if parts [0 ] in ("Mc" , "Mn" , "Me" ):
138
151
for ch in range (start , end + 1 ):
139
152
unicharMap [ch ].flags |= 2
140
153
154
+ print ("... build up internal unicharMap" )
141
155
# Build up the string to use to map the output
142
- vals .sort (cmp = lambda x , y : cmp (len (x .chars ), len (y .chars )), reverse = True )
156
+ vals .sort (
157
+ key = cmp_to_key (lambda x , y : cmp (len (x .chars ), len (y .chars ))), reverse = True
158
+ )
143
159
mappedStr = reduce (lambda s , v : v .build_map_string (s ), vals , "" )
144
160
145
161
# Convert this to integers
146
- unicharMap = map ( lambda v : v .build_int (), unicharMap )
162
+ unicharMap = [ v .build_int () for v in unicharMap ]
147
163
148
164
# We're going to do a funky special case here. Since planes 3-17 are
149
165
# basically unused, we're going to divert these from the standard two-phase
@@ -157,8 +173,11 @@ def build_unicode_map(idnaMapTable, out, derivedGeneralCategory):
157
173
unicharMap [ch ] == specialCase and (0xE0100 <= ch and ch <= 0xE01EF )
158
174
)
159
175
160
- mem , lg_block_size , blocks = min (find_block_sizes (unicharMap [:0x3134B ]))
161
- block_size = 1 << lg_block_size
176
+ print ("... generate source file (idna-map.js)" )
177
+ memUsage , lg_block_size , blocks = min (
178
+ find_block_sizes (unicharMap [:0x3134B ]), key = lambda t : t [0 ]
179
+ )
180
+ block_size = 1 << lg_block_size # lg_block_size
162
181
blocks = list (blocks )
163
182
out .write ("/* This file is generated from the Unicode IDNA table, using\n " )
164
183
out .write (" the build-unicode-tables.py script. Please edit that\n " )
@@ -184,22 +203,17 @@ def build_unicode_map(idnaMapTable, out, derivedGeneralCategory):
184
203
out .write ("];\n " )
185
204
186
205
# Now emit the block index map
187
- out .write (
188
- "var blockIdxes = new Uint%dArray([" %
189
- (8 if len (blocks ) < 256 else 16 ))
206
+ out .write ("var blockIdxes = new Uint%dArray([" % (8 if len (blocks ) < 256 else 16 ))
190
207
out .write (
191
208
"," .join (
192
- str (blocks .index (tuple (unicharMap [i : i + block_size ])))
209
+ str (blocks .index (tuple (unicharMap [i : i + block_size ])))
193
210
for i in range (0 , 0x30000 , block_size )
194
211
)
195
212
)
196
213
out .write ("]);\n " )
197
214
198
215
# And the string
199
- out .write (
200
- "var mappingStr = %s;\n "
201
- % json .dumps (mappedStr , ensure_ascii = False ).encode ("utf-8" )
202
- )
216
+ out .write ("var mappingStr = %s;\n " % json .dumps (mappedStr , ensure_ascii = False ))
203
217
204
218
# Finish off with the function to actually look everything up
205
219
out .write (
@@ -243,13 +257,13 @@ def find_block_sizes(unicharMap):
243
257
def compute_block_size (unicharMap , block_size ):
244
258
blocks = set ()
245
259
for i in range (0 , len (unicharMap ), block_size ):
246
- block = tuple (unicharMap [i : i + block_size ])
260
+ block = tuple (unicharMap [i : i + block_size ])
247
261
blocks .add (block )
248
262
num = len (blocks )
249
263
if num < 256 :
250
- mem = len (unicharMap ) / block_size
264
+ mem = old_div ( len (unicharMap ), block_size )
251
265
elif num < 0x10000 :
252
- mem = 2 * len (unicharMap ) / block_size
266
+ mem = old_div ( 2 * len (unicharMap ), block_size )
253
267
else :
254
268
raise Exception ("Way too many blocks: %d" % num )
255
269
mem += num * block_size * 4
@@ -273,21 +287,21 @@ def build_body(mode, test_vector, func, expected):
273
287
return []
274
288
if mode == "T" or mode == "B" :
275
289
lines .append (
276
- 'assert.throws(function () { %s("%s", true); });' %
277
- ( func , test_vector ) )
290
+ 'assert.throws(function () { %s("%s", true); });' % ( func , test_vector )
291
+ )
278
292
if mode == "N" or mode == "B" :
279
293
lines .append (
280
- 'assert.throws(function () { %s("%s", false); });' %
281
- ( func , test_vector ) )
294
+ 'assert.throws(function () { %s("%s", false); });' % ( func , test_vector )
295
+ )
282
296
else :
283
297
if mode == "T" or mode == "B" :
284
298
lines .append (
285
- 'assert.equal(%s("%s", true), "%s");' %
286
- ( func , test_vector , expected ) )
299
+ 'assert.equal(%s("%s", true), "%s");' % ( func , test_vector , expected )
300
+ )
287
301
if mode == "N" or mode == "B" :
288
302
lines .append (
289
- 'assert.equal(%s("%s", false), "%s");' %
290
- ( func , test_vector , expected ) )
303
+ 'assert.equal(%s("%s", false), "%s");' % ( func , test_vector , expected )
304
+ )
291
305
292
306
return lines
293
307
@@ -313,7 +327,7 @@ def build_test_code(infd, out):
313
327
line = line .split ("#" )[0 ].strip ()
314
328
if not line :
315
329
continue
316
- strings = map ( lambda x : x .strip (), line .split (";" ))
330
+ strings = [ x .strip () for x in line .split (";" )]
317
331
mode = strings [0 ]
318
332
test_vector = convert_escape (strings [1 ])
319
333
unicode_data = convert_escape (strings [2 ]) or test_vector
0 commit comments