Skip to content

Commit ead43c0

Browse files
chore(python3): add Python 3 support to the generator script (build-unicode-tables.py)
BREAKING CHANGE: The module is now generated by the Python 3 port of the generator script.
1 parent 16a8ab1 commit ead43c0

5 files changed

+2472
-2349
lines changed

build-unicode-tables.py

+59-45
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,45 @@
1-
#!/usr/bin/env python2
1+
#!/usr/bin/env python3
22

33
# This file builds a mapping of utility tables for handling UTR#46 IDNA
44
# processing. You may find the rules here:
55
# <http://www.unicode.org/reports/tr46/index.html>.
66

7+
from __future__ import division
8+
from past.builtins import cmp
9+
from future import standard_library
10+
11+
standard_library.install_aliases()
12+
from builtins import chr
13+
from builtins import str
14+
from builtins import map
15+
from builtins import range
16+
from past.utils import old_div
17+
from builtins import object
18+
from functools import cmp_to_key
719
import json
820
import re
921
import sys
10-
import urllib2
22+
import urllib.request, urllib.error, urllib.parse
1123
from functools import reduce
1224
import struct
25+
from pprint import pprint
1326

1427
# NUM_UCHAR is the number of Unicode characters there are.
1528
NUM_UCHAR = 0x10FFFF + 1
1629

1730

1831
def download_unicode(version):
19-
idna_tables = "http://www.unicode.org/Public/idna/" + version
20-
infd = urllib2.urlopen(idna_tables + "/IdnaTestV2.txt")
21-
# with open("test/test-idna2.js", "w") as outfd:
22-
# build_test_code(infd, outfd)
23-
with open("test/IdnaTest.txt", "w") as outfd:
24-
for line in infd:
25-
outfd.write(line)
26-
infd.close()
27-
infd = urllib2.urlopen(idna_tables + "/IdnaMappingTable.txt")
28-
dgc = urllib2.urlopen(
29-
"http://www.unicode.org/Public/"
30-
+ version
31-
+ "/ucd/extracted/DerivedGeneralCategory.txt"
32+
print("Resource Files from www.unicode.org ...")
33+
uribase = "http://www.unicode.org/Public/"
34+
idna_tables = uribase + "idna/" + version
35+
print("... " + idna_tables + "/IdnaTestV2.txt")
36+
urllib.request.urlretrieve(idna_tables + "/IdnaTestV2.txt", "test/IdnaTest.txt")
37+
infd = urllib.request.urlopen(idna_tables + "/IdnaMappingTable.txt")
38+
dgc = urllib.request.urlopen(
39+
uribase + version + "/ucd/extracted/DerivedGeneralCategory.txt"
3240
)
41+
print("... " + idna_tables + "/IdnaMappingTable.txt")
42+
print("... " + uribase + version + "/ucd/extracted/DerivedGeneralCategory.txt\n")
3343
with open("idna-map.js", "w") as outfd:
3444
build_unicode_map(infd, outfd, dgc)
3545
infd.close()
@@ -41,7 +51,9 @@ def parse_unicode_data_file(fd):
4151
of columns, where the first column is either a single element or a range of
4252
characters. In this case, the range implied by start and end is
4353
inclusive."""
54+
# data = fd.read() # .decode("utf-8")
4455
for line in fd:
56+
line = line.decode("utf-8")
4557
pos = line.find("#")
4658
if pos >= 0:
4759
line = line[:pos]
@@ -50,7 +62,7 @@ def parse_unicode_data_file(fd):
5062
continue
5163
parts = [p.strip() for p in line.split(";")]
5264

53-
stend = map(lambda x: int(x, 16), parts[0].split(".."))
65+
stend = [int(x, 16) for x in parts[0].split("..")]
5466
if len(stend) == 1:
5567
start = end = stend[0]
5668
else:
@@ -65,7 +77,7 @@ def utf16len(string):
6577

6678
def unichar(i):
6779
try:
68-
return unichr(i)
80+
return chr(i)
6981
except ValueError:
7082
return struct.pack("i", i).decode("utf-32")
7183

@@ -76,9 +88,7 @@ def __init__(self, parts):
7688
self.rule = parts[0]
7789
# If there are two parts, the second part is the mapping in question.
7890
if len(parts) > 1 and parts[1]:
79-
self.chars = "".join(
80-
map(lambda u: unichar(int(u, 16)), parts[1].split(" "))
81-
)
91+
self.chars = "".join([unichar(int(u, 16)) for u in parts[1].split(" ")])
8292
else:
8393
self.chars = ""
8494

@@ -97,7 +107,7 @@ def build_map_string(self, string):
97107
self.index = utf16len(string)
98108
string = string + self.chars
99109
else:
100-
self.index = utf16len(string[0: self.index])
110+
self.index = utf16len(string[0 : self.index])
101111
return string
102112

103113
def build_int(self):
@@ -124,26 +134,32 @@ def build_int(self):
124134

125135

126136
def build_unicode_map(idnaMapTable, out, derivedGeneralCategory):
137+
print("Build Unicode Map")
127138
unicharMap = [0] * NUM_UCHAR
128139
vals = []
140+
print("... parse unicode data file (IdnaMappingTable.txt)")
129141
for start, end, parts in parse_unicode_data_file(idnaMapTable):
130142
for ch in range(start, end + 1):
131143
value = MappedValue(parts)
132144
vals.append(value)
133145
unicharMap[ch] = value
134146

135147
# Note which characters have the combining mark property.
148+
print("... parse unicode data file (DerivedGeneralCategory.txt)")
136149
for start, end, parts in parse_unicode_data_file(derivedGeneralCategory):
137150
if parts[0] in ("Mc", "Mn", "Me"):
138151
for ch in range(start, end + 1):
139152
unicharMap[ch].flags |= 2
140153

154+
print("... build up internal unicharMap")
141155
# Build up the string to use to map the output
142-
vals.sort(cmp=lambda x, y: cmp(len(x.chars), len(y.chars)), reverse=True)
156+
vals.sort(
157+
key=cmp_to_key(lambda x, y: cmp(len(x.chars), len(y.chars))), reverse=True
158+
)
143159
mappedStr = reduce(lambda s, v: v.build_map_string(s), vals, "")
144160

145161
# Convert this to integers
146-
unicharMap = map(lambda v: v.build_int(), unicharMap)
162+
unicharMap = [v.build_int() for v in unicharMap]
147163

148164
# We're going to do a funky special case here. Since planes 3-17 are
149165
# basically unused, we're going to divert these from the standard two-phase
@@ -157,8 +173,11 @@ def build_unicode_map(idnaMapTable, out, derivedGeneralCategory):
157173
unicharMap[ch] == specialCase and (0xE0100 <= ch and ch <= 0xE01EF)
158174
)
159175

160-
mem, lg_block_size, blocks = min(find_block_sizes(unicharMap[:0x3134B]))
161-
block_size = 1 << lg_block_size
176+
print("... generate source file (idna-map.js)")
177+
memUsage, lg_block_size, blocks = min(
178+
find_block_sizes(unicharMap[:0x3134B]), key=lambda t: t[0]
179+
)
180+
block_size = 1 << lg_block_size # lg_block_size
162181
blocks = list(blocks)
163182
out.write("/* This file is generated from the Unicode IDNA table, using\n")
164183
out.write(" the build-unicode-tables.py script. Please edit that\n")
@@ -184,22 +203,17 @@ def build_unicode_map(idnaMapTable, out, derivedGeneralCategory):
184203
out.write("];\n")
185204

186205
# Now emit the block index map
187-
out.write(
188-
"var blockIdxes = new Uint%dArray([" %
189-
(8 if len(blocks) < 256 else 16))
206+
out.write("var blockIdxes = new Uint%dArray([" % (8 if len(blocks) < 256 else 16))
190207
out.write(
191208
",".join(
192-
str(blocks.index(tuple(unicharMap[i: i + block_size])))
209+
str(blocks.index(tuple(unicharMap[i : i + block_size])))
193210
for i in range(0, 0x30000, block_size)
194211
)
195212
)
196213
out.write("]);\n")
197214

198215
# And the string
199-
out.write(
200-
"var mappingStr = %s;\n"
201-
% json.dumps(mappedStr, ensure_ascii=False).encode("utf-8")
202-
)
216+
out.write("var mappingStr = %s;\n" % json.dumps(mappedStr, ensure_ascii=False))
203217

204218
# Finish off with the function to actually look everything up
205219
out.write(
@@ -243,13 +257,13 @@ def find_block_sizes(unicharMap):
243257
def compute_block_size(unicharMap, block_size):
244258
blocks = set()
245259
for i in range(0, len(unicharMap), block_size):
246-
block = tuple(unicharMap[i: i + block_size])
260+
block = tuple(unicharMap[i : i + block_size])
247261
blocks.add(block)
248262
num = len(blocks)
249263
if num < 256:
250-
mem = len(unicharMap) / block_size
264+
mem = old_div(len(unicharMap), block_size)
251265
elif num < 0x10000:
252-
mem = 2 * len(unicharMap) / block_size
266+
mem = old_div(2 * len(unicharMap), block_size)
253267
else:
254268
raise Exception("Way too many blocks: %d" % num)
255269
mem += num * block_size * 4
@@ -273,21 +287,21 @@ def build_body(mode, test_vector, func, expected):
273287
return []
274288
if mode == "T" or mode == "B":
275289
lines.append(
276-
'assert.throws(function () { %s("%s", true); });' %
277-
(func, test_vector))
290+
'assert.throws(function () { %s("%s", true); });' % (func, test_vector)
291+
)
278292
if mode == "N" or mode == "B":
279293
lines.append(
280-
'assert.throws(function () { %s("%s", false); });' %
281-
(func, test_vector))
294+
'assert.throws(function () { %s("%s", false); });' % (func, test_vector)
295+
)
282296
else:
283297
if mode == "T" or mode == "B":
284298
lines.append(
285-
'assert.equal(%s("%s", true), "%s");' %
286-
(func, test_vector, expected))
299+
'assert.equal(%s("%s", true), "%s");' % (func, test_vector, expected)
300+
)
287301
if mode == "N" or mode == "B":
288302
lines.append(
289-
'assert.equal(%s("%s", false), "%s");' %
290-
(func, test_vector, expected))
303+
'assert.equal(%s("%s", false), "%s");' % (func, test_vector, expected)
304+
)
291305

292306
return lines
293307

@@ -313,7 +327,7 @@ def build_test_code(infd, out):
313327
line = line.split("#")[0].strip()
314328
if not line:
315329
continue
316-
strings = map(lambda x: x.strip(), line.split(";"))
330+
strings = [x.strip() for x in line.split(";")]
317331
mode = strings[0]
318332
test_vector = convert_escape(strings[1])
319333
unicode_data = convert_escape(strings[2]) or test_vector

0 commit comments

Comments
 (0)