Merge pull request #806 from RLovelett/gyb-python-3

Dave Abrahams · Dave Abrahams · commit b50b5419be1d · 2015-12-31T13:58:54.000-08:00
[gyb] Python 2 or 3 compatible Generate Your Boilerplate
diff --git a/lib/ClangImporter/SortedCFDatabase.def.gyb b/lib/ClangImporter/SortedCFDatabase.def.gyb
@@ -17,6 +17,8 @@
 %{
 
 import re
+import sys
+import codecs
 
 prologueLines = ""
 epilogueLines = ""
@@ -26,7 +28,7 @@ epilogueLines = ""
 lineForName = {}
 
 # Load the data file.
-with open(CFDatabaseFile, 'rb') as f:
+with codecs.open(CFDatabaseFile, encoding=sys.getfilesystemencoding(), errors='strict') as f:
   for line in f:
     # Pass through preprocessor directives literally.
     # Assume that they all fall into either a strict prologue or epilogue.
diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py
@@ -11,6 +11,8 @@
 ##===----------------------------------------------------------------------===##
 
 import re
+import sys
+import codecs
 
 class UnicodeProperty(object):
     """Abstract base class for Unicode properties."""
@@ -64,11 +66,11 @@ def __init__(self, grapheme_break_property_file_name):
         # values to symbolic values.
         self.symbolic_values = \
             [ None ] * (max(self.numeric_value_table.values()) + 1)
-        for k,v in self.numeric_value_table.iteritems():
+        for k,v in self.numeric_value_table.items():
             self.symbolic_values[v] = k
 
         # Load the data file.
-        with open(grapheme_break_property_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 # Strip comments.
                 line = re.sub('#.*', '', line)
@@ -329,7 +331,10 @@ def map_index(idx):
                 else:
                     return idx
 
-            return map(map_index, indexes)
+            # NOTE: Python 2's `map` returns a list, whereas Python 3's `map`
+            # returns an iterator. To work around this, the result of the
+            # `map` is explicitly converted to a `list`.
+            return list(map(map_index, indexes))
 
         # If self.BMP_data contains identical data blocks, keep the first one,
         # remove duplicates and change the indexes in self.BMP_lookup to point to
@@ -514,9 +519,9 @@ def _convert_line(line):
 
         # Match a list of code points.
         for token in line.split(" "):
-            if token == "÷":
+            if token == u"÷":
                 boundaries += [ curr_bytes ]
-            elif token == "×":
+            elif token == u"×":
                 pass
             else:
                 code_point = int(token, 16)
@@ -529,21 +534,21 @@ def _convert_line(line):
                 # and test separately that we handle ill-formed UTF-8 sequences.
                 if code_point >= 0xd800 and code_point <= 0xdfff:
                     code_point = 0x200b
-                code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
-                as_UTF8_bytes = code_point.encode('utf8')
-                as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
+                code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
+                as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
+                as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
                 test += as_UTF8_escaped
                 curr_bytes += len(as_UTF8_bytes)
 
         return (test, boundaries)
 
     # Self-test.
-    assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
-    assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
+    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
+    assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
 
     result = []
 
-    with open(grapheme_break_test_file_name, 'rb') as f:
+    with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
         for line in f:
             test = _convert_line(line)
             if test:
diff --git a/utils/gyb.py b/utils/gyb.py
@@ -5,7 +5,10 @@
 from __future__ import print_function
 
 import re
-from cStringIO import StringIO
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from io import StringIO
 import tokenize
 import textwrap
 from bisect import bisect
@@ -135,7 +138,8 @@ def tokenizePythonToUnmatchedCloseCurly(sourceText, start, lineStarts):
                 if nesting < 0:
                     return tokenPosToIndex(tokenStart, start, lineStarts)
 
-    except tokenize.TokenError, (message, errorPos):
+    except tokenize.TokenError as error:
+        (message, errorPos) = error.args
         return tokenPosToIndex(errorPos, start, lineStarts)
 
     return len(sourceText)
@@ -304,7 +308,7 @@ def splitGybLines(sourceLines):
     dedents = 0
     try:
         for tokenKind, tokenText, tokenStart, (tokenEndLine, tokenEndCol), lineText \
-            in tokenize.generate_tokens(sourceLines.__iter__().next):
+            in tokenize.generate_tokens(lambda i = iter(sourceLines): next(i)):
 
             if tokenKind in (tokenize.COMMENT, tokenize.ENDMARKER): 
                 continue
@@ -324,7 +328,7 @@ def splitGybLines(sourceLines):
                 
             lastTokenText,lastTokenKind = tokenText,tokenKind
 
-    except tokenize.TokenError, (message, errorPos):
+    except tokenize.TokenError:
         return [] # Let the later compile() call report the error
 
     if lastTokenText == ':':
@@ -347,7 +351,7 @@ def codeStartsWithDedentKeyword(sourceLines):
     """
     tokenText = None
     for tokenKind, tokenText, _, _, _ \
-        in tokenize.generate_tokens(sourceLines.__iter__().next):
+        in tokenize.generate_tokens(lambda i = iter(sourceLines): next(i)):
 
         if tokenKind != tokenize.COMMENT and tokenText.strip() != '':
             break
diff --git a/utils/line-directive b/utils/line-directive
@@ -71,7 +71,10 @@ def run():
         sources = sys.argv[1:dashes]
 
         command = subprocess.Popen(
-            sys.argv[dashes + 1:], stderr = subprocess.STDOUT, stdout = subprocess.PIPE
+            sys.argv[dashes + 1:],
+            stderr = subprocess.STDOUT,
+            stdout = subprocess.PIPE,
+            universal_newlines = True
         )
         
         error_pattern = re.compile(