-
Notifications
You must be signed in to change notification settings - Fork 8
utf 8 decoding and encoding
Ali Rizvi-Santiago edited this page Nov 18, 2022
·
1 revision
A co-worker linked Ivan Fratric's presentation (https://www.youtube.com/watch?v=ERaRNsvCBrw), which inspired me to implement UTF-8 decoding and encoding really quickly. It is probably not that useful.
class ones(pbinary.terminatedarray):
    '''Array of `_object_ = 1` elements with `length` 4 that terminates when a 0-bit is encountered.'''
    _object_ = 1
    length = 4

    def isTerminator(self, bit):
        # An element comparing equal to 0 marks the end of the run of 1-bits.
        return bit == 0
class firstbyte(pbinary.struct):
    '''Leading byte: the "count" prefix, a "zero" bit, and a "codepoint" field sized to fill out 8 bits.'''

    def __codepoint(self):
        # Size of the "codepoint" field: whatever remains of the 8 bits once
        # the "count" elements and the always-present "zero" bit are removed.
        prefix = self['count']
        assert len(prefix) < 5
        used = 1 + len(prefix)
        return 8 - used

    _fields_ = [
        (ones, 'count'),                # 1111
        (1, 'zero'),                    # 0
        (__codepoint, 'codepoint'),
    ]
class restbyte(pbinary.struct):
    '''Every byte after the first: a 2-wide "one-oh" field followed by a 6-wide "codepoint" field.'''
    _fields_ = [(2, 'one-oh'), (6, 'codepoint')]
class rest_of_points(pbinary.array):
    '''Array of `restbyte` elements, i.e. the "other" bytes that follow the first one.'''

    _object_ = restbyte
class utf8char(pbinary.struct):
    '''A character whose "first" field determines the length of the "rest" array that follows it.'''

    def __rest(self):
        '''Return an array type for the "rest" field, sized from the "first" field.'''
        first = self['first']

        # The `firstbyte` used by this definition has no get_number_bytes()
        # method, so take the byte count straight from the length of its
        # "count" prefix (evaluated once instead of three times).
        total = len(first['count'])

        class rest_of_bytes(rest_of_points):
            '''rest_of_bytes = dyn.clone(rest_of_points, length=first.get_number_bytes())'''

        # subtract 1, for the first byte (never letting the length go negative)
        rest_of_bytes.length = max(0, total - 1)
        return rest_of_bytes

    _fields_ = [
        (firstbyte, 'first'),
        (__rest, 'rest'),
    ]
A test to make sure it decodes okay.
# Case 1: the highest codepoint, U+10FFFF. It needs the 8-digit \U escape,
# because u'\u10ffff' is actually U+10FF followed by the two characters "ff".
x = pbinary.new(utf8char, source=ptypes.provider.bytes(u'\U0010ffff'.encode('utf-8')))
x = x.l
print(x['first'])
for item in x['rest']:
    print(item)
print(x)

# Case 2: a hand-built 4-byte sequence with every codepoint bit of the first
# byte set (this is above U+10FFFF, so it only exercises the bit layout).
source = ptypes.provider.bytes(bytearray([0b11110111, 0b10000000, 0b10000000, 0b10000000]))
x = pbinary.new(utf8char, source=source).l
print(x['first'])
for item in x['rest']:
    print(item)
print(x)

# Case 3: F0 82 82 AC, which is the 4-byte (overlong) form of U+20AC.
source = ptypes.provider.bytes(bytearray([0xF0, 0x82, 0x82, 0xAC]))
x = pbinary.new(utf8char, source=source)
print(x.l)
print(x['first'])
for item in x['rest']:
    print(item)
Now we add some methods to the definitions so that we can encode UTF-8 as well.
class firstbyte(firstbyte):
    '''Extends the previous `firstbyte` definition with validation and codepoint helpers.'''

    def valid(self):
        '''Return whether the "zero" field is 0, which it always should be.'''
        return self['zero'] == 0

    def get_number_bytes(self):
        '''Return the number of bytes this codepoint occupies, taken from the length of the "count" field.

        Returns 1 (after printing a diagnostic) when the "zero" field is
        not 0, or when the "count" field has one element or fewer.
        '''
        if self['zero'] != 0:
            print('this utf-8 byte is actually busted...returning 1')
            return 1

        count = self['count']
        if len(count) <= 1:
            # NOTE(review): "{:b}" relies on the "count" field supporting
            # binary formatting -- confirm against pbinary.
            print("this utf-8 byte has a busted prefix ({:b})...returning 1".format(count))
            # Actually return 1 as the message promises, rather than
            # falling through and returning the length.
            return 1
        return len(count)

    def point(self):
        '''Return a tuple of the "codepoint" field and its size in bits.'''
        # The size is what is left of the 8 bits after "count" and "zero".
        size = 8 - (1 + len(self['count']))
        return self['codepoint'], size
class restbyte(restbyte):
    '''Extends the previous `restbyte` definition with a codepoint accessor.'''

    def point(self):
        '''Return a tuple of the "codepoint" field and its size, which is always 6.'''
        size = 6
        return self['codepoint'], size
class rest_of_points(rest_of_points):
    '''Same array as before, re-pointed at the redefined `restbyte` element type.'''

    _object_ = restbyte
class utf8char(pbinary.struct):
    '''since we-redefined our type (firstbyte), we need to re-assign them into these fields'''

    def __rest(self):
        '''Return an array type for the "rest" field, sized from the "first" field.'''
        first = self['first']

        # Ask once: get_number_bytes() prints diagnostics for malformed
        # bytes, so calling it repeatedly would repeat them.
        count = first.get_number_bytes()

        class rest_of_bytes(rest_of_points):
            '''rest_of_bytes = dyn.clone(rest_of_points, length=first.get_number_bytes())'''

        # subtract 1, for the first byte (never letting the length go negative)
        rest_of_bytes.length = max(0, count - 1)
        return rest_of_bytes

    _fields_ = [
        (firstbyte, 'first'),
        (__rest, 'rest'),
    ]

    def point(self):
        '''Combine the codepoint bits of "first" and every item in "rest" into a single integer.'''
        result, _ = self['first'].point()
        for item in self['rest']:
            integer, bits = item.point()
            # Make room for exactly as many bits as this item contributes.
            # Shifting by the previous field's width would overlap the bits
            # whenever the first byte carries non-zero codepoint bits.
            result = (result << bits) | integer
        return result

    def setchar(self, integer, length):
        '''Assign a codepoint "integer" into a utf-8 character of "length" bytes.

        "length" must be between 1 and 4 inclusive. Returns the value that
        was allocated for the character.

        NOTE(review): with a length of 1 this produces a one-element "count"
        prefix, which firstbyte.get_number_bytes() reports as a busted
        prefix -- confirm whether single-byte characters are meant to be
        supported here.
        '''
        assert(1 <= length <= 4)

        # our "ones" field is literally an array of 1s for the desired length
        count = ones().alloc([1] * length)

        # now we can alloc the definition using the "ones" array we stored in "count"
        first = firstbyte().alloc(count=count)

        # now we can reconstruct ourself using the "first" variable which sets the length for "rest"
        result = self.alloc(first=first)

        # iterate through the array in "rest" (last byte first) and chop up
        # 6-bits out of our "integer" for each item
        for item in result['rest'][::-1]:
            item.set(**{'one-oh': 0b10})
            item.set(codepoint=integer & 0b00111111)
            integer >>= 6

        # whatever is left we can set into the "first" field
        result['first'].set(codepoint=integer)
        return result
# Encode 0x20ac as a 4-byte character, show its parts, then turn it back into a character.
x = utf8char().setchar(0x20ac, 4)
print(x['first'])
for item in x['rest']:
    print(item)
print(x)
print(chr(x.point()))