Skip to content

utf 8 decoding and encoding

Ali Rizvi-Santiago edited this page Nov 18, 2022 · 1 revision

A co-worker linked Ivan Fratric's presentation (https://www.youtube.com/watch?v=ERaRNsvCBrw), which inspired me to quickly implement utf-8. It's probably not useful.

Definitions (decoding)

class ones(pbinary.terminatedarray):
    '''An array of at most 4 items (each declared as 1) that terminates when a 0-bit is encountered'''
    _object_ = 1
    length = 4

    def isTerminator(self, bit):
        '''Return whether `bit` compares equal to 0, which is what ends the array.'''
        is_zero = bit == 0
        return is_zero

class firstbyte(pbinary.struct):
    '''The leading byte: a "count" prefix, a "zero" bit, and a "codepoint" that pads the total out to 8 bits'''
    def __codepoint(self):
        '''Return the size of "codepoint", which is whatever remains of 8 after "count" and "zero".'''
        prefix = self['count']
        assert len(prefix) < 5

        # the items in "count", plus the "zero" field since it always exists
        used = 1 + len(prefix)
        return 8 - used

    _fields_ = [
        # 1111
        (ones, 'count'),
        # 0
        (1, 'zero'),
        (__codepoint, 'codepoint'),
    ]

class restbyte(pbinary.struct):
    '''Every byte after the first one: a 2-wide "one-oh" field followed by a 6-wide "codepoint" field'''
    _fields_ = [(2, 'one-oh'), (6, 'codepoint')]

class rest_of_points(pbinary.array):
    '''An array whose items are all `restbyte`, used for the "other" bytes of a character'''
    # no length is set here; the code that uses this type assigns one
    _object_ = restbyte

class utf8char(pbinary.struct):
    '''the character that checks the first field to determine the length of the array that follows it'''
    def __rest(self):
        '''Return the array type for the "rest" field, sized from the "first" field.

        The total number of bytes is taken to be the number of items in the
        "count" field of "first". If the "zero" field is not 0, or "count" has
        fewer than two items, the character is treated as a single byte so
        that the array never gets a negative length.
        '''
        first = self['first']

        # This is computed inline (rather than through a helper method on
        # `firstbyte`) so that it only relies on the fields that `firstbyte`
        # declares: "count" and "zero".
        ones_seen = len(first['count'])
        total = ones_seen if first['zero'] == 0 and ones_seen > 1 else 1

        class rest_of_bytes(rest_of_points):
            '''rest_of_bytes = dyn.clone(rest_of_points, length=total - 1)'''

        # subtract 1, for the first byte
        rest_of_bytes.length = total - 1
        return rest_of_bytes

    _fields_ = [
        (firstbyte, 'first'),
        (__rest, 'rest'),
    ]

Decoding using those definitions

A test to make sure it decodes okay.

# U+10FFFF has to be written with the 8-digit \U escape: \u only consumes 4 hex
# digits, so u'\u10ffff' is actually U+10FF followed by the two letters "ff".
source = ptypes.provider.bytes(u'\U0010ffff'.encode('utf-8'))
x = pbinary.new(utf8char, source=source)
x = x.l
print(x['first'])
for item in x['rest']:
    print(item)
print(x)

# A leading byte of 11110 111 followed by three 10 000000 bytes.
source = ptypes.provider.bytes(bytearray([0b11110111, 0b10000000, 0b10000000, 0b10000000]))
x = pbinary.new(utf8char, source=source).l
print(x['first'])
for item in x['rest']:
    print(item)
print(x)

# The codepoint 0x20AC spread across 4 bytes: 11110 000, 10 000010, 10 000010, 10 101100.
source = ptypes.provider.bytes(bytearray([0xF0, 0x82, 0x82, 0xAC]))
x = pbinary.new(utf8char, source=source)
print(x.l)
print(x['first'])
for item in x['rest']:
    print(item)

Definitions (Encoding)

Now to add some methods to the definitions so that we can encode utf-8 too.

class firstbyte(firstbyte):
    '''Extends the previous `firstbyte` with methods for validating it and for reading its size and codepoint'''
    def valid(self):
        '''this bit should always be zero'''
        return self['zero'] == 0

    def get_number_bytes(self):
        '''return the number of bytes this codepoint represents by counting the number of 1s in the "count" field

        If the "zero" field is not 0, or the "count" field has fewer than two
        items, a message is printed and 1 is returned instead.
        '''
        if self['zero'] != 0:
            print('this utf-8 byte is actually busted...returning 1')
            return 1

        count = self['count']
        if len(count) <= 1:
            print("this utf-8 byte has a busted prefix ({:b})...returning 1".format(self['count']))
            # actually return the 1 that the message promises; falling through
            # would return len(count), which can be 0
            return 1
        return len(count)

    def point(self):
        '''return a tuple for the codepoint and its size'''
        # 8 bits, minus the "zero" field and each item in the "count" field
        size = 8 - (1 + len(self['count']))
        return self['codepoint'], size

class restbyte(restbyte):
    '''Extends the previous `restbyte` with a way to read its codepoint bits'''
    def point(self):
        '''Return a (codepoint, size) tuple, where the size is always 6.'''
        size = 6
        return (self['codepoint'], size)

class rest_of_points(rest_of_points):
    # re-assign the item type, since `restbyte` was redefined
    _object_ = restbyte

class utf8char(pbinary.struct):
    '''since we-redefined our type (firstbyte), we need to re-assign them into these fields

    On top of the "first" and "rest" fields, this adds `point()` for reading
    the character as an integer and `setchar()` for assigning one.
    '''
    def __rest(self):
        '''Return the array type for the "rest" field, sized by what the "first" field reports.'''
        first = self['first']
        count = first.get_number_bytes()

        class rest_of_bytes(rest_of_points):
            '''rest_of_bytes = dyn.clone(rest_of_points, length=count - 1)'''

        # subtract 1, for the first byte (and never let the length go negative)
        rest_of_bytes.length = max(0, count - 1)
        return rest_of_bytes

    _fields_ = [
        (firstbyte, 'first'),
        (__rest, 'rest'),
    ]

    def point(self):
        '''call .point() on the "first" field and all of the items in the "rest" field and then combine them to an integer'''
        # the size of the first codepoint is not needed, since it ends up in the highest bits
        result, _ = self['first'].point()
        for item in self['rest']:
            integer, bits = item.point()
            # make room for exactly the number of bits that this item contributes
            result = (result << bits) | integer
        return result

    def setchar(self, integer, length):
        '''assign a codepoint "integer" into a utf-8 character of "length" bytes

        The "length" must be between 1 and 4. The low 6 bits of "integer" go
        into the last item of "rest", the next 6 bits into the item before it,
        and so on. Whatever remains is set into the "first" field. Returns the
        result of allocating this structure.
        '''
        assert 1 <= length <= 4

        # our "ones" field is literally an array of 1s for the desired length
        count = ones().alloc([1] * length)

        # now we can alloc the definition using the "ones" array we stored in "count"
        first = firstbyte().alloc(count=count)

        # now we can reconstruct ourself using the "first" variable which sets the length for "rest"
        result = self.alloc(first=first)

        # iterate backwards through the array in "rest" of the structure we just
        # allocated, and chop up 6-bits out of our "integer" for each item
        for item in result['rest'][::-1]:
            item.set(**{'one-oh': 0b10})
            item.set(codepoint=integer & 0b00111111)
            integer >>= 6

        # whatever is left we can set into the "first" field
        result['first'].set(codepoint=integer)
        return result

Encoding with those definitions

# encode the codepoint 0x20ac using 4 bytes
x = utf8char().setchar(0x20ac, 4)

# show the first byte, then every byte that follows it, then the whole character
print(x['first'])
for item in x['rest']:
    print(item)
print(x)

# convert it back to an integer, and show that integer as a character
codepoint = x.point()
print(chr(codepoint))