Skip to content

bbStringFromUTF8Bytes optimization / bug avoidance #359

@GWRon

Description

@GWRon

Heya, the original implementation of bbStringFromUTF8Bytes is:

BBString *bbStringFromUTF8Bytes( const unsigned char *p,int n ){
	int c;
	unsigned short *d,*q;
	BBString *str;

	if( !p || n <= 0 ) return &bbEmptyString;
	
	d=(unsigned short*)malloc( n*2 );
	q=d;
	
	while( n-- && (c=*p++ & 0xff)){
		if( c<0x80 ){
			*q++=c;
		}else{
			if (!n--) break;
			int d=*p++ & 0x3f;
			if( c<0xe0 ){
				*q++=((c&31)<<6) | d;
			}else{
				if (!n--) break;
				int e=*p++ & 0x3f;
				if( c<0xf0 ){
					*q++=((c&15)<<12) | (d<<6) | e;
				}else{
					if (!n--) break;
					int f=*p++ & 0x3f;
					int v=((c&7)<<18) | (d<<12) | (e<<6) | f;
					if( v & 0xffff0000 ) {
						v -= 0x10000;
						d = ((v >> 10) & 0x7ff) + 0xd800;
						e = (v & 0x3ff) + 0xdc00;
						*q++=d;
						*q++=e;
					}else{
						*q++=v;
					}
				}
			}
		}
	}
	str=bbStringFromShorts( d,q-d );
	free( d );
	return str;
}

when passing UTF16 stuff to it, this implementation simply assumes they all just need 2 bytes.
I am not sure if it handles surrogate pairs somewhere ?

So I thought of simply allocating 4 bytes (surrogates etc), also I return if malloc fails, and I replaced the while(n-- ...) part as it could become negative:

unsigned char test[] = { 0xF0, 0x9F, 0x92 };  // Missing final byte!
bbStringFromUTF8Bytes(test, 3);

(a simpler fix could b if (n-- <= 0) break;). Anyways - my code:

BBString *bbStringFromUTF8Bytes(const unsigned char *p, int n) {
    int c;
    unsigned short *d, *q;
    BBString *str;

    if (!p || n <= 0) return &bbEmptyString;

    // Allocate enough space for worst-case scenario (4 bytes UTF-8 -> 4 bytes UTF-16)
    d = (unsigned short *)malloc(n * 4);
    if (!d) return &bbEmptyString; // Handle memory allocation failure
    q = d;

    while (n > 0 && (c = *p++ & 0xff)) {
        n--;

        if (c < 0x80) {
            *q++ = c; // 1-byte sequence
        } else if (c < 0xE0) { // 2-byte sequence
            if (n <= 0) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            n--;
            *q++ = ((c & 0x1F) << 6) | d;
        } else if (c < 0xF0) { // 3-byte sequence
            if (n <= 1) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            int e = *p++ & 0x3F;
            n -= 2;
            *q++ = ((c & 0x0F) << 12) | (d << 6) | e;
        } else { // 4-byte sequence (UTF-16 surrogate pair)
            if (n <= 2) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            int e = *p++ & 0x3F;
            int f = *p++ & 0x3F;
            n -= 3;
            int v = ((c & 0x07) << 18) | (d << 12) | (e << 6) | f;
            if (v > 0x10FFFF) break; // Invalid code point
            if (v >= 0x10000) { // Surrogate pair
                v -= 0x10000;
                *q++ = (v >> 10) + 0xD800;
                *q++ = (v & 0x3FF) + 0xDC00;
            } else {
                *q++ = v;
            }
        }
    }

    // Create BBString and clean up
    str = bbStringFromShorts(d, q - d);
    free(d);
    return str;
}

Then I asked chatgpt if that was correct - and if it can be improved:

BBString *bbStringFromUTF8Bytes(const unsigned char *p, int n) {
    int c;
    unsigned short *d, *q, *end;
    BBString *str;

    if (!p || n <= 0) return &bbEmptyString;

    // Initial allocation: Assume 2 bytes per character on average.
    int cap = n * 2;
    d = (unsigned short *)malloc(cap * sizeof(unsigned short));
    if (!d) return &bbEmptyString;
    q = d;
    end = d + cap;

    while (n > 0 && (c = *p++ & 0xff)) {
        n--;

        if (q >= end - 2) { // Ensure enough space, account for surrogate pairs
            int used = q - d;
            cap *= 1.5; // Expand by 1.5x
            d = (unsigned short *)realloc(d, cap * sizeof(unsigned short));
            if (!d) return &bbEmptyString;
            q = d + used;
            end = d + cap;
        }

        if (c < 0x80) {
            *q++ = c; // 1-byte sequence
        } else if (c < 0xE0) { // 2-byte sequence
            if (n <= 0) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            n--;
            *q++ = ((c & 0x1F) << 6) | d;
        } else if (c < 0xF0) { // 3-byte sequence
            if (n <= 1) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            int e = *p++ & 0x3F;
            n -= 2;
            *q++ = ((c & 0x0F) << 12) | (d << 6) | e;
        } else { // 4-byte sequence (UTF-16 surrogate pair)
            if (n <= 2) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            int e = *p++ & 0x3F;
            int f = *p++ & 0x3F;
            n -= 3;
            int v = ((c & 0x07) << 18) | (d << 12) | (e << 6) | f;
            if (v > 0x10FFFF) break; // Invalid code point

            if (v >= 0x10000) { // Surrogate pair
                v -= 0x10000;
                *q++ = (v >> 10) + 0xD800;
                *q++ = (v & 0x3FF) + 0xDC00;
            } else {
                *q++ = v;
            }
        }
    }

    // Shrink buffer to actual usage before creating the string
    int used = q - d;
    str = bbStringFromShorts(d, used);
    free(d);
    return str;
}

Image

... dunno if my code is useful (and/or if the chatgpt version is doing any better). @woollybah feel free to adopt/adjust/... or decline (because it just does stupid things) and close the issue afterwards.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions