bbStringFromUTF8Bytes optimization / bug avoidance

Heya, the original implementation of `bbStringFromUTF8Bytes` is:

```C

BBString *bbStringFromUTF8Bytes( const unsigned char *p,int n ){
	int c;
	unsigned short *d,*q;
	BBString *str;

	if( !p || n <= 0 ) return &bbEmptyString;
	
	d=(unsigned short*)malloc( n*2 );
	q=d;
	
	while( n-- && (c=*p++ & 0xff)){
		if( c<0x80 ){
			*q++=c;
		}else{
			if (!n--) break;
			int d=*p++ & 0x3f;
			if( c<0xe0 ){
				*q++=((c&31)<<6) | d;
			}else{
				if (!n--) break;
				int e=*p++ & 0x3f;
				if( c<0xf0 ){
					*q++=((c&15)<<12) | (d<<6) | e;
				}else{
					if (!n--) break;
					int f=*p++ & 0x3f;
					int v=((c&7)<<18) | (d<<12) | (e<<6) | f;
					if( v & 0xffff0000 ) {
						v -= 0x10000;
						d = ((v >> 10) & 0x7ff) + 0xd800;
						e = (v & 0x3ff) + 0xdc00;
						*q++=d;
						*q++=e;
					}else{
						*q++=v;
					}
				}
			}
		}
	}
	str=bbStringFromShorts( d,q-d );
	free( d );
	return str;
}
```

when passing UTF16 stuff to it, this implementation simply assumes they all just need 2 bytes.
I am not sure if it handles surrogate pairs somewhere ? 

So I thought of simply allocating 4 bytes (surrogates etc), also I return if malloc fails, and I replaced the `while(n-- ...)` part as it could become negative:
```
unsigned char test[] = { 0xF0, 0x9F, 0x92 };  // Missing final byte!
bbStringFromUTF8Bytes(test, 3);
```
(a simpler fix could b `if (n-- <= 0) break;`). Anyways - my code:

```C
BBString *bbStringFromUTF8Bytes(const unsigned char *p, int n) {
    int c;
    unsigned short *d, *q;
    BBString *str;

    if (!p || n <= 0) return &bbEmptyString;

    // Allocate enough space for worst-case scenario (4 bytes UTF-8 -> 4 bytes UTF-16)
    d = (unsigned short *)malloc(n * 4);
    if (!d) return &bbEmptyString; // Handle memory allocation failure
    q = d;

    while (n > 0 && (c = *p++ & 0xff)) {
        n--;

        if (c < 0x80) {
            *q++ = c; // 1-byte sequence
        } else if (c < 0xE0) { // 2-byte sequence
            if (n <= 0) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            n--;
            *q++ = ((c & 0x1F) << 6) | d;
        } else if (c < 0xF0) { // 3-byte sequence
            if (n <= 1) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            int e = *p++ & 0x3F;
            n -= 2;
            *q++ = ((c & 0x0F) << 12) | (d << 6) | e;
        } else { // 4-byte sequence (UTF-16 surrogate pair)
            if (n <= 2) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            int e = *p++ & 0x3F;
            int f = *p++ & 0x3F;
            n -= 3;
            int v = ((c & 0x07) << 18) | (d << 12) | (e << 6) | f;
            if (v > 0x10FFFF) break; // Invalid code point
            if (v >= 0x10000) { // Surrogate pair
                v -= 0x10000;
                *q++ = (v >> 10) + 0xD800;
                *q++ = (v & 0x3FF) + 0xDC00;
            } else {
                *q++ = v;
            }
        }
    }

    // Create BBString and clean up
    str = bbStringFromShorts(d, q - d);
    free(d);
    return str;
}
```

Then I asked chatgpt if that was correct - and if it can be improved:
```C
BBString *bbStringFromUTF8Bytes(const unsigned char *p, int n) {
    int c;
    unsigned short *d, *q, *end;
    BBString *str;

    if (!p || n <= 0) return &bbEmptyString;

    // Initial allocation: Assume 2 bytes per character on average.
    int cap = n * 2;
    d = (unsigned short *)malloc(cap * sizeof(unsigned short));
    if (!d) return &bbEmptyString;
    q = d;
    end = d + cap;

    while (n > 0 && (c = *p++ & 0xff)) {
        n--;

        if (q >= end - 2) { // Ensure enough space, account for surrogate pairs
            int used = q - d;
            cap *= 1.5; // Expand by 1.5x
            d = (unsigned short *)realloc(d, cap * sizeof(unsigned short));
            if (!d) return &bbEmptyString;
            q = d + used;
            end = d + cap;
        }

        if (c < 0x80) {
            *q++ = c; // 1-byte sequence
        } else if (c < 0xE0) { // 2-byte sequence
            if (n <= 0) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            n--;
            *q++ = ((c & 0x1F) << 6) | d;
        } else if (c < 0xF0) { // 3-byte sequence
            if (n <= 1) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            int e = *p++ & 0x3F;
            n -= 2;
            *q++ = ((c & 0x0F) << 12) | (d << 6) | e;
        } else { // 4-byte sequence (UTF-16 surrogate pair)
            if (n <= 2) break; // Incomplete sequence
            int d = *p++ & 0x3F;
            int e = *p++ & 0x3F;
            int f = *p++ & 0x3F;
            n -= 3;
            int v = ((c & 0x07) << 18) | (d << 12) | (e << 6) | f;
            if (v > 0x10FFFF) break; // Invalid code point

            if (v >= 0x10000) { // Surrogate pair
                v -= 0x10000;
                *q++ = (v >> 10) + 0xD800;
                *q++ = (v & 0x3FF) + 0xDC00;
            } else {
                *q++ = v;
            }
        }
    }

    // Shrink buffer to actual usage before creating the string
    int used = q - d;
    str = bbStringFromShorts(d, used);
    free(d);
    return str;
}
```

![Image](https://github.com/user-attachments/assets/efd83e92-a013-4a7b-9ad2-5408b2c2f51e)


... dunno if my code is useful (and/or if the chatgpt version is doing any better). @woollybah feel free to adopt/adjust/... or decline (because it just does stupid things) and close the issue afterwards.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

bbStringFromUTF8Bytes optimization / bug avoidance #359

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

bbStringFromUTF8Bytes optimization / bug avoidance #359

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions