-
Notifications
You must be signed in to change notification settings - Fork 12
Open
Description
Heya, the original implementation of bbStringFromUTF8Bytes
is:
BBString *bbStringFromUTF8Bytes( const unsigned char *p,int n ){
int c;
unsigned short *d,*q;
BBString *str;
if( !p || n <= 0 ) return &bbEmptyString;
d=(unsigned short*)malloc( n*2 );
q=d;
while( n-- && (c=*p++ & 0xff)){
if( c<0x80 ){
*q++=c;
}else{
if (!n--) break;
int d=*p++ & 0x3f;
if( c<0xe0 ){
*q++=((c&31)<<6) | d;
}else{
if (!n--) break;
int e=*p++ & 0x3f;
if( c<0xf0 ){
*q++=((c&15)<<12) | (d<<6) | e;
}else{
if (!n--) break;
int f=*p++ & 0x3f;
int v=((c&7)<<18) | (d<<12) | (e<<6) | f;
if( v & 0xffff0000 ) {
v -= 0x10000;
d = ((v >> 10) & 0x7ff) + 0xd800;
e = (v & 0x3ff) + 0xdc00;
*q++=d;
*q++=e;
}else{
*q++=v;
}
}
}
}
}
str=bbStringFromShorts( d,q-d );
free( d );
return str;
}
when passing UTF16 stuff to it, this implementation simply assumes they all just need 2 bytes.
I am not sure if it handles surrogate pairs somewhere ?
So I thought of simply allocating 4 bytes (surrogates etc), also I return if malloc fails, and I replaced the while(n-- ...)
part as it could become negative:
unsigned char test[] = { 0xF0, 0x9F, 0x92 }; // Missing final byte!
bbStringFromUTF8Bytes(test, 3);
(a simpler fix could b if (n-- <= 0) break;
). Anyways - my code:
BBString *bbStringFromUTF8Bytes(const unsigned char *p, int n) {
int c;
unsigned short *d, *q;
BBString *str;
if (!p || n <= 0) return &bbEmptyString;
// Allocate enough space for worst-case scenario (4 bytes UTF-8 -> 4 bytes UTF-16)
d = (unsigned short *)malloc(n * 4);
if (!d) return &bbEmptyString; // Handle memory allocation failure
q = d;
while (n > 0 && (c = *p++ & 0xff)) {
n--;
if (c < 0x80) {
*q++ = c; // 1-byte sequence
} else if (c < 0xE0) { // 2-byte sequence
if (n <= 0) break; // Incomplete sequence
int d = *p++ & 0x3F;
n--;
*q++ = ((c & 0x1F) << 6) | d;
} else if (c < 0xF0) { // 3-byte sequence
if (n <= 1) break; // Incomplete sequence
int d = *p++ & 0x3F;
int e = *p++ & 0x3F;
n -= 2;
*q++ = ((c & 0x0F) << 12) | (d << 6) | e;
} else { // 4-byte sequence (UTF-16 surrogate pair)
if (n <= 2) break; // Incomplete sequence
int d = *p++ & 0x3F;
int e = *p++ & 0x3F;
int f = *p++ & 0x3F;
n -= 3;
int v = ((c & 0x07) << 18) | (d << 12) | (e << 6) | f;
if (v > 0x10FFFF) break; // Invalid code point
if (v >= 0x10000) { // Surrogate pair
v -= 0x10000;
*q++ = (v >> 10) + 0xD800;
*q++ = (v & 0x3FF) + 0xDC00;
} else {
*q++ = v;
}
}
}
// Create BBString and clean up
str = bbStringFromShorts(d, q - d);
free(d);
return str;
}
Then I asked chatgpt if that was correct - and if it can be improved:
BBString *bbStringFromUTF8Bytes(const unsigned char *p, int n) {
int c;
unsigned short *d, *q, *end;
BBString *str;
if (!p || n <= 0) return &bbEmptyString;
// Initial allocation: Assume 2 bytes per character on average.
int cap = n * 2;
d = (unsigned short *)malloc(cap * sizeof(unsigned short));
if (!d) return &bbEmptyString;
q = d;
end = d + cap;
while (n > 0 && (c = *p++ & 0xff)) {
n--;
if (q >= end - 2) { // Ensure enough space, account for surrogate pairs
int used = q - d;
cap *= 1.5; // Expand by 1.5x
d = (unsigned short *)realloc(d, cap * sizeof(unsigned short));
if (!d) return &bbEmptyString;
q = d + used;
end = d + cap;
}
if (c < 0x80) {
*q++ = c; // 1-byte sequence
} else if (c < 0xE0) { // 2-byte sequence
if (n <= 0) break; // Incomplete sequence
int d = *p++ & 0x3F;
n--;
*q++ = ((c & 0x1F) << 6) | d;
} else if (c < 0xF0) { // 3-byte sequence
if (n <= 1) break; // Incomplete sequence
int d = *p++ & 0x3F;
int e = *p++ & 0x3F;
n -= 2;
*q++ = ((c & 0x0F) << 12) | (d << 6) | e;
} else { // 4-byte sequence (UTF-16 surrogate pair)
if (n <= 2) break; // Incomplete sequence
int d = *p++ & 0x3F;
int e = *p++ & 0x3F;
int f = *p++ & 0x3F;
n -= 3;
int v = ((c & 0x07) << 18) | (d << 12) | (e << 6) | f;
if (v > 0x10FFFF) break; // Invalid code point
if (v >= 0x10000) { // Surrogate pair
v -= 0x10000;
*q++ = (v >> 10) + 0xD800;
*q++ = (v & 0x3FF) + 0xDC00;
} else {
*q++ = v;
}
}
}
// Shrink buffer to actual usage before creating the string
int used = q - d;
str = bbStringFromShorts(d, used);
free(d);
return str;
}
... dunno if my code is useful (and/or if the chatgpt version is doing any better). @woollybah feel free to adopt/adjust/... or decline (because it just does stupid things) and close the issue afterwards.
Metadata
Metadata
Assignees
Labels
No labels