@@ -3257,39 +3257,80 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
3257
3257
}
3258
3258
3259
3259
/*
3260
- =for apidoc bytes_to_utf8
3260
+ =for apidoc bytes_to_utf8
3261
+ =for apidoc_item bytes_to_utf8_free_me
3261
3262
3262
- Converts a string C<s> of length C<*lenp> bytes from the native encoding into
3263
- UTF-8.
3264
- Returns a pointer to the newly-created string, and sets C<*lenp> to
3265
- reflect the new length in bytes. The caller is responsible for arranging for
3266
- the memory used by this string to get freed.
3263
+ These each convert a string C<s> of length C<*lenp> bytes from the native
3264
+ encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
3265
+ the UTF-8 string, and setting C<*lenp> to its length in bytes.
3266
+
3267
+ C<bytes_to_utf8> always allocates new memory for the result, making sure it is
3268
+ NUL-terminated.
3269
+
3270
+ C<bytes_to_utf8_free_me> simply returns a pointer to the input string if the
3271
+ string's UTF-8 representation is the same as its native representation.
3272
+ Otherwise, it behaves like C<bytes_to_utf8>, returning a pointer to new memory
3273
+ containing the conversion of the input. In other words, it returns the input
3274
+ string if converting the string would be a no-op. Note that when no new string
3275
+ is allocated, the function can't add a NUL to the original string if one wasn't
3276
+ already there.
3277
+
3278
+ In both cases, the caller is responsible for arranging for any new memory to
3279
+ get freed.
3280
+
3281
+ C<bytes_to_utf8_free_me> takes an extra parameter, C<free_me>, to communicate
3282
+ to the caller that memory was allocated or not. If that parameter is NULL,
3283
+ C<bytes_to_utf8_free_me> acts identically to C<bytes_to_utf8>, always
3284
+ allocating new memory.
3285
+
3286
+ But when it is a non-NULL pointer, C<bytes_to_utf8_free_me> stores into it
3287
+ either NULL if no memory was allocated, or a pointer to that new memory. This
3288
+ allows the following convenient paradigm:
3289
+
3290
+ U8 * free_me;
3291
+ U8 * converted = bytes_to_utf8_free_me(string, &len, &free_me);
3292
+
3293
+ ...
3294
+
3295
+ Safefree(free_me);
3296
+
3297
+ You don't have to know if memory was allocated or not. Just call C<Safefree>
3298
+ unconditionally. C<free_me> will contain a suitable value to pass to
3299
+ C<Safefree> for it to do the right thing, regardless.
3267
3300
3268
3301
Upon return, the number of variants in the string can be computed by
3269
3302
having saved the value of C<*lenp> before the call, and subtracting it from the
3270
3303
after-call value of C<*lenp>.
3271
3304
3272
- A C<NUL> character will be written after the end of the string.
3273
-
3274
- If you want to convert to UTF-8 from encodings other than
3275
- the native (Latin1 or EBCDIC),
3276
- see L</sv_recode_to_utf8>().
3305
+ If you want to convert to UTF-8 from encodings other than the native (Latin1 or
3306
+ EBCDIC), see L</sv_recode_to_utf8>().
3277
3307
3278
3308
=cut
3279
3309
*/
3280
3310
3281
3311
U8 *
3282
- Perl_bytes_to_utf8 (pTHX_ const U8 * s , STRLEN * lenp )
3312
+ Perl_bytes_to_utf8_free_me (pTHX_ const U8 * s , Size_t * lenp ,
3313
+ const U8 * * free_me_ptr )
3283
3314
{
3315
+ PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME ;
3316
+ PERL_UNUSED_CONTEXT ;
3317
+
3284
3318
const U8 * const send = s + (* lenp );
3319
+ const Size_t variant_count = variant_under_utf8_count (s , send );
3320
+
3321
+ /* Return the input unchanged if the caller supplied a non-NULL
3322
+ * free_me_ptr and there are no characters that differ when represented in
3323
+ * UTF-8. No NUL is appended in this case, and *lenp is left as it was. */
3324
+ if (free_me_ptr != NULL && variant_count == 0 ) {
3325
+ * free_me_ptr = NULL ;
3326
+ return (U8 * ) s ;
3327
+ }
3328
+
3285
3329
U8 * d ;
3286
3330
U8 * dst ;
3287
3331
3288
- PERL_ARGS_ASSERT_BYTES_TO_UTF8 ;
3289
- PERL_UNUSED_CONTEXT ;
3290
-
3291
3332
/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
3292
- Newx (d , (* lenp ) + variant_under_utf8_count ( s , send ) + 1 , U8 );
3333
+ Newx (d , (* lenp ) + variant_count + 1 , U8 );
3293
3334
dst = d ;
3294
3335
3295
3336
while (s < send ) {
@@ -3298,7 +3339,11 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
3298
3339
}
3299
3340
3300
3341
* d = '\0' ;
3301
- * lenp = d - dst ;
3342
+ * lenp = d - dst ;
3343
+
3344
+ if (free_me_ptr != NULL ) {
3345
+ * free_me_ptr = dst ;
3346
+ }
3302
3347
3303
3348
return dst ;
3304
3349
}
0 commit comments