Skip to content

Commit 992f768

Browse files
committed
Add new function bytes_to_utf8_free_me
This is like bytes_to_utf8, but if the representation of the input string is the same in UTF-8 as it is in native format, the allocation of new memory is skipped. This presents optimization possibilities.
1 parent 3121b87 commit 992f768

File tree

5 files changed

+75
-21
lines changed

5 files changed

+75
-21
lines changed

embed.fnc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -794,8 +794,12 @@ Adp |int |bytes_cmp_utf8 |NN const U8 *b \
794794
Adp |U8 * |bytes_from_utf8|NN const U8 *s \
795795
|NN STRLEN *lenp \
796796
|NN bool *is_utf8p
797-
Adp |U8 * |bytes_to_utf8 |NN const U8 *s \
797+
Admp |U8 * |bytes_to_utf8 |NN const U8 *s \
798798
|NN STRLEN *lenp
799+
Adp |U8 * |bytes_to_utf8_free_me \
800+
|NN const U8 *s \
801+
|NN STRLEN *lenp \
802+
|NULLOK const U8 **free_me
799803
AOdp |SSize_t|call_argv |NN const char *sub_name \
800804
|I32 flags \
801805
|NN char **argv

embed.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,8 @@
155155
# define block_start(a) Perl_block_start(aTHX_ a)
156156
# define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
157157
# define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
158-
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
158+
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX,a,b)
159+
# define bytes_to_utf8_free_me(a,b,c) Perl_bytes_to_utf8_free_me(aTHX_ a,b,c)
159160
# define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv
160161
# define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c)
161162
# define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b)

proto.h

Lines changed: 5 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utf8.c

Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3257,39 +3257,80 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
32573257
}
32583258

32593259
/*
3260-
=for apidoc bytes_to_utf8
3260+
=for apidoc bytes_to_utf8
3261+
=for apidoc_item bytes_to_utf8_free_me
32613262
3262-
Converts a string C<s> of length C<*lenp> bytes from the native encoding into
3263-
UTF-8.
3264-
Returns a pointer to the newly-created string, and sets C<*lenp> to
3265-
reflect the new length in bytes. The caller is responsible for arranging for
3266-
the memory used by this string to get freed.
3263+
These each convert a string C<s> of length C<*lenp> bytes from the native
3264+
encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
3265+
the UTF-8 string, and setting C<*lenp> to its length in bytes.
3266+
3267+
C<bytes_to_utf8> always allocates new memory for the result, making sure it is
3268+
NUL-terminated.
3269+
3270+
C<bytes_to_utf8_free_me> simply returns a pointer to the input string if the
3271+
string's UTF-8 representation is the same as its native representation.
3272+
Otherwise, it behaves like C<bytes_to_utf8>, returning a pointer to new memory
3273+
containing the conversion of the input. In other words, it returns the input
3274+
string if converting the string would be a no-op. Note that when no new string
3275+
is allocated, the function can't add a NUL to the original string if one wasn't
3276+
already there.
3277+
3278+
In both cases, the caller is responsible for arranging for any new memory to
3279+
get freed.
3280+
3281+
C<bytes_to_utf8_free_me> takes an extra parameter, C<free_me>, to communicate
3282+
to the caller whether memory was allocated or not. If that parameter is NULL,
3283+
C<bytes_to_utf8_free_me> acts identically to C<bytes_to_utf8>, always
3284+
allocating new memory.
3285+
3286+
But when it is a non-NULL pointer, C<bytes_to_utf8_free_me> stores into it
3287+
either NULL, if no memory was allocated, or a pointer to the new memory. This
3288+
allows the following convenient paradigm:
3289+
3290+
U8 * free_me;
3291+
U8 * converted = bytes_to_utf8_free_me(string, &len, &free_me);
3292+
3293+
...
3294+
3295+
Safefree(free_me);
3296+
3297+
You don't have to know if memory was allocated or not. Just call C<Safefree>
3298+
unconditionally. C<free_me> will contain a suitable value to pass to
3299+
C<Safefree> for it to do the right thing, regardless.
32673300
32683301
Upon return, the number of variants in the string can be computed by
32693302
having saved the value of C<*lenp> before the call, and subtracting it from the
32703303
after-call value of C<*lenp>.
32713304
3272-
A C<NUL> character will be written after the end of the string.
3273-
3274-
If you want to convert to UTF-8 from encodings other than
3275-
the native (Latin1 or EBCDIC),
3276-
see L</sv_recode_to_utf8>().
3305+
If you want to convert to UTF-8 from encodings other than the native (Latin1 or
3306+
EBCDIC), see L</sv_recode_to_utf8>().
32773307
32783308
=cut
32793309
*/
32803310

32813311
U8*
3282-
Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
3312+
Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp,
3313+
const U8 ** free_me_ptr)
32833314
{
3315+
PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME;
3316+
PERL_UNUSED_CONTEXT;
3317+
32843318
const U8 * const send = s + (*lenp);
3319+
const Size_t variant_count = variant_under_utf8_count(s, send);
3320+
3321+
/* Return the input unchanged if the caller passed a non-NULL free_me_ptr
3322+
* and there are no characters that differ when represented in UTF-8. No
3323+
* check is made here that the original is NUL-terminated */
3324+
if (free_me_ptr != NULL && variant_count == 0) {
3325+
*free_me_ptr = NULL;
3326+
return (U8 *) s;
3327+
}
3328+
32853329
U8 *d;
32863330
U8 *dst;
32873331

3288-
PERL_ARGS_ASSERT_BYTES_TO_UTF8;
3289-
PERL_UNUSED_CONTEXT;
3290-
32913332
/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
3292-
Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
3333+
Newx(d, (*lenp) + variant_count + 1, U8);
32933334
dst = d;
32943335

32953336
while (s < send) {
@@ -3298,7 +3339,11 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
32983339
}
32993340

33003341
*d = '\0';
3301-
*lenp = d-dst;
3342+
*lenp = d - dst;
3343+
3344+
if (free_me_ptr != NULL) {
3345+
*free_me_ptr = dst;
3346+
}
33023347

33033348
return dst;
33043349
}

utf8.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,6 +1330,7 @@ point's representation.
13301330

13311331
#define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
13321332

1333+
#define Perl_bytes_to_utf8(mTHX, s, lenp) Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL)
13331334
typedef enum {
13341335
PL_utf8_to_bytes_overwrite = 0,
13351336
PL_utf8_to_bytes_new_memory,

0 commit comments

Comments
 (0)