Skip to content

Commit c60ca96

Browse files
committed
Add two new flags to utf8_to_uv_msgs()
These new flags subsume the functionality of force_out_malformed_utf8_message(). The guts of that function are moved to this one. This commit resulted from my observation that a lot of code that calls some form of utf8_to_uv() tests whether the call succeeded and, if it failed, immediately calls force_out_malformed_utf8_message(). It's better to implement a common paradigm in just one place, and this commit paves the way for that.
1 parent 4edac50 commit c60ca96

File tree

5 files changed

+87
-44
lines changed

5 files changed

+87
-44
lines changed

embed.fnc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,7 +1241,7 @@ p |void |force_locale_unlock
12411241
Cp |void |force_out_malformed_utf8_message_ \
12421242
|NN const U8 * const p \
12431243
|NN const U8 * const e \
1244-
|const U32 flags \
1244+
|U32 flags \
12451245
|const bool die_here
12461246
Adfpv |char * |form |NN const char *pat \
12471247
|...
@@ -3757,27 +3757,27 @@ ATdmp |bool |utf8_to_uv_errors \
37573757
|NN const U8 * const e \
37583758
|NN UV *cp_p \
37593759
|NULLOK Size_t *advance_p \
3760-
|const U32 flags \
3760+
|U32 flags \
37613761
|NULLOK U32 *errors
37623762
ATdmp |bool |utf8_to_uv_flags \
37633763
|NN const U8 * const s \
37643764
|NN const U8 * const e \
37653765
|NN UV *cp_p \
37663766
|NULLOK Size_t *advance_p \
3767-
|const U32 flag
3767+
|U32 flags
37683768
ATdip |bool |utf8_to_uv_msgs|NN const U8 * const s0 \
37693769
|NN const U8 *e \
37703770
|NN UV *cp_p \
37713771
|NULLOK Size_t *advance_p \
3772-
|const U32 flags \
3772+
|U32 flags \
37733773
|NULLOK U32 *errors \
37743774
|NULLOK AV **msgs
37753775
CTp |bool |utf8_to_uv_msgs_helper_ \
37763776
|NN const U8 * const s0 \
37773777
|NN const U8 * const e \
37783778
|NN UV *cp_p \
37793779
|NULLOK Size_t *advance_p \
3780-
|const U32 flags \
3780+
|U32 flags \
37813781
|NULLOK U32 *errors \
37823782
|NULLOK AV **msgs
37833783
CDbdp |UV |utf8_to_uvuni |NN const U8 *s \

inline.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3053,7 +3053,7 @@ Perl_utf8_to_uv_msgs(const U8 * const s0,
30533053
const U8 * const e,
30543054
UV * cp_p,
30553055
Size_t *advance_p,
3056-
const U32 flags,
3056+
U32 flags,
30573057
U32 * errors,
30583058
AV ** msgs)
30593059
{
@@ -3142,7 +3142,7 @@ PERL_STATIC_INLINE UV
31423142
Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
31433143
STRLEN curlen,
31443144
STRLEN *retlen,
3145-
const U32 flags,
3145+
U32 flags,
31463146
U32 * errors,
31473147
AV ** msgs)
31483148
{

proto.h

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utf8.c

Lines changed: 73 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ Perl_force_out_malformed_utf8_message_(pTHX_
5151
const U8 *const p, /* First byte in UTF-8 sequence */
5252
const U8 * const e, /* Final byte in sequence (may include
5353
multiple chars) */
54-
const U32 flags, /* Flags to pass to utf8_to_uv(),
54+
U32 flags, /* Flags to pass to utf8_to_uv(),
5555
usually 0, or some DISALLOW flags */
5656
const bool die_here) /* If TRUE, this function does not return */
5757
{
@@ -70,29 +70,17 @@ Perl_force_out_malformed_utf8_message_(pTHX_
7070
* flexibility is here to return to the caller so they can finish up and
7171
* die themselves */
7272
U32 errors;
73+
UV dummy;
7374

74-
ENTER;
75-
SAVEI8(PL_dowarn);
76-
SAVESPTR(PL_curcop);
77-
78-
PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
79-
if (PL_curcop) {
80-
SAVECURCOPWARNINGS();
81-
PL_curcop->cop_warnings = pWARN_ALL;
82-
}
83-
84-
(void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
85-
86-
LEAVE;
75+
flags &= ~UTF8_CHECK_ONLY;
76+
flags |= (die_here) ? UTF8_DIE_IF_MALFORMED
77+
: UTF8_FORCE_WARN_IF_MALFORMED;
78+
(void) utf8_to_uv_errors(p, e, &dummy, NULL, flags, &errors);
8779

8880
if (! errors) {
8981
Perl_croak(aTHX_ "panic: force_out_malformed_utf8_message_ should"
9082
" be called only when there are errors found");
9183
}
92-
93-
if (die_here) {
94-
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
95-
}
9684
}
9785

9886
STATIC HV *
@@ -1271,6 +1259,16 @@ C<*retlen> with the C<uvchr> family of functions (for the worse). It is not
12711259
likely to be of use to you. You can use C<UTF8_ALLOW_ANY> (described below) to
12721260
also turn off warnings, and that flag doesn't adversely affect C<*retlen>.
12731261
1262+
=item C<UTF8_FORCE_WARN_IF_MALFORMED>
1263+
1264+
Normally, no warnings are generated if warnings are turned off lexically or
1265+
globally, regardless of any flags to the contrary. But this flag effectively
1266+
turns on warnings temporarily for the duration of this function's execution.
1267+
1268+
Do not use it lightly.
1269+
1270+
This flag is ignored if C<UTF8_CHECK_ONLY> is also set.
1271+
12741272
=item C<UTF8_DISALLOW_SURROGATE>
12751273
12761274
=item C<UTF8_WARN_SURROGATE>
@@ -1365,6 +1363,14 @@ C<UTF8_ALLOW_ANY> which applies to any of the syntactic malformations and
13651363
overflow, except for empty input. The other flags are analogous to ones in
13661364
the C<_GOT_> bits list in C<L</utf8_to_uv_msgs>>.
13671365
1366+
=item C<UTF8_DIE_IF_MALFORMED>
1367+
1368+
If the function would otherwise return C<false>, it instead croaks. The
1369+
C<UTF8_FORCE_WARN_IF_MALFORMED> flag is effectively turned on so that the cause
1370+
of the croak is displayed.
1371+
1372+
This flag is ignored if C<UTF8_CHECK_ONLY> is also set.
1373+
13681374
=back
13691375
13701376
=for apidoc utf8_to_uv_msgs
@@ -1516,6 +1522,8 @@ function creates a new AV to store information, described below, about all
15161522
the malformations that were encountered.
15171523
15181524
If the flag C<UTF8_CHECK_ONLY> is passed, this parameter is ignored.
1525+
Otherwise, when this parameter is set, the flags C<UTF8_DIE_IF_MALFORMED> and
1526+
C<UTF8_FORCE_WARN_IF_MALFORMED> are ignored.
15191527
15201528
What is considered a malformation is affected by C<flags>, the same as
15211529
described in C<L</utf8_to_uv_flags>>. No array element is generated for
@@ -1592,7 +1600,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
15921600
const U8 * const e,
15931601
UV *cp_p,
15941602
Size_t *advance_p,
1595-
const U32 flags,
1603+
U32 flags,
15961604
U32 * errors,
15971605
AV ** msgs)
15981606
{
@@ -1627,6 +1635,9 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
16271635
}
16281636
if (UNLIKELY(msgs)) {
16291637
*msgs = NULL;
1638+
1639+
/* The msgs parameter has higher priority than these flags */
1640+
flags &= ~(UTF8_DIE_IF_MALFORMED|UTF8_FORCE_WARN_IF_MALFORMED);
16301641
}
16311642

16321643
/* Each of the affected Hanguls starts with \xED */
@@ -1998,12 +2009,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
19982009
* the warning category to use for the message.
19992010
*
20002011
* No message need be generated if the UTF8_CHECK_ONLY flag has been
2001-
* set by the caller. Otherwise, a message should be generated if
2002-
* either:
2012+
* set by the caller. Otherwise, a message should be generated if:
20032013
* 1) the caller has furnished a structure into which messages should
20042014
* be returned to it (so it itself can decide what to do); or
20052015
* 2) warnings are enabled for either of the category parameters to
2006-
* the macro.
2016+
* the macro; or
2017+
* 3) the special MALFORMED flags have been passed
20072018
*
20082019
* The 'warning' parameter is the higher priority warning category to
20092020
* check. The macro calls ckWARN_d(warning), so warnings for it are
@@ -2019,11 +2030,13 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
20192030
*
20202031
* When called without a second category, the macro outputs a bunch of
20212032
* zeroes that the compiler should fold to nothing */
2022-
#define NEED_MESSAGE(warning, extra_ckWARN, extra_category) \
2023-
((flags & UTF8_CHECK_ONLY) ? 0 : \
2024-
((ckWARN_d(warning)) ? warning : \
2025-
((extra_ckWARN(extra_category +0)) ? extra_category +0 : \
2026-
((msgs) ? warning : 0))))
2033+
#define NEED_MESSAGE(warning, extra_ckWARN, extra_category) \
2034+
((flags & UTF8_CHECK_ONLY) ? 0 : \
2035+
((ckWARN_d(warning)) ? warning : \
2036+
((extra_ckWARN(extra_category +0)) ? extra_category +0 : \
2037+
((flags & ( UTF8_DIE_IF_MALFORMED \
2038+
|UTF8_FORCE_WARN_IF_MALFORMED)) ? warning : \
2039+
((msgs) ? warning : 0)))))
20272040

20282041
while (possible_problems) { /* Handle each possible problem */
20292042
char * message = NULL;
@@ -2484,11 +2497,35 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
24842497
newRV_noinc((SV*) new_msg_hv(message, pack_warn,
24852498
this_flag_bit)));
24862499
}
2487-
else if (PL_op)
2488-
Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2489-
OP_DESC(PL_op));
2490-
else
2491-
Perl_warner(aTHX_ pack_warn, "%s", message);
2500+
else if (! (flags & UTF8_CHECK_ONLY)) {
2501+
if (UNLIKELY(flags & ( UTF8_DIE_IF_MALFORMED
2502+
|UTF8_FORCE_WARN_IF_MALFORMED)))
2503+
{
2504+
ENTER;
2505+
SAVEI8(PL_dowarn);
2506+
SAVESPTR(PL_curcop);
2507+
2508+
PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
2509+
if (PL_curcop) {
2510+
SAVECURCOPWARNINGS();
2511+
PL_curcop->cop_warnings = pWARN_ALL;
2512+
}
2513+
}
2514+
2515+
if (PL_op) {
2516+
Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2517+
OP_DESC(PL_op));
2518+
}
2519+
else {
2520+
Perl_warner(aTHX_ pack_warn, "%s", message);
2521+
}
2522+
2523+
if (UNLIKELY(flags & ( UTF8_DIE_IF_MALFORMED
2524+
|UTF8_FORCE_WARN_IF_MALFORMED)))
2525+
{
2526+
LEAVE;
2527+
}
2528+
}
24922529
}
24932530
} /* End of 'while (possible_problems)' */
24942531

@@ -2508,6 +2545,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
25082545
}
25092546

25102547
if (disallowed) {
2548+
if ((flags & ~UTF8_CHECK_ONLY) & UTF8_DIE_IF_MALFORMED) {
2549+
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
2550+
}
2551+
25112552
success = false;
25122553
uv = UNICODE_REPLACEMENT;
25132554
}

utf8.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1214,6 +1214,8 @@ point's representation.
12141214

12151215
#define UTF8_CHECK_ONLY 0x8000
12161216
#define UTF8_NO_CONFIDENCE_IN_CURLEN_ 0x10000 /* Internal core use only */
1217+
#define UTF8_DIE_IF_MALFORMED 0x20000
1218+
#define UTF8_FORCE_WARN_IF_MALFORMED 0x40000
12171219

12181220
/* For backwards source compatibility. They do nothing, as the default now
12191221
* includes what they used to mean. The first one's meaning was to allow the

0 commit comments

Comments
 (0)