@@ -51,7 +51,7 @@ Perl_force_out_malformed_utf8_message_(pTHX_
51
51
const U8 * const p , /* First byte in UTF-8 sequence */
52
52
const U8 * const e , /* Final byte in sequence (may include
53
53
multiple chars */
54
- const U32 flags , /* Flags to pass to utf8_to_uv(),
54
+ U32 flags , /* Flags to pass to utf8_to_uv(),
55
55
usually 0, or some DISALLOW flags */
56
56
const bool die_here ) /* If TRUE, this function does not return */
57
57
{
@@ -70,29 +70,17 @@ Perl_force_out_malformed_utf8_message_(pTHX_
70
70
* flexibility is here to return to the caller so they can finish up and
71
71
* die themselves */
72
72
U32 errors ;
73
+ UV dummy ;
73
74
74
- ENTER ;
75
- SAVEI8 (PL_dowarn );
76
- SAVESPTR (PL_curcop );
77
-
78
- PL_dowarn = G_WARN_ALL_ON |G_WARN_ON ;
79
- if (PL_curcop ) {
80
- SAVECURCOPWARNINGS ();
81
- PL_curcop -> cop_warnings = pWARN_ALL ;
82
- }
83
-
84
- (void ) utf8n_to_uvchr_error (p , e - p , NULL , flags & ~UTF8_CHECK_ONLY , & errors );
85
-
86
- LEAVE ;
75
+ flags &= ~UTF8_CHECK_ONLY ;
76
+ flags |= (die_here ) ? UTF8_DIE_IF_MALFORMED
77
+ : UTF8_FORCE_WARN_IF_MALFORMED ;
78
+ (void ) utf8_to_uv_errors (p , e , & dummy , NULL , flags , & errors );
87
79
88
80
if (! errors ) {
89
81
Perl_croak (aTHX_ "panic: force_out_malformed_utf8_message_ should"
90
82
" be called only when there are errors found" );
91
83
}
92
-
93
- if (die_here ) {
94
- Perl_croak (aTHX_ "Malformed UTF-8 character (fatal)" );
95
- }
96
84
}
97
85
98
86
STATIC HV *
@@ -1271,6 +1259,16 @@ C<*retlen> with the C<uvchr> family of functions (for the worse). It is not
1271
1259
likely to be of use to you. You can use C<UTF8_ALLOW_ANY> (described below) to
1272
1260
also turn off warnings, and that flag doesn't adversely affect C<*retlen>.
1273
1261
1262
+ =item C<UTF8_FORCE_WARN_IF_MALFORMED>
1263
+
1264
+ Normally, no warnings are generated if warnings are turned off lexically or
1265
+ globally, regardless of any flags to the contrary. But this flag effectively
1266
+ turns on warnings temporarily for the duration of this function's execution.
1267
+
1268
+ Do not use it lightly.
1269
+
1270
+ This flag is ignored if C<UTF8_CHECK_ONLY> is also set.
1271
+
1274
1272
=item C<UTF8_DISALLOW_SURROGATE>
1275
1273
1276
1274
=item C<UTF8_WARN_SURROGATE>
@@ -1365,6 +1363,14 @@ C<UTF8_ALLOW_ANY> which applies to any of the syntactic malformations and
1365
1363
overflow, except for empty input. The other flags are analogous to ones in
1366
1364
the C<_GOT_> bits list in C<L</utf8_to_uv_msgs>>.
1367
1365
1366
+ =item C<UTF8_DIE_IF_MALFORMED>
1367
+
1368
+ If the function would otherwise return C<false>, it instead croaks. The
1369
+ C<UTF8_FORCE_WARN_IF_MALFORMED> flag is effectively turned on so that the cause
1370
+ of the croak is displayed.
1371
+
1372
+ This flag is ignored if C<UTF8_CHECK_ONLY> is also set.
1373
+
1368
1374
=back
1369
1375
1370
1376
=for apidoc utf8_to_uv_msgs
@@ -1516,6 +1522,8 @@ function creates a new AV to store information, described below, about all
1516
1522
the malformations that were encountered.
1517
1523
1518
1524
If the flag C<UTF8_CHECK_ONLY> is passed, this parameter is ignored.
1525
+ Otherwise, when this parameter is set, the flags C<UTF8_DIE_IF_MALFORMED> and
1526
+ C<UTF8_FORCE_WARN_IF_MALFORMED> are ignored.
1519
1527
1520
1528
What is considered a malformation is affected by C<flags>, the same as
1521
1529
described in C<L</utf8_to_uv_flags>>. No array element is generated for
@@ -1592,7 +1600,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1592
1600
const U8 * const e ,
1593
1601
UV * cp_p ,
1594
1602
Size_t * advance_p ,
1595
- const U32 flags ,
1603
+ U32 flags ,
1596
1604
U32 * errors ,
1597
1605
AV * * msgs )
1598
1606
{
@@ -1627,6 +1635,9 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1627
1635
}
1628
1636
if (UNLIKELY (msgs )) {
1629
1637
* msgs = NULL ;
1638
+
1639
+ /* The msgs parameter has higher priority than these flags */
1640
+ flags &= ~(UTF8_DIE_IF_MALFORMED |UTF8_FORCE_WARN_IF_MALFORMED );
1630
1641
}
1631
1642
1632
1643
/* Each of the affected Hanguls starts with \xED */
@@ -1998,12 +2009,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1998
2009
* the warning category to use for the message..
1999
2010
*
2000
2011
* No message need be generated if the UTF8_CHECK_ONLY flag has been
2001
- * set by the caller. Otherwise, a message should be generated if
2002
- * either:
2012
+ * set by the caller. Otherwise, a message should be generated if:
2003
2013
* 1) the caller has furnished a structure into which messages should
2004
2014
* be returned to it (so it itself can decide what to do); or
2005
2015
* 2) warnings are enabled for either of the category parameters to
2006
- * the macro.
2016
+ * the macro; or
2017
+ * 3) the special MALFORMED flags have been passed
2007
2018
*
2008
2019
* The 'warning' parameter is the higher priority warning category to
2009
2020
* check. The macro calls ckWARN_d(warning), so warnings for it are
@@ -2019,11 +2030,13 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2019
2030
*
2020
2031
* When called without a second category, the macro outputs a bunch of
2021
2032
* zeroes that the compiler should fold to nothing */
2022
- #define NEED_MESSAGE (warning , extra_ckWARN , extra_category ) \
2023
- ((flags & UTF8_CHECK_ONLY) ? 0 : \
2024
- ((ckWARN_d(warning)) ? warning : \
2025
- ((extra_ckWARN(extra_category +0)) ? extra_category +0 : \
2026
- ((msgs) ? warning : 0))))
2033
+ #define NEED_MESSAGE (warning , extra_ckWARN , extra_category ) \
2034
+ ((flags & UTF8_CHECK_ONLY) ? 0 : \
2035
+ ((ckWARN_d(warning)) ? warning : \
2036
+ ((extra_ckWARN(extra_category +0)) ? extra_category +0 : \
2037
+ ((flags & ( UTF8_DIE_IF_MALFORMED \
2038
+ |UTF8_FORCE_WARN_IF_MALFORMED)) ? warning : \
2039
+ ((msgs) ? warning : 0)))))
2027
2040
2028
2041
while (possible_problems ) { /* Handle each possible problem */
2029
2042
char * message = NULL ;
@@ -2484,11 +2497,35 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2484
2497
newRV_noinc ((SV * ) new_msg_hv (message , pack_warn ,
2485
2498
this_flag_bit )));
2486
2499
}
2487
- else if (PL_op )
2488
- Perl_warner (aTHX_ pack_warn , "%s in %s" , message ,
2489
- OP_DESC (PL_op ));
2490
- else
2491
- Perl_warner (aTHX_ pack_warn , "%s" , message );
2500
+ else if (! (flags & UTF8_CHECK_ONLY )) {
2501
+ if (UNLIKELY (flags & ( UTF8_DIE_IF_MALFORMED
2502
+ |UTF8_FORCE_WARN_IF_MALFORMED )))
2503
+ {
2504
+ ENTER ;
2505
+ SAVEI8 (PL_dowarn );
2506
+ SAVESPTR (PL_curcop );
2507
+
2508
+ PL_dowarn = G_WARN_ALL_ON |G_WARN_ON ;
2509
+ if (PL_curcop ) {
2510
+ SAVECURCOPWARNINGS ();
2511
+ PL_curcop -> cop_warnings = pWARN_ALL ;
2512
+ }
2513
+ }
2514
+
2515
+ if (PL_op ) {
2516
+ Perl_warner (aTHX_ pack_warn , "%s in %s" , message ,
2517
+ OP_DESC (PL_op ));
2518
+ }
2519
+ else {
2520
+ Perl_warner (aTHX_ pack_warn , "%s" , message );
2521
+ }
2522
+
2523
+ if (UNLIKELY (flags & ( UTF8_DIE_IF_MALFORMED
2524
+ |UTF8_FORCE_WARN_IF_MALFORMED )))
2525
+ {
2526
+ LEAVE ;
2527
+ }
2528
+ }
2492
2529
}
2493
2530
} /* End of 'while (possible_problems)' */
2494
2531
@@ -2508,6 +2545,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2508
2545
}
2509
2546
2510
2547
if (disallowed ) {
2548
+ if ((flags & ~UTF8_CHECK_ONLY ) & UTF8_DIE_IF_MALFORMED ) {
2549
+ Perl_croak (aTHX_ "Malformed UTF-8 character (fatal)" );
2550
+ }
2551
+
2511
2552
success = false;
2512
2553
uv = UNICODE_REPLACEMENT ;
2513
2554
}
0 commit comments