@@ -1801,7 +1801,11 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1801
1801
* small code point, it is still using this Perl invention, so mark it
1802
1802
* as such */
1803
1803
if (UNLIKELY (UTF8_IS_PERL_EXTENDED (s0 ))) {
1804
- possible_problems |= UTF8_GOT_SUPER ;
1804
+ if (flags & ( UTF8_DISALLOW_PERL_EXTENDED |UTF8_DISALLOW_SUPER
1805
+ |UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER ))
1806
+ {
1807
+ possible_problems |= UTF8_GOT_PERL_EXTENDED ;
1808
+ }
1805
1809
}
1806
1810
else {
1807
1811
/* See if the input has malformations besides possibly overlong */
@@ -2239,56 +2243,177 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2239
2243
}
2240
2244
2241
2245
break ;
2246
+
2247
+ /* The remaining cases all involve non-Unicode code points.
2248
+ * These come in three increasingly restrictive flavors.
2249
+ * SUPERs are simply all the ones above Unicode;
2250
+ * PERL_EXTENDED_UTF8 are the subset of these that are
2251
+ * expressed in a non-standard extension to UTF-8. Unless also
2252
+ * overlong, these have a very high ordinal value. Finally
2253
+ * OVERFLOWS are for such a high code point that they don't fit
2254
+ * into the word size of this platform. Perl extended-UTF-8 is
2255
+ * required to express code points this high. So an overflow
2256
+ * is a member of all three flavors; besides overflowing, it
2257
+ * also is using perl extended UTF-8 and is also plain
2258
+ * non-Unicode.
2259
+ *
2260
+ * There are cases in this switch for each of the three types.
2261
+ * Because they are related, there are tests of the input flags
2262
+ * to see what combination of these require warnings and/or
2263
+ * rejection. And there a jumps between the cases. The task
2264
+ * is simpler because the code earlier in the function has set
2265
+ * things up so that at most one problem flag bit is set for
2266
+ * any of them, the most restrictive case the input matches.
2267
+ * Also, for the non-overflow cases, there is no problem flag
2268
+ * bit if the caller doesn't want special handling for it.
2269
+ *
2270
+ * Each type has its own warning category and text,
2271
+ * corresponding to the specific problem. Whenever a warning
2272
+ * is generated, it uses the one for the most dire type the
2273
+ * code point fits into. Suppose the flags say we warn on all
2274
+ * non-Unicode code points, but not on overflowing and we get a
2275
+ * code point too large for the platform. The generated
2276
+ * warning will be the text that says it overflowed, while the
2277
+ * returned bit will be for the SUPER type. To accomplish
2278
+ * this, the formats are shared between the cases. 'cp_format'
2279
+ * is used if there is a specific representable code point that
2280
+ * the input translates to; if not, instead a more generic
2281
+ * format, 'non_cp_format' is used */
2282
+ const char * cp_format ;
2283
+ const char * non_cp_format ;
2284
+
2242
2285
case UTF8_GOT_OVERFLOW :
2286
+ uv = UNICODE_REPLACEMENT ; /* Can't represent this on this
2287
+ platform */
2288
+ /* For this overflow case, any format and message text are set
2289
+ * up to create the warning for it. If overflows are to be
2290
+ * rejected, the warning is simply created, and we break to the
2291
+ * end of the switch() (where code common to all cases will
2292
+ * finish the job). Otherwise it looks to see if either the
2293
+ * perl-extended or plain super cases are supposed to handle
2294
+ * things. If so, it jumps into the code of the most
2295
+ * restrictive one so that that they will use this more dire
2296
+ * warning. If neither handle it, the code just breaks; doing
2297
+ * nothing. */
2298
+ non_cp_format = MALFORMED_TEXT ": %s (overflows)" ;
2299
+
2300
+ /* We can't exactly specify such a large code point, so can't
2301
+ * output it */
2302
+ cp_format = NULL ;
2303
+
2304
+ /* In the unlikely case that the caller has asked to "allow"
2305
+ * this malformation, we transfer to the next lower severity of
2306
+ * code that handles the case; or just 'break' if none. */
2307
+ if (UNLIKELY (flags & UTF8_ALLOW_OVERFLOW )) {
2308
+ if (flags & ( UTF8_DISALLOW_PERL_EXTENDED
2309
+ |UTF8_WARN_PERL_EXTENDED ))
2310
+ {
2311
+ this_flag_bit = UTF8_GOT_PERL_EXTENDED ;
2312
+ goto join_perl_extended ;
2313
+ }
2314
+ if (flags & (UTF8_DISALLOW_SUPER |UTF8_WARN_SUPER )) {
2315
+ this_flag_bit = UTF8_GOT_SUPER ;
2316
+ goto join_plain_supers ;
2317
+ }
2243
2318
2244
- /* Overflow means also got a super and are using Perl's
2245
- * extended UTF-8, but we handle all three cases here */
2246
- possible_problems &= ~(UTF8_GOT_SUPER |UTF8_GOT_PERL_EXTENDED );
2247
- uv = UNICODE_REPLACEMENT ;
2319
+ break ;
2320
+ }
2248
2321
2249
- /* But the API says we flag all errors found */
2250
- if (flags & (UTF8_WARN_SUPER |UTF8_DISALLOW_SUPER )) {
2251
- error_flags_return |= UTF8_GOT_SUPER ;
2322
+ /* Here, overflow is disallowed; handle everything in this
2323
+ * case: */
2324
+ disallowed = true;
2325
+
2326
+ /* Overflow is a hybrid. If the word size on this platform
2327
+ * were wide enough for this to not overflow, a non-Unicode
2328
+ * code point would have been generated. If the caller wanted
2329
+ * warnings for such code points, the warning category would be
2330
+ * WARN_NON_UNICODE, On the other hand, overflow is considered
2331
+ * a malformation, which is serious, and the category would be
2332
+ * just WARN_UTF8. We clearly should warn if either category
2333
+ * is enabled, but which category to use? Historically, we've
2334
+ * used 'utf8' if it is enabled; and that seems like the more
2335
+ * severe category, more befitting a malformation. */
2336
+ pack_warn = NEED_MESSAGE (WARN_UTF8 , ckWARN_d , WARN_NON_UNICODE );
2337
+ if (pack_warn ) {
2338
+ message = Perl_form (aTHX_ non_cp_format ,
2339
+ _byte_dump_string (s0 , curlen , 0 ));
2252
2340
}
2253
- if (flags
2254
- & (UTF8_WARN_PERL_EXTENDED |UTF8_DISALLOW_PERL_EXTENDED ))
2341
+
2342
+ /* But the API says we flag all errors found that the calling
2343
+ * flags indicate should be */
2344
+ if (flags & ( UTF8_WARN_PERL_EXTENDED
2345
+ |UTF8_DISALLOW_PERL_EXTENDED ))
2255
2346
{
2256
2347
error_flags_return |= UTF8_GOT_PERL_EXTENDED ;
2257
2348
}
2258
-
2259
- /* Disallow if any of the three categories say to */
2260
- if ( ! (flags & UTF8_ALLOW_OVERFLOW )
2261
- || (flags & ( UTF8_DISALLOW_SUPER
2262
- |UTF8_DISALLOW_PERL_EXTENDED )))
2263
- {
2264
- disallowed = TRUE;
2349
+ if (flags & (UTF8_WARN_SUPER |UTF8_DISALLOW_SUPER )) {
2350
+ error_flags_return |= UTF8_GOT_SUPER ;
2265
2351
}
2266
2352
2267
- /* Likewise, warn if any say to */
2268
- if ( ! (flags & UTF8_ALLOW_OVERFLOW )
2269
- || (flags & (UTF8_WARN_SUPER |UTF8_WARN_PERL_EXTENDED )))
2353
+ break ;
2354
+
2355
+ case UTF8_GOT_PERL_EXTENDED :
2356
+
2357
+ /* We get here when the input uses Perl extended UTF-8, and the
2358
+ * caller has indicated that above-Unicode code points (of
2359
+ * which these are a subset) are to be disallowed and/or warned
2360
+ * about
2361
+ *
2362
+ * Set up the formats. We can include the code point in the
2363
+ * message if we have an exact one (input not too short) and
2364
+ * it's not an overlong that reduces down to something too low.
2365
+ * (Otherwise, the message could say something untrue like
2366
+ * "Code point 0x41 is not Unicode ...". But this would be a
2367
+ * lie; 0x41 is Unicode. It was expressed in a non-standard
2368
+ * form of UTF-8 that Unicode doesn't approve of.) */
2369
+ cp_format = ( (orig_problems & (UTF8_GOT_TOO_SHORT ))
2370
+ || ! UNICODE_IS_PERL_EXTENDED (uv ))
2371
+ ? NULL
2372
+ : PL_extended_cp_format ;
2373
+ non_cp_format = "Any UTF-8 sequence that starts with \"%s\""
2374
+ " is a Perl extension, and so is not portable" ;
2375
+
2376
+ /* We know here that the caller indicated at least one of the
2377
+ * EXTENDED or SUPER flags. If it's not EXTENDED, use SUPER */
2378
+ if (! (flags & ( UTF8_DISALLOW_PERL_EXTENDED
2379
+ |UTF8_WARN_PERL_EXTENDED )))
2270
2380
{
2381
+ this_flag_bit = UTF8_GOT_SUPER ;
2382
+ }
2271
2383
2272
- /* Overflow is a hybrid. If the word size on this platform
2273
- * were wide enough for this to not overflow, a non-Unicode
2274
- * code point would have been generated. If the caller
2275
- * wanted warnings for such code points, the warning
2276
- * category would be WARN_NON_UNICODE, On the other hand,
2277
- * overflow is considered a malformation, which is serious,
2278
- * and the category would be just WARN_UTF8. We clearly
2279
- * should warn if either category is enabled, but which
2280
- * category to use? Historically, we've used 'utf8' if it
2281
- * is enabled; and that seems like the more severe
2282
- * category, more befitting a malformation. */
2283
- pack_warn = NEED_MESSAGE (WARN_UTF8 ,
2284
- ckWARN_d , WARN_NON_UNICODE );
2285
- if (pack_warn ) {
2286
- message = Perl_form (aTHX_ MALFORMED_TEXT
2287
- ": %s (overflows)" ,
2288
- _byte_dump_string (s0 , curlen , 0 ));
2384
+ join_perl_extended :
2385
+
2386
+ /* Here this level is to warn, reject, or both. The format has
2387
+ * been set up to be for this level, or maybe the overflow
2388
+ * case set up a more dire warning and jumped to the label just
2389
+ * above (after determining that warning/rejecting here was
2390
+ * enabled). We warn at this level if either it is supposed to
2391
+ * warn, or plain supers are supposed to. In the latter case,
2392
+ * we get this higher severity warning */
2393
+ if (flags & (UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER )) {
2394
+ error_flags_return |= this_flag_bit ;
2395
+
2396
+ /* These code points are non-portable, so warn if either
2397
+ * category is enabled */
2398
+ if (NEED_MESSAGE (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE )) {
2399
+ pack_warn = packWARN2 (WARN_NON_UNICODE , WARN_PORTABLE );
2400
+ if (cp_format ) {
2401
+ message = Perl_form (aTHX_ cp_format , uv );
2402
+ }
2403
+ else {
2404
+ message = Perl_form (aTHX_
2405
+ non_cp_format ,
2406
+ _byte_dump_string (s0 , curlen , 0 ));
2407
+ }
2289
2408
}
2290
2409
}
2291
2410
2411
+ /* Similarly if either of the two levels reject this, do it */
2412
+ if (flags & (UTF8_DISALLOW_PERL_EXTENDED |UTF8_DISALLOW_SUPER )) {
2413
+ disallowed = true;
2414
+ error_flags_return |= this_flag_bit ;
2415
+ }
2416
+
2292
2417
break ;
2293
2418
2294
2419
case UTF8_GOT_SUPER :
@@ -2298,71 +2423,41 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2298
2423
* caller has indicated that these are to be disallowed and/or
2299
2424
* warned about */
2300
2425
2301
- if (flags & UTF8_WARN_SUPER ) {
2302
- error_flags_return |= UTF8_GOT_SUPER ;
2426
+ non_cp_format = "Any UTF-8 sequence that starts with \"%s\""
2427
+ " is for a non-Unicode code point, may not be"
2428
+ " portable" ;
2303
2429
2304
- if (NEED_MESSAGE (WARN_NON_UNICODE ,,)) {
2305
- pack_warn = packWARN (WARN_NON_UNICODE );
2430
+ /* We can include the code point in the message if we have an
2431
+ * exact one (input not too short) */
2432
+ cp_format = (orig_problems & (UTF8_GOT_TOO_SHORT ))
2433
+ ? NULL
2434
+ : super_cp_format ;
2306
2435
2307
- if (orig_problems & UTF8_GOT_TOO_SHORT ) {
2308
- message = Perl_form (aTHX_
2309
- "Any UTF-8 sequence that starts with"
2310
- " \"%s\" is for a non-Unicode code point,"
2311
- " may not be portable" ,
2312
- _byte_dump_string (s0 , curlen , 0 ));
2313
- }
2314
- else {
2315
- message = Perl_form (aTHX_ super_cp_format , uv );
2316
- }
2317
- }
2318
- }
2436
+ join_plain_supers :
2319
2437
2320
- /* Test for Perl's extended UTF-8 after the regular SUPER ones,
2321
- * and before possibly bailing out, so that the more dire
2322
- * warning will override the regular one. */
2323
- if (UNLIKELY (UTF8_IS_PERL_EXTENDED (s0 ))) {
2324
- if ( (flags & (UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER ))
2325
- && NEED_MESSAGE (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE ))
2326
- {
2327
- pack_warn = packWARN2 (WARN_NON_UNICODE , WARN_PORTABLE );
2328
-
2329
- /* If it is an overlong that evaluates to a code point
2330
- * that doesn't have to use the Perl extended UTF-8, it
2331
- * still used it, and so we output a message that
2332
- * doesn't refer to the code point. The same is true
2333
- * if there was a SHORT malformation where the code
2334
- * point is not valid. In that case, 'uv' will have
2335
- * been set to the REPLACEMENT CHAR, and the message
2336
- * below without the code point in it will be selected
2337
- * */
2338
- if (UNICODE_IS_PERL_EXTENDED (uv )) {
2339
- message = Perl_form (aTHX_
2340
- PL_extended_cp_format , uv );
2438
+ /* Here this level is to warn, reject, or both. The format has
2439
+ * been set up to be for this level, or maybe the overflow
2440
+ * case set up a more dire warning and jumped to the label just
2441
+ * above (after determining that warning/rejecting here was
2442
+ * enabled). */
2443
+ if (flags & UTF8_WARN_SUPER ) {
2444
+ error_flags_return |= this_flag_bit ;
2445
+ if (NEED_MESSAGE (WARN_NON_UNICODE ,,)) {
2446
+ pack_warn = packWARN (WARN_NON_UNICODE );
2447
+ if (cp_format ) {
2448
+ message = Perl_form (aTHX_ cp_format , uv );
2341
2449
}
2342
2450
else {
2343
2451
message = Perl_form (aTHX_
2344
- "Any UTF-8 sequence that starts with"
2345
- " \"%s\" is a Perl extension, and"
2346
- " so is not portable" ,
2347
- _byte_dump_string (s0 , curlen , 0 ));
2348
- }
2349
- this_flag_bit = UTF8_GOT_PERL_EXTENDED ;
2350
- }
2351
-
2352
- if (flags & ( UTF8_WARN_PERL_EXTENDED
2353
- |UTF8_DISALLOW_PERL_EXTENDED ))
2354
- {
2355
- error_flags_return |= UTF8_GOT_PERL_EXTENDED ;
2356
-
2357
- if (flags & UTF8_DISALLOW_PERL_EXTENDED ) {
2358
- disallowed = TRUE;
2452
+ non_cp_format ,
2453
+ _byte_dump_string (s0 , curlen , 0 ));
2359
2454
}
2360
2455
}
2361
2456
}
2362
2457
2363
2458
if (flags & UTF8_DISALLOW_SUPER ) {
2364
- error_flags_return |= UTF8_GOT_SUPER ;
2365
- disallowed = TRUE ;
2459
+ error_flags_return |= this_flag_bit ;
2460
+ disallowed = true ;
2366
2461
}
2367
2462
2368
2463
break ;
0 commit comments