@@ -475,18 +475,22 @@ PERL_STATIC_INLINE int
475
475
S_is_utf8_overlong (const U8 * const s , const STRLEN len )
476
476
{
477
477
/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
478
- * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
479
- * it isn't, and -1 if there isn't enough information to tell. This last
480
- * return value can happen if the sequence is incomplete, missing some
481
- * trailing bytes that would form a complete character. If there are
482
- * enough bytes to make a definitive decision, this function does so.
483
- * Usually 2 bytes are sufficient .
478
+ * 's' + 'len' - 1 is an overlong. It returns a positive number if it is
479
+ * an overlong; 0 if it isn't, and -1 if there isn't enough information to
480
+ * tell. This last return value can happen if the sequence is incomplete,
481
+ * missing some trailing bytes that would form a complete character. If
482
+ * there are enough bytes to make a definitive decision, this function does
483
+ * so .
484
484
*
485
- * Overlongs can occur whenever the number of continuation bytes changes.
486
- * That means whenever the number of leading 1 bits in a start byte
487
- * increases from the next lower start byte. That happens for start bytes
488
- * C0, E0, F0, F8, FC, FE, and FF.
489
- */
485
+ * The positive number returned when it is overlong is how many bytes
486
+ * needed to be examined to make that determination. Usually 1 or 2 bytes
487
+ * are sufficient.
488
+ *
489
+ * Overlongs can occur for a few of the smallest start bytes or whenever
490
+ * the number of continuation bytes changes. The latter means whenever the
491
+ * number of leading 1 bits in a start byte increases from the next lower
492
+ * start byte. That happens for start bytes C0, E0, F0, F8, FC, FE, and
493
+ * FF. */
490
494
491
495
PERL_ARGS_ASSERT_IS_UTF8_OVERLONG ;
492
496
@@ -512,7 +516,7 @@ S_is_utf8_overlong(const U8 * const s, const STRLEN len)
512
516
return 1 ;
513
517
#else
514
518
case 0xE0 :
515
- return (len < 2 ) ? -1 : s [1 ] < 0xA0 ;
519
+ return (len < 2 ) ? -1 : ( s [1 ] < 0xA0 ) ? 2 : 0 ;
516
520
#endif
517
521
518
522
case 0xF0 :
@@ -522,8 +526,10 @@ S_is_utf8_overlong(const U8 * const s, const STRLEN len)
522
526
return (len < 2 )
523
527
? -1 /* This pattern encapsulates
524
528
* F0 => 0x10; F8 => 0x08; FC => 0x04; FE => 0x02 */
525
- : NATIVE_UTF8_TO_I8 (s [1 ]) < UTF_MIN_CONTINUATION_BYTE
526
- + 0x100 - NATIVE_UTF8_TO_I8 (s [0 ]);
529
+ : (NATIVE_UTF8_TO_I8 (s [1 ]) < UTF_MIN_CONTINUATION_BYTE
530
+ + 0x100 - NATIVE_UTF8_TO_I8 (s [0 ]))
531
+ ? 2
532
+ : 0 ;
527
533
case 0xFF :
528
534
return isFF_overlong (s , len );
529
535
}
@@ -533,11 +539,15 @@ PERL_STATIC_INLINE int
533
539
S_isFF_overlong (const U8 * const s , const STRLEN len )
534
540
{
535
541
/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
536
- * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
537
- * it isn't, and -1 if there isn't enough information to tell. This last
538
- * return value can happen if the sequence is incomplete, missing some
539
- * trailing bytes that would form a complete character. If there are
540
- * enough bytes to make a definitive decision, this function does so. */
542
+ * 's' + 'len' - 1 is an overlong beginning with \xFF. It returns a positive
543
+ * number if it is; 0 if it isn't, and -1 if there isn't enough
544
+ * information to tell. This last return value can happen if the sequence
545
+ * is incomplete, missing some trailing bytes that would form a complete
546
+ * character. If there are enough bytes to make a definitive decision,
547
+ * this function does so.
548
+ *
549
+ * A positive return gives the number of bytes needed to be examined to
550
+ * make the determination. */
541
551
542
552
PERL_ARGS_ASSERT_ISFF_OVERLONG ;
543
553
@@ -560,7 +570,7 @@ S_isFF_overlong(const U8 * const s, const STRLEN len)
560
570
* be there; what comes after them doesn't matter. See tables in utf8.h,
561
571
* utfebcdic.h. */
562
572
if (len >= STRLENs (FF_OVERLONG_PREFIX )) {
563
- return 1 ;
573
+ return STRLENs ( FF_OVERLONG_PREFIX ) ;
564
574
}
565
575
566
576
/* The missing bytes could cause the result to go one way or the other, so
0 commit comments