Skip to content

Commit a3b43aa

Browse files
committed
utf8.c: Generalize returns from 2 static functions
Previously these functions returned -1, 0, or 1. Now the +1 is replaced by a positive value giving the number of bytes that had to be examined to determine that the sequence is overlong; future commits will take advantage of this.
1 parent c093154 commit a3b43aa

File tree

1 file changed

+30
-20
lines changed

1 file changed

+30
-20
lines changed

utf8.c

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -475,18 +475,22 @@ PERL_STATIC_INLINE int
475475
S_is_utf8_overlong(const U8 * const s, const STRLEN len)
476476
{
477477
/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
478-
* 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
479-
* it isn't, and -1 if there isn't enough information to tell. This last
480-
* return value can happen if the sequence is incomplete, missing some
481-
* trailing bytes that would form a complete character. If there are
482-
* enough bytes to make a definitive decision, this function does so.
483-
* Usually 2 bytes are sufficient.
478+
* 's' + 'len' - 1 is an overlong. It returns a positive number if it is
479+
* an overlong; 0 if it isn't, and -1 if there isn't enough information to
480+
* tell. This last return value can happen if the sequence is incomplete,
481+
* missing some trailing bytes that would form a complete character. If
482+
* there are enough bytes to make a definitive decision, this function does
483+
* so.
484484
*
485-
* Overlongs can occur whenever the number of continuation bytes changes.
486-
* That means whenever the number of leading 1 bits in a start byte
487-
* increases from the next lower start byte. That happens for start bytes
488-
* C0, E0, F0, F8, FC, FE, and FF.
489-
*/
485+
* The positive number returned when it is overlong is how many bytes
486+
* needed to be examined to make that determination. Usually 1 or 2 bytes
487+
* are sufficient.
488+
*
489+
* Overlongs can occur for a few of the smallest start bytes or whenever
490+
* the number of continuation bytes changes. The latter means whenever the
491+
* number of leading 1 bits in a start byte increases from the next lower
492+
* start byte. That happens for start bytes C0, E0, F0, F8, FC, FE, and
493+
* FF. */
490494

491495
PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
492496

@@ -512,7 +516,7 @@ S_is_utf8_overlong(const U8 * const s, const STRLEN len)
512516
return 1;
513517
#else
514518
case 0xE0:
515-
return (len < 2) ? -1 : s[1] < 0xA0;
519+
return (len < 2) ? -1 : (s[1] < 0xA0) ? 2 : 0;
516520
#endif
517521

518522
case 0xF0:
@@ -522,8 +526,10 @@ S_is_utf8_overlong(const U8 * const s, const STRLEN len)
522526
return (len < 2)
523527
? -1 /* This pattern encapsulates
524528
* F0 => 0x10; F8 => 0x08; FC => 0x04; FE => 0x02 */
525-
: NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE
526-
+ 0x100 - NATIVE_UTF8_TO_I8(s[0]);
529+
: (NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE
530+
+ 0x100 - NATIVE_UTF8_TO_I8(s[0]))
531+
? 2
532+
: 0;
527533
case 0xFF:
528534
return isFF_overlong(s, len);
529535
}
@@ -533,11 +539,15 @@ PERL_STATIC_INLINE int
533539
S_isFF_overlong(const U8 * const s, const STRLEN len)
534540
{
535541
/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
536-
* 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
537-
* it isn't, and -1 if there isn't enough information to tell. This last
538-
* return value can happen if the sequence is incomplete, missing some
539-
* trailing bytes that would form a complete character. If there are
540-
* enough bytes to make a definitive decision, this function does so. */
542+
* 's' + 'len' - 1 is an overlong beginning with \xFF. It returns a positive
543+
* number if it is; 0 if it isn't, and -1 if there isn't enough
544+
* information to tell. This last return value can happen if the sequence
545+
* is incomplete, missing some trailing bytes that would form a complete
546+
* character. If there are enough bytes to make a definitive decision,
547+
* this function does so.
548+
*
549+
* A positive return gives the number of bytes needed to be examined to
550+
* make the determination. */
541551

542552
PERL_ARGS_ASSERT_ISFF_OVERLONG;
543553

@@ -560,7 +570,7 @@ S_isFF_overlong(const U8 * const s, const STRLEN len)
560570
* be there; what comes after them doesn't matter. See tables in utf8.h,
561571
* utfebcdic.h. */
562572
if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
563-
return 1;
573+
return STRLENs(FF_OVERLONG_PREFIX);
564574
}
565575

566576
/* The missing bytes could cause the result to go one way or the other, so

0 commit comments

Comments
 (0)