Skip to content

Commit 8195a34

Browse files
committed
utf8_tu_uv_msgs: Comments, white-space only
This adds extensive comments and adjusts white space a bit.
1 parent a3b31bf commit 8195a34

File tree

1 file changed

+122
-32
lines changed

1 file changed

+122
-32
lines changed

utf8.c

Lines changed: 122 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1591,7 +1591,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
15911591
PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_;
15921592

15931593
const U8 * s = s0;
1594+
1595+
/* The ending position, plus 1, of the first character in the sequence
1596+
* beginning at s0. In other words, 'e', adjusted down to to be no more
1597+
* than a single character */
15941598
const U8 * send = e;
1599+
15951600
SSize_t curlen = send - s0;
15961601
U32 possible_problems; /* A bit is set here for each potential problem
15971602
found as we go along */
@@ -1603,10 +1608,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
16031608

16041609
dTHX;
16051610

1606-
/* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1607-
* non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1608-
* syllables that the dfa doesn't properly handle. Quickly dispose of the
1609-
* final case. */
1611+
/* Here, is one of:
1612+
* a) malformed;
1613+
* b) a problematic code point (surrogate, non-unicode, or nonchar); or
1614+
* c) on ASCII platforms, one of the Hangul syllables that the dfa
1615+
* doesn't properly handle. Quickly dispose of the final case.
1616+
*/
16101617

16111618
/* Assume will be successful; override later if necessary */
16121619
if (UNLIKELY(errors)) {
@@ -1652,7 +1659,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
16521659
errors = &discard_errors;
16531660
}
16541661

1655-
/* The order of malformation tests here is important. We should consume as
1662+
/* Accumulate the code point translation of the input byte sequence
1663+
* s0 .. e-1, looking for malformations.
1664+
*
1665+
* The order of malformation tests here is important. We should consume as
16561666
* few bytes as possible in order to not skip any valid character. This is
16571667
* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
16581668
* https://unicode.org/reports/tr36 for more discussion as to why. For
@@ -1711,8 +1721,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
17111721

17121722
/* Here is not a continuation byte, nor an invariant. The only thing left
17131723
* is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1714-
* because it excludes start bytes like \xC0 that always lead to
1715-
* overlongs.) */
1724+
* to check for sure because it excludes start bytes like \xC0 that always
1725+
* lead to overlongs.) */
17161726

17171727
/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
17181728
* that indicate the number of bytes in the character's whole UTF-8
@@ -1874,24 +1884,30 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
18741884
}
18751885
}
18761886
}
1877-
}
1887+
} /* End of checking if is a special code point */
18781888

18791889
ready_to_handle_errors: ;
18801890

18811891
/* At this point:
1882-
* curlen contains the number of bytes in the sequence that
1883-
* this call should advance the input by.
1884-
* avail_len gives the available number of bytes passed in, but
1885-
* only if this is less than the expected number of
1886-
* bytes, based on the code point's start byte.
1892+
* s0 points to the first byte of the character
1893+
* expectlen gives the number of bytes that the character is
1894+
* expected to occupy, based on the value of the
1895+
* presumed start byte in s0. This will be 0 if the
1896+
* sequence is empty, or 1 if s0 isn't actually a
1897+
* start byte.
1898+
* avail_len gives the number of bytes in the sequence this
1899+
* call can look at, one character's worth at most.
1900+
* curlen gives the number of bytes in the sequence that
1901+
* this call actually looked at. This is returned to
1902+
* the caller as the value they should advance the
1903+
* input by for the next call to this function.
18871904
* possible_problems is 0 if there weren't any problems; otherwise a bit
18881905
* is set in it for each potential problem found.
18891906
* uv contains the code point the input sequence
18901907
* represents; or if there is a problem that prevents
18911908
* a well-defined value from being computed, it is
18921909
* some substitute value, typically the REPLACEMENT
18931910
* CHARACTER.
1894-
* s0 points to the first byte of the character
18951911
* s points to just after where we left off processing
18961912
* the character
18971913
* send points to just after where that character should
@@ -1902,20 +1918,86 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
19021918
bool success = true;
19031919

19041920
if (UNLIKELY(possible_problems)) {
1921+
1922+
/* Here, the input sequence is potentially problematic. The code here
1923+
* determines if that is indeed the case and how to handle it. The
1924+
* possible outcomes are:
1925+
* 1) substituting the Unicode REPLACEMENT CHARACTER as the
1926+
* translation for this input sequence; and/or
1927+
* 2) returning information about the problem to the caller in
1928+
* *errors and/or *msgs; and/or
1929+
* 3) raising appropriate warnings.
1930+
*
1931+
* There are two main categories of potential problems.
1932+
*
1933+
* a) One type is by default not considered to be a problem. These
1934+
* are for when the input was syntactically valid
1935+
* Perl-extended-UTF-8 for a code point that is representable on
1936+
* this platform, but that code point isn't considered by Unicode
1937+
* to be freely exchangeable between applications. To get here,
1938+
* code earlier in this function has determined both that this
1939+
* sequence is for such a code point, and that the 'flags'
1940+
* parameter indicates that these are to be considered
1941+
* problematic, meaning this sequence should be rejected, merely
1942+
* warned about, or both. *errors will be set for each of these.
1943+
*
1944+
* If the caller to this function has set the corresponding
1945+
* DISALLOW bit in 'flags', the translation of this sequence will
1946+
* be the Unicode REPLACEMENT CHARACTER.
1947+
*
1948+
* If the caller to this function has set the corresponding WARN
1949+
* bit in 'flags' potentially a warning message will be generated,
1950+
* using the rules common to both types of problems, and detailed
1951+
* below.
1952+
*
1953+
* b) The other type is considered by default to be problematic.
1954+
* There are three subclasses:
1955+
* 1) Some syntactic malformation meant that no code point could
1956+
* be calculated for the input. An example is that the
1957+
* sequence was incomplete, more bytes were called for than
1958+
* the input contained. The function returns the Unicode
1959+
* REPLACEMENT CHARACTER as the translation of these.
1960+
* 2) The sequence is legal Perl extended UTF-8, but is for a
1961+
* code point too large to be represented on this platform.
1962+
* The function returns the Unicode REPLACEMENT CHARACTER as
1963+
* the translation of these.
1964+
* 3) The sequence represents a code point which can also be
1965+
* represented by a shorter sequence. These have been
1966+
* declared illegal by Unicode fiat because they were being
1967+
* used as Trojan horses to successfully attack applications.
1968+
* One undocumented flag causes these to be accepted, but
1969+
* otherwise the function returns the Unicode REPLACEMENT
1970+
* CHARACTER as the translation of these.
1971+
*
1972+
* In all cases the corresponding bit in *errors is set. This is
1973+
* in contrast to the other type of problem where the input
1974+
* 'flags' affect if the bit is set or not.
1975+
*
1976+
* The default is to generate a warning for each of these. If the
1977+
* input 'flags' has a corresponding ALLOW flag, warnings are
1978+
* suppressed. The only other thing the ALLOW flags do is
1979+
* determine if the function returns sucess or failure
1980+
*
1981+
* For both types of problems, if warnings are called for by the input
1982+
* flags, also setting the UTF8_CHECK_ONLY flag overrides
1983+
* generating them. If 'msgs' is not NULL, they all will be returned
1984+
* there; otherwise they will be raised if warnings are enabled.
1985+
*/
1986+
19051987
bool disallowed = FALSE;
19061988
const U32 orig_problems = possible_problems;
19071989

1908-
/* Returns 0 if no message needs to be generated for this problem even
1909-
* if everything else says to. Otherwise returns the warning category
1910-
* to use for the message.
1990+
/* The following macro returns 0 if no message needs to be generated
1991+
* for this problem even if everything else says to. Otherwise returns
1992+
* the warning category to use for the message..
19111993
*
19121994
* No message need be generated if the UTF8_CHECK_ONLY flag has been
19131995
* set by the caller. Otherwise, a message should be generated if
19141996
* either:
19151997
* 1) the caller has furnished a structure into which messages should
19161998
* be returned to it (so it itself can decide what to do); or
1917-
* 2) warnings are enabled for either of the category parameters to the
1918-
* macro.
1999+
* 2) warnings are enabled for either of the category parameters to
2000+
* the macro.
19192001
*
19202002
* The 'warning' parameter is the higher priority warning category to
19212003
* check. The macro calls ckWARN_d(warning), so warnings for it are
@@ -1940,21 +2022,29 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
19402022
while (possible_problems) { /* Handle each possible problem */
19412023
char * message = NULL;
19422024

1943-
/* Each 'case' handles one problem given by a bit in
1944-
* 'possible_problems'. The lowest bit positions, as #defined in
1945-
* utf8.h, are are handled first. Some of the ordering is
1946-
* important so that higher priority items are done before lower
1947-
* ones; some of which may depend on earlier actions. Also the
1948-
* ordering tries to cause any messages to be displayed in kind of
1949-
* decreasing severity order. But the overlong must come last, as
1950-
* it changes 'uv' looked at by the others */
2025+
/* The lowest bit positions, as #defined in utf8.h, are handled
2026+
* first. Some of the ordering is important so that higher
2027+
* priority items are done before lower ones; some of which may
2028+
* depend on earlier actions. Also the ordering tries to cause any
2029+
* messages to be displayed in kind of decreasing severity order.
2030+
* But the overlong must come last, as it changes 'uv' looked at by
2031+
* the others */
19512032

19522033
U32 this_problem = 1U << lsbit_pos32(possible_problems);
19532034

19542035
U32 this_flag_bit = this_problem;
19552036

2037+
/* Turn off so next iteration doesn't retry this */
19562038
possible_problems &= ~this_problem;
19572039

2040+
/* The code is structured so that there is a case: in a switch()
2041+
* for each problem type, so as to handle the different details of
2042+
* each. The only common part after setting things up is the
2043+
* handling of any generated warning message. That means that if a
2044+
* case: finds there is no message, it can 'continue' to the next
2045+
* loop iteration instead of doing a 'break', whose only purpose
2046+
* would be to handle the message. */
2047+
19582048
/* Most case:s use this; overridden in a few */
19592049
U32 pack_warn = packWARN(WARN_UTF8);
19602050

@@ -2088,8 +2178,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
20882178
if (NEED_MESSAGE(WARN_NONCHAR,,)) {
20892179
/* The code above should have guaranteed that we don't
20902180
* get here with errors other than overlong */
2091-
assert (! (orig_problems
2092-
& ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2181+
assert (! ( orig_problems
2182+
& ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
20932183

20942184
pack_warn = packWARN(WARN_NONCHAR);
20952185
message = Perl_form(aTHX_ nonchar_cp_format, uv);
@@ -2301,8 +2391,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23012391
}
23022392

23032393
av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2304-
pack_warn,
2305-
this_flag_bit)));
2394+
pack_warn,
2395+
this_flag_bit)));
23062396
}
23072397
else if (PL_op)
23082398
Perl_warner(aTHX_ pack_warn, "%s in %s", message,
@@ -2323,7 +2413,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23232413
success = false;
23242414
uv = UNICODE_REPLACEMENT;
23252415
}
2326-
}
2416+
} /* End of there was a possible problem */
23272417

23282418
*cp_p = UNI_TO_NATIVE(uv);
23292419
return success;

0 commit comments

Comments
 (0)