@@ -1591,7 +1591,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1591
1591
PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_ ;
1592
1592
1593
1593
const U8 * s = s0 ;
1594
+
1595
+ /* The ending position, plus 1, of the first character in the sequence
1596
+ * beginning at s0. In other words, 'e', adjusted down to to be no more
1597
+ * than a single character */
1594
1598
const U8 * send = e ;
1599
+
1595
1600
SSize_t curlen = send - s0 ;
1596
1601
U32 possible_problems ; /* A bit is set here for each potential problem
1597
1602
found as we go along */
@@ -1603,10 +1608,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1603
1608
1604
1609
dTHX ;
1605
1610
1606
- /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1607
- * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1608
- * syllables that the dfa doesn't properly handle. Quickly dispose of the
1609
- * final case. */
1611
+ /* Here, is one of:
1612
+ * a) malformed;
1613
+ * b) a problematic code point (surrogate, non-unicode, or nonchar); or
1614
+ * c) on ASCII platforms, one of the Hangul syllables that the dfa
1615
+ * doesn't properly handle. Quickly dispose of the final case.
1616
+ */
1610
1617
1611
1618
/* Assume will be successful; override later if necessary */
1612
1619
if (UNLIKELY (errors )) {
@@ -1652,7 +1659,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1652
1659
errors = & discard_errors ;
1653
1660
}
1654
1661
1655
- /* The order of malformation tests here is important. We should consume as
1662
+ /* Accumulate the code point translation of the input byte sequence
1663
+ * s0 .. e-1, looking for malformations.
1664
+ *
1665
+ * The order of malformation tests here is important. We should consume as
1656
1666
* few bytes as possible in order to not skip any valid character. This is
1657
1667
* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1658
1668
* https://unicode.org/reports/tr36 for more discussion as to why. For
@@ -1711,8 +1721,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1711
1721
1712
1722
/* Here is not a continuation byte, nor an invariant. The only thing left
1713
1723
* is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1714
- * because it excludes start bytes like \xC0 that always lead to
1715
- * overlongs.) */
1724
+ * to check for sure because it excludes start bytes like \xC0 that always
1725
+ * lead to overlongs.) */
1716
1726
1717
1727
/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1718
1728
* that indicate the number of bytes in the character's whole UTF-8
@@ -1874,24 +1884,30 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1874
1884
}
1875
1885
}
1876
1886
}
1877
- }
1887
+ } /* End of checking if is a special code point */
1878
1888
1879
1889
ready_to_handle_errors : ;
1880
1890
1881
1891
/* At this point:
1882
- * curlen contains the number of bytes in the sequence that
1883
- * this call should advance the input by.
1884
- * avail_len gives the available number of bytes passed in, but
1885
- * only if this is less than the expected number of
1886
- * bytes, based on the code point's start byte.
1892
+ * s0 points to the first byte of the character
1893
+ * expectlen gives the number of bytes that the character is
1894
+ * expected to occupy, based on the value of the
1895
+ * presumed start byte in s0. This will be 0 if the
1896
+ * sequence is empty, or 1 if s0 isn't actually a
1897
+ * start byte.
1898
+ * avail_len gives the number of bytes in the sequence this
1899
+ * call can look at, one character's worth at most.
1900
+ * curlen gives the number of bytes in the sequence that
1901
+ * this call actually looked at. This is returned to
1902
+ * the caller as the value they should advance the
1903
+ * input by for the next call to this function.
1887
1904
* possible_problems is 0 if there weren't any problems; otherwise a bit
1888
1905
* is set in it for each potential problem found.
1889
1906
* uv contains the code point the input sequence
1890
1907
* represents; or if there is a problem that prevents
1891
1908
* a well-defined value from being computed, it is
1892
1909
* some substitute value, typically the REPLACEMENT
1893
1910
* CHARACTER.
1894
- * s0 points to the first byte of the character
1895
1911
* s points to just after where we left off processing
1896
1912
* the character
1897
1913
* send points to just after where that character should
@@ -1902,20 +1918,86 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1902
1918
bool success = true;
1903
1919
1904
1920
if (UNLIKELY (possible_problems )) {
1921
+
1922
+ /* Here, the input sequence is potentially problematic. The code here
1923
+ * determines if that is indeed the case and how to handle it. The
1924
+ * possible outcomes are:
1925
+ * 1) substituting the Unicode REPLACEMENT CHARACTER as the
1926
+ * translation for this input sequence; and/or
1927
+ * 2) returning information about the problem to the caller in
1928
+ * *errors and/or *msgs; and/or
1929
+ * 3) raising appropriate warnings.
1930
+ *
1931
+ * There are two main categories of potential problems.
1932
+ *
1933
+ * a) One type is by default not considered to be a problem. These
1934
+ * are for when the input was syntactically valid
1935
+ * Perl-extended-UTF-8 for a code point that is representable on
1936
+ * this platform, but that code point isn't considered by Unicode
1937
+ * to be freely exchangeable between applications. To get here,
1938
+ * code earlier in this function has determined both that this
1939
+ * sequence is for such a code point, and that the 'flags'
1940
+ * parameter indicates that these are to be considered
1941
+ * problematic, meaning this sequence should be rejected, merely
1942
+ * warned about, or both. *errors will be set for each of these.
1943
+ *
1944
+ * If the caller to this function has set the corresponding
1945
+ * DISALLOW bit in 'flags', the translation of this sequence will
1946
+ * be the Unicode REPLACEMENT CHARACTER.
1947
+ *
1948
+ * If the caller to this function has set the corresponding WARN
1949
+ * bit in 'flags' potentially a warning message will be generated,
1950
+ * using the rules common to both types of problems, and detailed
1951
+ * below.
1952
+ *
1953
+ * b) The other type is considered by default to be problematic.
1954
+ * There are three subclasses:
1955
+ * 1) Some syntactic malformation meant that no code point could
1956
+ * be calculated for the input. An example is that the
1957
+ * sequence was incomplete, more bytes were called for than
1958
+ * the input contained. The function returns the Unicode
1959
+ * REPLACEMENT CHARACTER as the translation of these.
1960
+ * 2) The sequence is legal Perl extended UTF-8, but is for a
1961
+ * code point too large to be represented on this platform.
1962
+ * The function returns the Unicode REPLACEMENT CHARACTER as
1963
+ * the translation of these.
1964
+ * 3) The sequence represents a code point which can also be
1965
+ * represented by a shorter sequence. These have been
1966
+ * declared illegal by Unicode fiat because they were being
1967
+ * used as Trojan horses to successfully attack applications.
1968
+ * One undocumented flag causes these to be accepted, but
1969
+ * otherwise the function returns the Unicode REPLACEMENT
1970
+ * CHARACTER as the translation of these.
1971
+ *
1972
+ * In all cases the corresponding bit in *errors is set. This is
1973
+ * in contrast to the other type of problem where the input
1974
+ * 'flags' affect if the bit is set or not.
1975
+ *
1976
+ * The default is to generate a warning for each of these. If the
1977
+ * input 'flags' has a corresponding ALLOW flag, warnings are
1978
+ * suppressed. The only other thing the ALLOW flags do is
1979
+ * determine if the function returns sucess or failure
1980
+ *
1981
+ * For both types of problems, if warnings are called for by the input
1982
+ * flags, also setting the UTF8_CHECK_ONLY flag overrides
1983
+ * generating them. If 'msgs' is not NULL, they all will be returned
1984
+ * there; otherwise they will be raised if warnings are enabled.
1985
+ */
1986
+
1905
1987
bool disallowed = FALSE;
1906
1988
const U32 orig_problems = possible_problems ;
1907
1989
1908
- /* Returns 0 if no message needs to be generated for this problem even
1909
- * if everything else says to. Otherwise returns the warning category
1910
- * to use for the message.
1990
+ /* The following macro returns 0 if no message needs to be generated
1991
+ * for this problem even if everything else says to. Otherwise returns
1992
+ * the warning category to use for the message. .
1911
1993
*
1912
1994
* No message need be generated if the UTF8_CHECK_ONLY flag has been
1913
1995
* set by the caller. Otherwise, a message should be generated if
1914
1996
* either:
1915
1997
* 1) the caller has furnished a structure into which messages should
1916
1998
* be returned to it (so it itself can decide what to do); or
1917
- * 2) warnings are enabled for either of the category parameters to the
1918
- * macro.
1999
+ * 2) warnings are enabled for either of the category parameters to
2000
+ * the macro.
1919
2001
*
1920
2002
* The 'warning' parameter is the higher priority warning category to
1921
2003
* check. The macro calls ckWARN_d(warning), so warnings for it are
@@ -1940,21 +2022,29 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1940
2022
while (possible_problems ) { /* Handle each possible problem */
1941
2023
char * message = NULL ;
1942
2024
1943
- /* Each 'case' handles one problem given by a bit in
1944
- * 'possible_problems'. The lowest bit positions, as #defined in
1945
- * utf8.h, are are handled first. Some of the ordering is
1946
- * important so that higher priority items are done before lower
1947
- * ones; some of which may depend on earlier actions. Also the
1948
- * ordering tries to cause any messages to be displayed in kind of
1949
- * decreasing severity order. But the overlong must come last, as
1950
- * it changes 'uv' looked at by the others */
2025
+ /* The lowest bit positions, as #defined in utf8.h, are handled
2026
+ * first. Some of the ordering is important so that higher
2027
+ * priority items are done before lower ones; some of which may
2028
+ * depend on earlier actions. Also the ordering tries to cause any
2029
+ * messages to be displayed in kind of decreasing severity order.
2030
+ * But the overlong must come last, as it changes 'uv' looked at by
2031
+ * the others */
1951
2032
1952
2033
U32 this_problem = 1U << lsbit_pos32 (possible_problems );
1953
2034
1954
2035
U32 this_flag_bit = this_problem ;
1955
2036
2037
+ /* Turn off so next iteration doesn't retry this */
1956
2038
possible_problems &= ~this_problem ;
1957
2039
2040
+ /* The code is structured so that there is a case: in a switch()
2041
+ * for each problem type, so as to handle the different details of
2042
+ * each. The only common part after setting things up is the
2043
+ * handling of any generated warning message. That means that if a
2044
+ * case: finds there is no message, it can 'continue' to the next
2045
+ * loop iteration instead of doing a 'break', whose only purpose
2046
+ * would be to handle the message. */
2047
+
1958
2048
/* Most case:s use this; overridden in a few */
1959
2049
U32 pack_warn = packWARN (WARN_UTF8 );
1960
2050
@@ -2088,8 +2178,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2088
2178
if (NEED_MESSAGE (WARN_NONCHAR ,,)) {
2089
2179
/* The code above should have guaranteed that we don't
2090
2180
* get here with errors other than overlong */
2091
- assert (! (orig_problems
2092
- & ~(UTF8_GOT_LONG |UTF8_GOT_NONCHAR )));
2181
+ assert (! ( orig_problems
2182
+ & ~(UTF8_GOT_LONG |UTF8_GOT_NONCHAR )));
2093
2183
2094
2184
pack_warn = packWARN (WARN_NONCHAR );
2095
2185
message = Perl_form (aTHX_ nonchar_cp_format , uv );
@@ -2301,8 +2391,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2301
2391
}
2302
2392
2303
2393
av_push (* msgs , newRV_noinc ((SV * ) new_msg_hv (message ,
2304
- pack_warn ,
2305
- this_flag_bit )));
2394
+ pack_warn ,
2395
+ this_flag_bit )));
2306
2396
}
2307
2397
else if (PL_op )
2308
2398
Perl_warner (aTHX_ pack_warn , "%s in %s" , message ,
@@ -2323,7 +2413,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2323
2413
success = false;
2324
2414
uv = UNICODE_REPLACEMENT ;
2325
2415
}
2326
- }
2416
+ } /* End of there was a possible problem */
2327
2417
2328
2418
* cp_p = UNI_TO_NATIVE (uv );
2329
2419
return success ;
0 commit comments