diff --git a/mypyc/lib-rt/CPy.h b/mypyc/lib-rt/CPy.h
index bdf3e0130a4c..3ec17999d512 100644
--- a/mypyc/lib-rt/CPy.h
+++ b/mypyc/lib-rt/CPy.h
@@ -756,6 +756,8 @@ PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors);
 Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start);
 Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end);
 CPyTagged CPyStr_Ord(PyObject *obj);
+PyObject *CPyStr_Lower(PyObject *self);
+PyObject *CPyStr_Upper(PyObject *self);
 
 
 // Bytes operations
diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c
index 210172c57497..df9d36c21c93 100644
--- a/mypyc/lib-rt/str_ops.c
+++ b/mypyc/lib-rt/str_ops.c
@@ -546,3 +546,59 @@ CPyTagged CPyStr_Ord(PyObject *obj) {
         PyExc_TypeError, "ord() expected a character, but a string of length %zd found", s);
     return CPY_INT_TAG;
 }
+
+// Primitive for str.lower().
+//
+// Returns a new reference, or NULL with an exception set on failure.
+PyObject *CPyStr_Lower(PyObject *self) {
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+
+    // General Unicode path: delegate to str.lower(). Full Unicode case mapping
+    // can change the length of the string (U+0130 lowers to two code points),
+    // can produce code points that need narrower storage than the input uses
+    // (U+212A lowers to ASCII 'k'), and depends on context (final sigma), so a
+    // per-character loop writing into a buffer shaped like the input is wrong.
+    if (!PyUnicode_IS_ASCII(self))
+        return PyObject_CallMethod((PyObject *)&PyUnicode_Type, "lower", "(O)", self);
+
+    // Fast path: ASCII only. The result is ASCII and has the same length.
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    PyObject *res = PyUnicode_New(len, 127);
+    if (res == NULL)
+        return NULL;
+    const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
+    Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
+    for (Py_ssize_t i = 0; i < len; i++) {
+        res_data[i] = Py_TOLOWER((unsigned char) data[i]);
+    }
+    return res;
+}
+
+// Primitive for str.upper().
+//
+// Returns a new reference, or NULL with an exception set on failure.
+PyObject *CPyStr_Upper(PyObject *self) {
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+
+    // General Unicode path: delegate to str.upper(). Full Unicode case mapping
+    // can change the length of the string (U+00DF uppers to "SS") and can
+    // produce code points that need wider storage than the input uses (U+00FF
+    // uppers to U+0178), so a per-character loop writing into a buffer shaped
+    // like the input is wrong.
+    if (!PyUnicode_IS_ASCII(self))
+        return PyObject_CallMethod((PyObject *)&PyUnicode_Type, "upper", "(O)", self);
+
+    // Fast path: ASCII only. The result is ASCII and has the same length.
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    PyObject *res = PyUnicode_New(len, 127);
+    if (res == NULL)
+        return NULL;
+    const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
+    Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
+    for (Py_ssize_t i = 0; i < len; i++) {
+        res_data[i] = Py_TOUPPER((unsigned char) data[i]);
+    }
+    return res;
+}
diff --git a/mypyc/primitives/str_ops.py b/mypyc/primitives/str_ops.py
index 9d46da9c3514..b55ed5be3e53 100644
--- a/mypyc/primitives/str_ops.py
+++ b/mypyc/primitives/str_ops.py
@@ -428,3 +428,21 @@
     c_function_name="CPyStr_Ord",
     error_kind=ERR_MAGIC,
 )
+
+# str.lower()
+method_op(
+    name="lower",
+    arg_types=[str_rprimitive],
+    return_type=str_rprimitive,
+    c_function_name="CPyStr_Lower",
+    error_kind=ERR_MAGIC,
+)
+
+# str.upper()
+method_op(
+    name="upper",
+    arg_types=[str_rprimitive],
+    return_type=str_rprimitive,
+    c_function_name="CPyStr_Upper",
+    error_kind=ERR_MAGIC,
+)
diff --git a/mypyc/test-data/fixtures/ir.py b/mypyc/test-data/fixtures/ir.py
index 532cbbc06177..5b90ca00a51c 100644
--- a/mypyc/test-data/fixtures/ir.py
+++ b/mypyc/test-data/fixtures/ir.py
@@ -112,7 +112,6 @@ def lstrip(self, item: Optional[str] = None) -> str: pass
     def rstrip(self, item: Optional[str] = None) -> str: pass
     def join(self, x: Iterable[str]) -> str: pass
     def format(self, *args: Any, **kwargs: Any) -> str: ...
-    def upper(self) -> str: ...
     def startswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
     def endswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
     def replace(self, old: str, new: str, maxcount: int=...) -> str: ...
@@ -122,6 +121,8 @@ def rpartition(self, sep: str, /) -> Tuple[str, str, str]: ...
     def removeprefix(self, prefix: str, /) -> str: ...
     def removesuffix(self, suffix: str, /) -> str: ...
     def islower(self) -> bool: ...
+    def lower(self) -> str: ...
+    def upper(self) -> str: ...
 
 class float:
     def __init__(self, x: object) -> None: pass
diff --git a/mypyc/test-data/irbuild-str.test b/mypyc/test-data/irbuild-str.test
index 2bf77a6cb556..1bc4fa25fb37 100644
--- a/mypyc/test-data/irbuild-str.test
+++ b/mypyc/test-data/irbuild-str.test
@@ -562,3 +562,23 @@ L0:
     r3 = box(native_int, r1)
     r4 = unbox(int, r3)
     return r4
+
+[case testLower]
+def do_lower(s: str) -> str:
+    return s.lower()
+[out]
+def do_lower(s):
+    s, r0 :: str
+L0:
+    r0 = CPyStr_Lower(s)
+    return r0
+
+[case testUpper]
+def do_upper(s: str) -> str:
+    return s.upper()
+[out]
+def do_upper(s):
+    s, r0 :: str
+L0:
+    r0 = CPyStr_Upper(s)
+    return r0
diff --git a/mypyc/test-data/run-strings.test b/mypyc/test-data/run-strings.test
index 074e56f9068a..f9edd98b4200 100644
--- a/mypyc/test-data/run-strings.test
+++ b/mypyc/test-data/run-strings.test
@@ -906,3 +906,33 @@ def test_count_multi_start_end_emoji() -> None:
     assert string.count("😴😴😴", 0, 12) == 1, string.count("😴😴😴", 0, 12)
     assert string.count("🚀🚀🚀", 0, 12) == 2, string.count("🚀🚀🚀", 0, 12)
     assert string.count("ñññ", 0, 12) == 1, string.count("ñññ", 0, 12)
+
+[case testLower]
+def test_str_lower() -> None:
+    assert "".lower() == ""
+    assert "ABC".lower() == "abc"
+    assert "abc".lower() == "abc"
+    assert "AbC123".lower() == "abc123"
+    assert "áÉÍ".lower() == "áéí"
+    assert "😴🚀".lower() == "😴🚀"
+    # Special
+    assert "SS".lower() == "ss"
+    assert "Σ".lower() == "σ"  # Greek capital sigma -> small sigma
+    assert "\u0391\u03a3".lower() == "\u03b1\u03c2"  # Word-final capital sigma -> final small sigma
+    assert "İ".lower() == "i\u0307"  # Latin capital letter I with dot above -> 'i' + combining dot
+    assert len("İ".lower()) == 2  # Confirms length change
+    assert "\u212a".lower() == "k"  # Kelvin sign -> ASCII 'k' (result needs narrower storage)
+
+[case testUpper]
+def test_str_upper() -> None:
+    assert "".upper() == ""
+    assert "abc".upper() == "ABC"
+    assert "ABC".upper() == "ABC"
+    assert "AbC123".upper() == "ABC123"
+    assert "áéí".upper() == "ÁÉÍ"
+    assert "😴🚀".upper() == "😴🚀"
+    # Special
+    assert "ß".upper() == "SS"  # German sharp S -> double S
+    assert "ﬃ".upper() == "FFI"  # Ligature 'ffi' -> separate letters
+    assert len("ﬃ".upper()) == 3  # Confirm length increases
+    assert "\u00ff".upper() == "\u0178"  # Latin-1 input -> result needs wider storage