Skip to content

[mypyc] Implement str.lower() and str.upper() primitive #19375

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 10 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mypyc/lib-rt/CPy.h
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,8 @@ PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors);
Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start);
Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end);
CPyTagged CPyStr_Ord(PyObject *obj);
PyObject *CPyStr_Lower(PyObject *self);
PyObject *CPyStr_Upper(PyObject *self);


// Bytes operations
Expand Down
76 changes: 76 additions & 0 deletions mypyc/lib-rt/str_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -546,3 +546,79 @@ CPyTagged CPyStr_Ord(PyObject *obj) {
PyExc_TypeError, "ord() expected a character, but a string of length %zd found", s);
return CPY_INT_TAG;
}

// Implementation of str.lower() for a str object 'self'.
//
// Returns a new reference to the lowercased string, or NULL with an
// exception set on failure.
//
// Pure-ASCII strings are converted inline, since every ASCII character
// lowercases to exactly one ASCII character. Any other string is handed to
// the builtin str.lower: full Unicode lowercasing is not a 1:1 code point
// mapping. It can change the length (U+0130 'İ' becomes 'i' + U+0307), it is
// context sensitive (final sigma), and it can change the maximum code point
// of the result (U+212A KELVIN SIGN becomes ASCII 'k'), so a result buffer
// sized from the input's length and max char would be wrong or
// non-canonical.
PyObject *CPyStr_Lower(PyObject *self) {
    if (PyUnicode_READY(self) == -1)
        return NULL;

    // Fast path: ASCII only
    if (PyUnicode_IS_ASCII(self)) {
        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
        PyObject *res = PyUnicode_New(len, 127);
        if (res == NULL)
            return NULL;
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
        Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
        for (Py_ssize_t i = 0; i < len; i++) {
            res_data[i] = Py_TOLOWER((unsigned char) data[i]);
        }
        return res;
    }

    // General Unicode path: call the unbound builtin str.lower(self) so the
    // result always matches Python semantics. Going through the type (rather
    // than the instance) bypasses any override defined by a str subclass.
    return PyObject_CallMethod((PyObject *)&PyUnicode_Type, "lower", "(O)", self);
}

// Implementation of str.upper() for a str object 'self'.
//
// Returns a new reference to the uppercased string, or NULL with an
// exception set on failure.
//
// Pure-ASCII strings are converted inline, since every ASCII character
// uppercases to exactly one ASCII character. Any other string is handed to
// the builtin str.upper: full Unicode uppercasing is not a 1:1 code point
// mapping. It can change the length (U+00DF 'ß' becomes 'SS', the U+FB03
// ligature becomes 'FFI'), and it can raise the maximum code point of the
// result (U+00FF 'ÿ' becomes U+0178, U+00B5 'µ' becomes U+039C), so writing
// into a buffer sized from the input's length and max char would truncate
// characters.
PyObject *CPyStr_Upper(PyObject *self) {
    if (PyUnicode_READY(self) == -1)
        return NULL;

    // Fast path: ASCII only
    if (PyUnicode_IS_ASCII(self)) {
        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
        PyObject *res = PyUnicode_New(len, 127);
        if (res == NULL)
            return NULL;
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
        Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
        for (Py_ssize_t i = 0; i < len; i++) {
            res_data[i] = Py_TOUPPER((unsigned char) data[i]);
        }
        return res;
    }

    // General Unicode path: call the unbound builtin str.upper(self) so the
    // result always matches Python semantics. Going through the type (rather
    // than the instance) bypasses any override defined by a str subclass.
    return PyObject_CallMethod((PyObject *)&PyUnicode_Type, "upper", "(O)", self);
}
18 changes: 18 additions & 0 deletions mypyc/primitives/str_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,3 +428,21 @@
c_function_name="CPyStr_Ord",
error_kind=ERR_MAGIC,
)

# str.lower()
# Registers a primitive for the method named "lower" whose only argument is
# the str receiver; the call is compiled to the C function CPyStr_Lower and
# the result is typed as str.
# NOTE(review): ERR_MAGIC presumably means failure is signalled through a
# sentinel return value (CPyStr_Lower returns NULL on error) -- confirm.
method_op(
name="lower",
arg_types=[str_rprimitive],
return_type=str_rprimitive,
c_function_name="CPyStr_Lower",
error_kind=ERR_MAGIC,
)

# str.upper()
# Registers a primitive for the method named "upper" whose only argument is
# the str receiver; the call is compiled to the C function CPyStr_Upper and
# the result is typed as str.
# NOTE(review): ERR_MAGIC presumably means failure is signalled through a
# sentinel return value (CPyStr_Upper returns NULL on error) -- confirm.
method_op(
name="upper",
arg_types=[str_rprimitive],
return_type=str_rprimitive,
c_function_name="CPyStr_Upper",
error_kind=ERR_MAGIC,
)
3 changes: 2 additions & 1 deletion mypyc/test-data/fixtures/ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def lstrip(self, item: Optional[str] = None) -> str: pass
def rstrip(self, item: Optional[str] = None) -> str: pass
def join(self, x: Iterable[str]) -> str: pass
def format(self, *args: Any, **kwargs: Any) -> str: ...
def upper(self) -> str: ...
def startswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
def endswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
def replace(self, old: str, new: str, maxcount: int=...) -> str: ...
Expand All @@ -122,6 +121,8 @@ def rpartition(self, sep: str, /) -> Tuple[str, str, str]: ...
def removeprefix(self, prefix: str, /) -> str: ...
def removesuffix(self, suffix: str, /) -> str: ...
def islower(self) -> bool: ...
def lower(self) -> str: ...
def upper(self) -> str: ...

class float:
def __init__(self, x: object) -> None: pass
Expand Down
20 changes: 20 additions & 0 deletions mypyc/test-data/irbuild-str.test
Original file line number Diff line number Diff line change
Expand Up @@ -562,3 +562,23 @@ L0:
r3 = box(native_int, r1)
r4 = unbox(int, r3)
return r4

[case testLower]
def do_lower(s: str) -> str:
return s.lower()
[out]
def do_lower(s):
s, r0 :: str
L0:
r0 = CPyStr_Lower(s)
return r0

[case testUpper]
def do_upper(s: str) -> str:
return s.upper()
[out]
def do_upper(s):
s, r0 :: str
L0:
r0 = CPyStr_Upper(s)
return r0
27 changes: 27 additions & 0 deletions mypyc/test-data/run-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -906,3 +906,30 @@ def test_count_multi_start_end_emoji() -> None:
assert string.count("😴😴😴", 0, 12) == 1, string.count("😴😴😴", 0, 12)
assert string.count("🚀🚀🚀", 0, 12) == 2, string.count("🚀🚀🚀", 0, 12)
assert string.count("ñññ", 0, 12) == 1, string.count("ñññ", 0, 12)

[case testLower]
def test_str_lower() -> None:
assert "".lower() == ""
assert "ABC".lower() == "abc"
assert "abc".lower() == "abc"
assert "AbC123".lower() == "abc123"
assert "áÉÍ".lower() == "áéí"
assert "😴🚀".lower() == "😴🚀"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test special cases (verify that this agrees with normal Python semantics):

  • 'SS'.lower() == 'ss'
  • 'Σ'.lower()
  • 'İ'.lower() (changes length!)

# Special
assert "SS".lower() == "ss"
assert "Σ".lower() == "σ" # Greek capital sigma -> small sigma
#assert "İ".lower() == "i̇" # TODO: Latin capital letter I with dot above -> 'i' + combining dot
#assert len("İ".lower()) == 2 # TODO: Confirms length change

[case testUpper]
def test_str_upper() -> None:
assert "".upper() == ""
assert "abc".upper() == "ABC"
assert "ABC".upper() == "ABC"
assert "AbC123".upper() == "ABC123"
assert "áéí".upper() == "ÁÉÍ"
assert "😴🚀".upper() == "😴🚀"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test special cases (verify that this agrees with normal Python semantics):

  • 'ß'.upper() == 'SS'
  • 'ffi'.upper() (length increases!)

# Special
#assert "ß".upper() == "SS" # TODO: German sharp S -> double S
#assert "ffi".upper() == "FFI" # TODO: Ligature 'ffi' -> separate letters
#assert len("ffi".upper()) == 3 # TODO: Confirm length increases