Skip to content
This repository was archived by the owner on Jul 17, 2024. It is now read-only.

fix: Case handling in byte and bytearray methods, converting unicode to ascii array #80

Merged
merged 3 commits into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,8 @@ public static PythonByteArray fromIntTuple(PythonLikeTuple tuple) {
}

public final PythonLikeTuple asIntTuple() {
return IntStream.range(0, valueBuffer.limit()).mapToObj(index -> PythonBytes.BYTE_TO_INT[valueBuffer.get(index) & 0xFF])
return IntStream.range(0, valueBuffer.limit())
.mapToObj(index -> PythonBytes.BYTE_TO_INT[Byte.toUnsignedInt(valueBuffer.get(index))])
.collect(Collectors.toCollection(PythonLikeTuple::new));
}

Expand All @@ -364,7 +365,7 @@ public PythonInteger getCharAt(PythonInteger position) {
throw new IndexError("position " + position + " is less than 0");
}

return PythonBytes.BYTE_TO_INT[valueBuffer.get(index) & 0xFF];
return PythonBytes.BYTE_TO_INT[Byte.toUnsignedInt(valueBuffer.get(index))];
}

public PythonByteArray getSubsequence(PythonSlice slice) {
Expand Down Expand Up @@ -435,7 +436,7 @@ public PythonByteArray repeat(PythonInteger times) {

public DelegatePythonIterator<PythonInteger> getIterator() {
return new DelegatePythonIterator<>(IntStream.range(0, valueBuffer.limit())
.mapToObj(index -> PythonBytes.BYTE_TO_INT[valueBuffer.get(index)])
.mapToObj(index -> PythonBytes.BYTE_TO_INT[Byte.toUnsignedInt(valueBuffer.get(index))])
.iterator());
}

Expand Down Expand Up @@ -707,7 +708,7 @@ public PythonInteger pop() {
if (valueBuffer.limit() == 0) {
throw new IndexError("pop from empty bytearray");
}
PythonInteger out = PythonBytes.BYTE_TO_INT[valueBuffer.get(valueBuffer.limit() - 1) & 0xFF];
PythonInteger out = PythonBytes.BYTE_TO_INT[Byte.toUnsignedInt(valueBuffer.get(valueBuffer.limit() - 1))];
valueBuffer.limit(valueBuffer.limit() - 1);
return out;
}
Expand All @@ -721,7 +722,7 @@ public PythonInteger pop(PythonInteger index) {
if (indexAsInt < 0 || indexAsInt > valueBuffer.limit()) {
throw new IndexError("index out of range for bytearray");
}
PythonInteger out = PythonBytes.BYTE_TO_INT[valueBuffer.get(indexAsInt) & 0xFF];
PythonInteger out = PythonBytes.BYTE_TO_INT[Byte.toUnsignedInt(valueBuffer.get(indexAsInt))];
removeBytesStartingAt(indexAsInt, 1);
return out;
}
Expand Down Expand Up @@ -1824,7 +1825,17 @@ public PythonLikeList<PythonByteArray> rightSplit(PythonNone seperator, PythonIn
}

public PythonByteArray capitalize() {
return asAsciiString().capitalize().asAsciiByteArray();
var asString = asAsciiString();
if (asString.value.isEmpty()) {
return asString.asAsciiByteArray();
}
var tail = PythonString.valueOf(asString.value.substring(1))
.withModifiedCodepoints(cp -> cp < 128 ? Character.toLowerCase(cp) : cp).value;
var head = asString.value.charAt(0);
if (head < 128) {
head = Character.toTitleCase(head);
}
return (PythonString.valueOf(head + tail)).asAsciiByteArray();
}

public PythonByteArray expandTabs() {
Expand Down Expand Up @@ -1874,7 +1885,8 @@ public PythonBoolean isUpper() {
}

public PythonByteArray lower() {
return asAsciiString().lower().asAsciiByteArray();
return asAsciiString().withModifiedCodepoints(
cp -> cp < 128 ? Character.toLowerCase(cp) : cp).asAsciiByteArray();
}

public PythonLikeList<PythonByteArray> splitLines() {
Expand All @@ -1892,15 +1904,17 @@ public PythonLikeList<PythonByteArray> splitLines(PythonBoolean keepEnds) {
}

public PythonByteArray swapCase() {
return asAsciiString().swapCase().asAsciiByteArray();
return asAsciiString().withModifiedCodepoints(
cp -> cp < 128 ? PythonString.CharacterCase.swapCase(cp) : cp).asAsciiByteArray();
}

public PythonByteArray title() {
return asAsciiString().title().asAsciiByteArray();
return asAsciiString().title(cp -> cp < 128).asAsciiByteArray();
}

public PythonByteArray upper() {
return asAsciiString().upper().asAsciiByteArray();
return asAsciiString().withModifiedCodepoints(
cp -> cp < 128 ? Character.toUpperCase(cp) : cp).asAsciiByteArray();
}

public PythonByteArray zfill(PythonInteger width) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ public static PythonBytes fromIntTuple(PythonLikeTuple tuple) {
}

public final PythonLikeTuple asIntTuple() {
return IntStream.range(0, value.length).mapToObj(index -> BYTE_TO_INT[value[index]])
return IntStream.range(0, value.length).mapToObj(index -> BYTE_TO_INT[Byte.toUnsignedInt(value[index])])
.collect(Collectors.toCollection(PythonLikeTuple::new));
}

Expand All @@ -397,7 +397,7 @@ public PythonInteger getCharAt(PythonInteger position) {
throw new IndexError("position " + position + " is less than 0");
}

return BYTE_TO_INT[value[index] & 0xFF];
return BYTE_TO_INT[Byte.toUnsignedInt(value[index])];
}

public PythonBytes getSubsequence(PythonSlice slice) {
Expand Down Expand Up @@ -472,7 +472,7 @@ public PythonBytes repeat(PythonInteger times) {

public DelegatePythonIterator<PythonInteger> getIterator() {
return new DelegatePythonIterator<>(IntStream.range(0, value.length)
.mapToObj(index -> BYTE_TO_INT[value[index]])
.mapToObj(index -> BYTE_TO_INT[Byte.toUnsignedInt(value[index])])
.iterator());
}

Expand Down Expand Up @@ -1597,7 +1597,17 @@ public PythonLikeList<PythonBytes> rightSplit(PythonNone seperator, PythonIntege
}

public PythonBytes capitalize() {
return asAsciiString().capitalize().asAsciiBytes();
var asString = asAsciiString();
if (asString.value.isEmpty()) {
return this;
}
var tail = PythonString.valueOf(asString.value.substring(1))
.withModifiedCodepoints(cp -> cp < 128 ? Character.toLowerCase(cp) : cp).value;
var head = asString.value.charAt(0);
if (head < 128) {
head = Character.toTitleCase(head);
}
return (PythonString.valueOf(head + tail)).asAsciiBytes();
}

public PythonBytes expandTabs() {
Expand Down Expand Up @@ -1646,7 +1656,8 @@ public PythonBoolean isUpper() {
}

public PythonBytes lower() {
return asAsciiString().lower().asAsciiBytes();
return asAsciiString().withModifiedCodepoints(
cp -> cp < 128 ? Character.toLowerCase(cp) : cp).asAsciiBytes();
}

public PythonLikeList<PythonBytes> splitLines() {
Expand All @@ -1664,15 +1675,17 @@ public PythonLikeList<PythonBytes> splitLines(PythonBoolean keepEnds) {
}

public PythonBytes swapCase() {
return asAsciiString().swapCase().asAsciiBytes();
return asAsciiString().withModifiedCodepoints(
cp -> cp < 128 ? PythonString.CharacterCase.swapCase(cp) : cp).asAsciiBytes();
}

public PythonBytes title() {
return asAsciiString().title().asAsciiBytes();
return asAsciiString().title(cp -> cp < 128).asAsciiBytes();
}

public PythonBytes upper() {
return asAsciiString().upper().asAsciiBytes();
return asAsciiString().withModifiedCodepoints(
cp -> cp < 128 ? Character.toUpperCase(cp) : cp).asAsciiBytes();
}

public PythonBytes zfill(PythonInteger width) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import java.util.List;
import java.util.Map;
import java.util.function.IntPredicate;
import java.util.function.IntUnaryOperator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -293,8 +294,9 @@ public final PythonBytes asAsciiBytes() {
outIndex++;
} else {
out[outIndex] = (byte) ((charDatum & 0xFF00) >> 8);
outIndex++;
out[outIndex] = (byte) (charDatum & 0x00FF);
outIndex += 2;
outIndex++;
}
}
return new PythonBytes(out);
Expand Down Expand Up @@ -447,6 +449,10 @@ public PythonString capitalize() {
}

/**
 * Title-cases this string without restricting which characters may be changed.
 *
 * <p>Delegates to {@link #title(IntPredicate)} with a predicate that accepts every character.
 *
 * @return the result of {@code title(IntPredicate)} for an always-true predicate
 */
public PythonString title() {
    IntPredicate acceptEveryCharacter = character -> true;
    return title(acceptEveryCharacter);
}

public PythonString title(IntPredicate predicate) {
if (value.isEmpty()) {
return this;
}
Expand All @@ -458,10 +464,14 @@ public PythonString title() {
for (int i = 0; i < length; i++) {
char character = value.charAt(i);

if (previousIsWordBoundary) {
out.append(Character.toTitleCase(character));
if (predicate.test(character)) {
if (previousIsWordBoundary) {
out.append(Character.toTitleCase(character));
} else {
out.append(Character.toLowerCase(character));
}
} else {
out.append(Character.toLowerCase(character));
out.append(character);
}

previousIsWordBoundary = !Character.isAlphabetic(character);
Expand All @@ -476,11 +486,7 @@ public PythonString casefold() {
}

public PythonString swapCase() {
return PythonString.valueOf(value.codePoints()
.map(CharacterCase::swapCase)
.collect(StringBuilder::new,
StringBuilder::appendCodePoint, StringBuilder::append)
.toString());
return withModifiedCodepoints(CharacterCase::swapCase);
}

public PythonString lower() {
Expand All @@ -491,6 +497,14 @@ public PythonString upper() {
return PythonString.valueOf(value.toUpperCase());
}

/**
 * Builds a new string by passing every code point of this string through {@code modifier}.
 *
 * @param modifier maps each original code point to the code point that should replace it
 * @return a {@code PythonString} made of the mapped code points, in their original order
 */
public PythonString withModifiedCodepoints(IntUnaryOperator modifier) {
    StringBuilder rebuilt = new StringBuilder();
    // Work on code points rather than chars so that surrogate pairs reach the modifier as one value.
    value.codePoints()
            .map(modifier)
            .forEach(rebuilt::appendCodePoint);
    return PythonString.valueOf(rebuilt.toString());
}

public PythonString center(PythonInteger width) {
return center(width, PythonString.valueOf(" "));
}
Expand Down Expand Up @@ -1043,7 +1057,7 @@ public PythonBoolean isUpper() {
}
}

private enum CharacterCase {
enum CharacterCase {
UNCASED,
LOWER,
UPPER;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package ai.timefold.jpyinterpreter.types;

import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.*;

import org.junit.jupiter.api.Test;

class PythonStringTest {

    // The other PythonString methods are tested in test_str.py.
    // The methods below are tested here because they are internal,
    // and they have edge cases that CPython won't hit.

    /** Checks the bytes produced by {@code asAsciiBytes()} for ASCII, non-ASCII and mixed input. */
    @Test
    void asAsciiBytes() {
        var asciiOnly = PythonString.valueOf("abc");
        var asciiOnlyBytes = asciiOnly.asAsciiBytes().asByteArray();
        assertThat(asciiOnlyBytes).isEqualTo(new byte[] { 'a', 'b', 'c' });

        // π is U+03C0; it is expected as its UTF-16 code unit, high byte first.
        var nonAscii = PythonString.valueOf("π");
        var nonAsciiBytes = nonAscii.asAsciiBytes().asByteArray();
        assertThat(nonAsciiBytes).isEqualTo(new byte[] { (byte) 0x03, (byte) 0xC0 });

        // ASCII characters stay one byte each; π still contributes its two UTF-16 bytes.
        var combined = PythonString.valueOf("aπc");
        var combinedBytes = combined.asAsciiBytes().asByteArray();
        assertThat(combinedBytes).isEqualTo(new byte[] { 'a', (byte) 0x03, (byte) 0xC0, 'c' });
    }

    /** Checks the bytes produced by {@code asAsciiByteArray()} for ASCII, non-ASCII and mixed input. */
    @Test
    void asAsciiByteArray() {
        var asciiOnly = PythonString.valueOf("abc");
        var asciiOnlyBytes = asciiOnly.asAsciiByteArray().asByteArray();
        assertThat(asciiOnlyBytes).isEqualTo(new byte[] { 'a', 'b', 'c' });

        // π is U+03C0; it is expected as its UTF-16 code unit, high byte first.
        var nonAscii = PythonString.valueOf("π");
        var nonAsciiBytes = nonAscii.asAsciiByteArray().asByteArray();
        assertThat(nonAsciiBytes).isEqualTo(new byte[] { (byte) 0x03, (byte) 0xC0 });

        // ASCII characters stay one byte each; π still contributes its two UTF-16 bytes.
        var combined = PythonString.valueOf("aπc");
        var combinedBytes = combined.asAsciiByteArray().asByteArray();
        assertThat(combinedBytes).isEqualTo(new byte[] { 'a', (byte) 0x03, (byte) 0xC0, 'c' });
    }
}
4 changes: 4 additions & 0 deletions jpyinterpreter/tests/test_bytearray.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,7 @@ def capitalize(tested: bytearray) -> bytearray:
capitalize_verifier.verify(bytearray(b'hello world'), expected_result=bytearray(b'Hello world'))
capitalize_verifier.verify(bytearray(b'Hello World'), expected_result=bytearray(b'Hello world'))
capitalize_verifier.verify(bytearray(b'HELLO WORLD'), expected_result=bytearray(b'Hello world'))
capitalize_verifier.verify(bytearray('π'.encode()), expected_result=bytearray('π'.encode()))


def test_center():
Expand Down Expand Up @@ -915,6 +916,7 @@ def lower(tested: bytearray) -> bytearray:
lower_verifier.verify(bytearray(b'[]'), expected_result=bytearray(b'[]'))
lower_verifier.verify(bytearray(b'-'), expected_result=bytearray(b'-'))
lower_verifier.verify(bytearray(b'%'), expected_result=bytearray(b'%'))
lower_verifier.verify(bytearray('π'.encode()), expected_result=bytearray('π'.encode()))
lower_verifier.verify(bytearray(b'\n'), expected_result=bytearray(b'\n'))
lower_verifier.verify(bytearray(b'\t'), expected_result=bytearray(b'\t'))
lower_verifier.verify(bytearray(b' '), expected_result=bytearray(b' '))
Expand Down Expand Up @@ -1273,6 +1275,7 @@ def swapcase(tested: bytearray) -> bytearray:
swapcase_verifier.verify(bytearray(b'[]'), expected_result=bytearray(b'[]'))
swapcase_verifier.verify(bytearray(b'-'), expected_result=bytearray(b'-'))
swapcase_verifier.verify(bytearray(b'%'), expected_result=bytearray(b'%'))
swapcase_verifier.verify(bytearray('π'.encode()), expected_result=bytearray('π'.encode()))
swapcase_verifier.verify(bytearray(b'\n'), expected_result=bytearray(b'\n'))
swapcase_verifier.verify(bytearray(b'\t'), expected_result=bytearray(b'\t'))
swapcase_verifier.verify(bytearray(b' '), expected_result=bytearray(b' '))
Expand All @@ -1297,6 +1300,7 @@ def title(tested: bytearray) -> bytearray:
title_verifier.verify(bytearray(b'[]'), expected_result=bytearray(b'[]'))
title_verifier.verify(bytearray(b'-'), expected_result=bytearray(b'-'))
title_verifier.verify(bytearray(b'%'), expected_result=bytearray(b'%'))
title_verifier.verify(bytearray('π'.encode()), expected_result=bytearray('π'.encode()))
title_verifier.verify(bytearray(b'\n'), expected_result=bytearray(b'\n'))
title_verifier.verify(bytearray(b'\t'), expected_result=bytearray(b'\t'))
title_verifier.verify(bytearray(b' '), expected_result=bytearray(b' '))
Expand Down
5 changes: 5 additions & 0 deletions jpyinterpreter/tests/test_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ def capitalize(tested: bytes) -> bytes:
capitalize_verifier.verify(b'hello world', expected_result=b'Hello world')
capitalize_verifier.verify(b'Hello World', expected_result=b'Hello world')
capitalize_verifier.verify(b'HELLO WORLD', expected_result=b'Hello world')
capitalize_verifier.verify('π'.encode(), expected_result='π'.encode())


def test_center():
Expand Down Expand Up @@ -647,6 +648,7 @@ def lower(tested: bytes) -> bytes:
lower_verifier.verify(b'[]', expected_result=b'[]')
lower_verifier.verify(b'-', expected_result=b'-')
lower_verifier.verify(b'%', expected_result=b'%')
lower_verifier.verify('π'.encode(), expected_result='π'.encode())
lower_verifier.verify(b'\n', expected_result=b'\n')
lower_verifier.verify(b'\t', expected_result=b'\t')
lower_verifier.verify(b' ', expected_result=b' ')
Expand Down Expand Up @@ -1005,6 +1007,7 @@ def swapcase(tested: bytes) -> bytes:
swapcase_verifier.verify(b'[]', expected_result=b'[]')
swapcase_verifier.verify(b'-', expected_result=b'-')
swapcase_verifier.verify(b'%', expected_result=b'%')
swapcase_verifier.verify('π'.encode(), expected_result='π'.encode())
swapcase_verifier.verify(b'\n', expected_result=b'\n')
swapcase_verifier.verify(b'\t', expected_result=b'\t')
swapcase_verifier.verify(b' ', expected_result=b' ')
Expand All @@ -1029,6 +1032,7 @@ def title(tested: bytes) -> bytes:
title_verifier.verify(b'[]', expected_result=b'[]')
title_verifier.verify(b'-', expected_result=b'-')
title_verifier.verify(b'%', expected_result=b'%')
title_verifier.verify('π'.encode(), expected_result='π'.encode())
title_verifier.verify(b'\n', expected_result=b'\n')
title_verifier.verify(b'\t', expected_result=b'\t')
title_verifier.verify(b' ', expected_result=b' ')
Expand Down Expand Up @@ -1061,6 +1065,7 @@ def upper(tested: bytes) -> bytes:
upper_verifier.verify(b'[]', expected_result=b'[]')
upper_verifier.verify(b'-', expected_result=b'-')
upper_verifier.verify(b'%', expected_result=b'%')
upper_verifier.verify('π'.encode(), expected_result='π'.encode())
upper_verifier.verify(b'\n', expected_result=b'\n')
upper_verifier.verify(b'\t', expected_result=b'\t')
upper_verifier.verify(b' ', expected_result=b' ')
Expand Down
Loading