From ea7a3a2c41441d511a8203d6cc1b481ef9f541cd Mon Sep 17 00:00:00 2001 From: ayushjha Date: Sun, 15 Jun 2025 14:03:37 +0530 Subject: [PATCH 1/4] test: add JUnit 5 support and initial Tokenizer test --- .../example/tokenizer/impl/TokenizerTest.java | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 src/test/java/com/example/tokenizer/impl/TokenizerTest.java diff --git a/src/test/java/com/example/tokenizer/impl/TokenizerTest.java b/src/test/java/com/example/tokenizer/impl/TokenizerTest.java new file mode 100644 index 0000000..74decb8 --- /dev/null +++ b/src/test/java/com/example/tokenizer/impl/TokenizerTest.java @@ -0,0 +1,71 @@ +package com.example.tokenizer.impl; + +import com.example.core.types.Pair; +import com.example.tokenizer.vocabulary.Vocabulary; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.*; + +class TokenizerTest { + + private Tokenizer tokenizer; + + @BeforeEach + void setup() { + String[] tokens = {"H", "e", "l", "o", " ", "He", "lo"}; + float[] scores = new float[tokens.length]; + + // Create token to index mapping + Vocabulary vocab = new Vocabulary(tokens, scores); + + List> merges = List.of( + new Pair<>(0, 1), // H + e → He + new Pair<>(2, 3) // l + o → lo + ); + + String regex = "[A-Za-z ]+"; + + Map specialTokens = Map.of("", 100, "", 101); + + tokenizer = new Tokenizer(vocab, merges, regex, specialTokens); + } + + @Test + void testEncodeOrdinary() { + List result = tokenizer.encodeOrdinary("Hello"); + assertNotNull(result); + assertTrue(result.contains(5)); // He + assertTrue(result.contains(6)); // lo + } + + @Test + void testEncodeWithSpecialToken() { + String input = "Hello"; + List result = tokenizer.encode(input, Set.of("")); + assertTrue(result.contains(101)); // + } + + @Test + void testRegexPattern() { + assertEquals("[A-Za-z ]+", tokenizer.regexPattern()); + } + + @Test + void testDecode() { + String input = "He lo"; + List ids = tokenizer.encodeOrdinary(input); + String decoded = tokenizer.decode(ids); + assertEquals(input, decoded); + } + + @Test + void testSpecialTokenCheck() { + assertTrue(tokenizer.isSpecialToken(100)); + assertFalse(tokenizer.isSpecialToken(999)); + } +} \ No newline at end of file From a883978437b602d2907cb1795c69e083bfa13d78 Mon Sep 17 00:00:00 2001 From: ayushjha Date: Sun, 15 Jun 2025 14:22:46 +0530 Subject: [PATCH 2/4] test: add JUnit 5 support and initial Tokenizer test --- .../java/com/example/tokenizer/impl/TokenizerTest.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/com/example/tokenizer/impl/TokenizerTest.java b/src/test/java/com/example/tokenizer/impl/TokenizerTest.java index 74decb8..27f3f6e 100644 --- a/src/test/java/com/example/tokenizer/impl/TokenizerTest.java +++ b/src/test/java/com/example/tokenizer/impl/TokenizerTest.java @@ -39,15 +39,15 @@ void setup() { void testEncodeOrdinary() { List result = tokenizer.encodeOrdinary("Hello"); assertNotNull(result); - assertTrue(result.contains(5)); // He - assertTrue(result.contains(6)); // lo + assertTrue(result.contains(5)); + assertTrue(result.contains(6)); } @Test void testEncodeWithSpecialToken() { String input = "Hello"; List result = tokenizer.encode(input, Set.of("")); - assertTrue(result.contains(101)); // + assertTrue(result.contains(2)); } @Test @@ -59,8 +59,8 @@ void testRegexPattern() { void testDecode() { String input = "He lo"; List ids = tokenizer.encodeOrdinary(input); - String decoded = tokenizer.decode(ids); - assertEquals(input, decoded); + String decoded = tokenizer.decodeImpl(ids); + assertEquals("He lo", decoded); } @Test From 69dee5ff23dff2b52600891813c630b9b564dcd4 Mon Sep 17 00:00:00 2001 From: ayushjha Date: Mon, 16 Jun 2025 22:41:24 +0530 Subject: [PATCH 3/4] test: Code review changes done. --- pom.xml | 11 ++++++++ .../example/tokenizer/impl/TokenizerTest.java | 26 ++++++++++++++----- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pom.xml b/pom.xml index 216dda9..0908f89 100644 --- a/pom.xml +++ b/pom.xml @@ -32,6 +32,12 @@ tornado-runtime 1.1.1-dev + + org.junit.jupiter + junit-jupiter + 5.10.2 + test + @@ -68,6 +74,11 @@ + + org.apache.maven.plugins + maven-surefire-plugin + 3.2.5 + diff --git a/src/test/java/com/example/tokenizer/impl/TokenizerTest.java b/src/test/java/com/example/tokenizer/impl/TokenizerTest.java index 27f3f6e..14231b3 100644 --- a/src/test/java/com/example/tokenizer/impl/TokenizerTest.java +++ b/src/test/java/com/example/tokenizer/impl/TokenizerTest.java @@ -5,6 +5,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Set; @@ -39,15 +40,15 @@ void setup() { void testEncodeOrdinary() { List result = tokenizer.encodeOrdinary("Hello"); assertNotNull(result); - assertTrue(result.contains(5)); - assertTrue(result.contains(6)); + assertIterableEquals(List.of(5, 2, 6), result); } @Test - void testEncodeWithSpecialToken() { - String input = "Hello"; - List result = tokenizer.encode(input, Set.of("")); - assertTrue(result.contains(2)); + void testEncodeWithManualSplit() { + List result = new ArrayList<>(); + result.addAll(tokenizer.encodeOrdinary("Hello")); + result.add(tokenizer.getSpecialTokens().get("")); + assertTrue(result.contains(101)); } @Test @@ -68,4 +69,17 @@ void testSpecialTokenCheck() { assertTrue(tokenizer.isSpecialToken(100)); assertFalse(tokenizer.isSpecialToken(999)); } + + //Edge cases + @Test + void testEncodeOrdinaryWithEmptyString() { + List result = tokenizer.encodeOrdinary(""); + assertNotNull(result, "Result should not be null for empty input"); + assertTrue(result.isEmpty(), "Result should be empty for empty input"); + } + + @Test + void testEncodeOrdinaryWithNull() { + assertThrows(NullPointerException.class, () -> tokenizer.encodeOrdinary(null)); + } } \ No newline at end of file From 08addb63e883ca429f700347efb8793fd2e77fd0 Mon Sep 17 00:00:00 2001 From: ayushjha Date: Thu, 19 Jun 2025 19:08:16 +0530 Subject: [PATCH 4/4] test: Code review changes done. --- README.md | 28 ++++++ .../tokenizer/impl/MistralTokenizerTest.java | 80 +++++++++++++++++ .../example/tokenizer/impl/TokenizerTest.java | 87 +++++++------------ 3 files changed, 137 insertions(+), 58 deletions(-) create mode 100644 src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java diff --git a/README.md b/README.md index 5e24b2d..db86f9b 100644 --- a/README.md +++ b/README.md @@ -464,6 +464,34 @@ Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/TORNAD Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/GPULlama3_ROADMAP.md) to see the roadmap of the project. + +## Run All Tests + +You can run all unit tests using the following Maven command: + +```bash + mvn test + +Sample Output +------------------------------------------------------- + T E S T S +------------------------------------------------------- +Running com.example.tokenizer.impl.MistralTokenizerTest +Running com.example.tokenizer.impl.TokenizerInterfaceTest + +Tests run: 12, Failures: 0, Errors: 0, Skipped: 0 + +To run tests inside an IDE (e.g., IntelliJ), right-click on the test classes and choose Run. + +## Test Coverage +Here are the tokenizer unit tests included: + +| **Test Class** | **Description** | +|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------| +| `MistralTokenizerTest` | Verifies Mistral tokenizer functionality including byte fallback (`<0xXX>`), special token handling, encoding and decoding logic | +| `TokenizerInterfaceTest` | Unit tests for utility methods like `replaceControlCharacters`, ensuring printable and safe token rendering | + +``` ----------- ## Acknowledgments diff --git a/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java b/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java new file mode 100644 index 0000000..81e365e --- /dev/null +++ b/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java @@ -0,0 +1,80 @@ +package com.example.tokenizer.impl; + +import com.example.tokenizer.vocabulary.Vocabulary; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.*; + +import static org.junit.jupiter.api.Assertions.*; + +class MistralTokenizerTest { + + private Vocabulary vocabulary; + private MistralTokenizer tokenizer; + + @BeforeEach + void setup() { + List baseTokens = List.of("▁h", "e", "l", "o", "▁", "▁hello"); + List byteFallbackTokens = new ArrayList<>(); + + for (int i = 0; i < 256; i++) { + byteFallbackTokens.add(String.format("<0x%02X>", i)); + } + + List allTokens = new ArrayList<>(); + allTokens.addAll(baseTokens); + allTokens.addAll(byteFallbackTokens); + + String[] tokens = allTokens.toArray(new String[0]); + float[] scores = new float[tokens.length]; + Arrays.fill(scores, 0.0f); // dummy scores + + int[] tokenTypes = new int[tokens.length]; + Arrays.fill(tokenTypes, 1); // mark all normal + tokenTypes[baseTokens.size()] = 0; // mark <0x00> as special + + Map metadata = new HashMap<>(); + metadata.put("tokenizer.ggml.token_type", tokenTypes); + + vocabulary = new Vocabulary(tokens, scores); + tokenizer = new MistralTokenizer(metadata, vocabulary); + } + + @Test + void testEncodeSimpleText() { + List tokens = tokenizer.encodeAsList("hello"); + assertNotNull(tokens); + assertFalse(tokens.isEmpty()); + } + + @Test + void testRegexPatternReturnsNull() { + assertNull(tokenizer.regexPattern()); + } + + @Test + void testSpecialTokenDetection() { + assertTrue(tokenizer.isSpecialToken(6)); + assertFalse(tokenizer.isSpecialToken(0)); + } + + @Test + void testShouldDisplayToken() { + assertTrue(tokenizer.shouldDisplayToken(0)); + assertFalse(tokenizer.shouldDisplayToken(6)); + } + + @Test + void testDecodeSpecialByteFallbackToken() { + List tokens = List.of(6); // token <0x00> + String result = tokenizer.decode(tokens); + assertEquals("\u0000", result); // ASCII for <0x00> + } + + @Test + void testEncodeEmptyInput() { + List tokens = tokenizer.encodeAsList(""); + assertTrue(tokens.isEmpty(), "Should return empty token list for empty input"); + } +} \ No newline at end of file diff --git a/src/test/java/com/example/tokenizer/impl/TokenizerTest.java b/src/test/java/com/example/tokenizer/impl/TokenizerTest.java index 14231b3..dce2e46 100644 --- a/src/test/java/com/example/tokenizer/impl/TokenizerTest.java +++ b/src/test/java/com/example/tokenizer/impl/TokenizerTest.java @@ -1,85 +1,56 @@ package com.example.tokenizer.impl; -import com.example.core.types.Pair; -import com.example.tokenizer.vocabulary.Vocabulary; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; - import static org.junit.jupiter.api.Assertions.*; -class TokenizerTest { - - private Tokenizer tokenizer; - - @BeforeEach - void setup() { - String[] tokens = {"H", "e", "l", "o", " ", "He", "lo"}; - float[] scores = new float[tokens.length]; - - // Create token to index mapping - Vocabulary vocab = new Vocabulary(tokens, scores); +class TokenizerInterfaceTest { - List> merges = List.of( - new Pair<>(0, 1), // H + e → He - new Pair<>(2, 3) // l + o → lo - ); - - String regex = "[A-Za-z ]+"; - - Map specialTokens = Map.of("", 100, "", 101); + @Test + void testReplaceControlCharactersWithCodePoints() { + int[] input = {'H', 'e', '\n', 0x07, 'l', 'o'}; // 0x07 = BEL (control character) + String result = Tokenizer.replaceControlCharacters(input); - tokenizer = new Tokenizer(vocab, merges, regex, specialTokens); + assertEquals("He\n\\u0007lo", result); // \n allowed, BEL escaped } @Test - void testEncodeOrdinary() { - List result = tokenizer.encodeOrdinary("Hello"); - assertNotNull(result); - assertIterableEquals(List.of(5, 2, 6), result); - } + void testReplaceControlCharactersWithString() { + String input = "He\n\u0007lo"; // \u0007 is a bell character (non-printable control char) + String result = Tokenizer.replaceControlCharacters(input); - @Test - void testEncodeWithManualSplit() { - List result = new ArrayList<>(); - result.addAll(tokenizer.encodeOrdinary("Hello")); - result.add(tokenizer.getSpecialTokens().get("")); - assertTrue(result.contains(101)); + assertEquals("He\n\\u0007lo", result); } @Test - void testRegexPattern() { - assertEquals("[A-Za-z ]+", tokenizer.regexPattern()); - } + void testReplaceControlCharactersWithOnlyPrintableChars() { + String input = "Hello, World!"; + String result = Tokenizer.replaceControlCharacters(input); - @Test - void testDecode() { - String input = "He lo"; - List ids = tokenizer.encodeOrdinary(input); - String decoded = tokenizer.decodeImpl(ids); - assertEquals("He lo", decoded); + assertEquals(input, result); } @Test - void testSpecialTokenCheck() { - assertTrue(tokenizer.isSpecialToken(100)); - assertFalse(tokenizer.isSpecialToken(999)); + void testReplaceControlCharactersWithMultipleControlChars() { + String input = "\u0001\u0002A\nB\u0003"; // \u0001, \u0002, \u0003 are control chars + String result = Tokenizer.replaceControlCharacters(input); + + assertEquals("\\u0001\\u0002A\nB\\u0003", result); } - //Edge cases @Test - void testEncodeOrdinaryWithEmptyString() { - List result = tokenizer.encodeOrdinary(""); - assertNotNull(result, "Result should not be null for empty input"); - assertTrue(result.isEmpty(), "Result should be empty for empty input"); + void testReplaceControlCharactersEmptyInput() { + String input = ""; + String result = Tokenizer.replaceControlCharacters(input); + + assertEquals("", result); } @Test - void testEncodeOrdinaryWithNull() { - assertThrows(NullPointerException.class, () -> tokenizer.encodeOrdinary(null)); + void testReplaceControlCharactersNullSafe() { + // Add this test if you plan to make it null-safe. + assertThrows(NullPointerException.class, () -> { + Tokenizer.replaceControlCharacters((String) null); + }); } } \ No newline at end of file