diff --git a/README.md b/README.md
index 5e24b2d..db86f9b 100644
--- a/README.md
+++ b/README.md
@@ -464,6 +464,37 @@ Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/TORNAD
 
 Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/GPULlama3_ROADMAP.md) to see the roadmap of the project.
 
+
+## Run All Tests
+
+You can run all unit tests with the following Maven command:
+
+```bash
+mvn test
+```
+
+Sample output:
+
+```
+-------------------------------------------------------
+ T E S T S
+-------------------------------------------------------
+Running com.example.tokenizer.impl.MistralTokenizerTest
+Running com.example.tokenizer.impl.TokenizerInterfaceTest
+
+Tests run: 12, Failures: 0, Errors: 0, Skipped: 0
+```
+
+To run the tests inside an IDE (e.g., IntelliJ), right-click a test class and choose Run.
+
+## Test Coverage
+
+The following tokenizer unit tests are included:
+
+| **Test Class**           | **Description**                                                                                                                    |
+|--------------------------|------------------------------------------------------------------------------------------------------------------------------------|
+| `MistralTokenizerTest`   | Verifies Mistral tokenizer functionality, including byte fallback (`<0xXX>`), special-token handling, and encoding/decoding logic |
+| `TokenizerInterfaceTest` | Unit tests for utility methods such as `replaceControlCharacters`, ensuring printable and safe token rendering                    |
 
 -----------
 
 ## Acknowledgments
diff --git a/pom.xml b/pom.xml
index 216dda9..0908f89 100644
--- a/pom.xml
+++ b/pom.xml
@@ -32,6 +32,12 @@
             <artifactId>tornado-runtime</artifactId>
             <version>1.1.1-dev</version>
         </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter</artifactId>
+            <version>5.10.2</version>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
@@ -68,6 +74,11 @@
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>3.2.5</version>
+            </plugin>
         </plugins>
diff --git a/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java b/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java
new file mode 100644
index 0000000..81e365e
--- /dev/null
+++ b/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java
@@ -0,0 +1,80 @@
+package com.example.tokenizer.impl;
+
+import com.example.tokenizer.vocabulary.Vocabulary;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.util.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class MistralTokenizerTest {
+
+    private Vocabulary vocabulary;
+    private MistralTokenizer tokenizer;
+
+    @BeforeEach
+    void setup() {
+        List<String> baseTokens = List.of("▁h", "e", "l", "o", "▁", "▁hello");
+        List<String> byteFallbackTokens = new ArrayList<>();
+
+        for (int i = 0; i < 256; i++) {
+            byteFallbackTokens.add(String.format("<0x%02X>", i));
+        }
+
+        List<String> allTokens = new ArrayList<>();
+        allTokens.addAll(baseTokens);
+        allTokens.addAll(byteFallbackTokens);
+
+        String[] tokens = allTokens.toArray(new String[0]);
+        float[] scores = new float[tokens.length];
+        Arrays.fill(scores, 0.0f); // dummy scores
+
+        int[] tokenTypes = new int[tokens.length];
+        Arrays.fill(tokenTypes, 1); // mark all tokens as normal
+        tokenTypes[baseTokens.size()] = 0; // mark <0x00> as special
+
+        Map<String, Object> metadata = new HashMap<>();
+        metadata.put("tokenizer.ggml.token_type", tokenTypes);
+
+        vocabulary = new Vocabulary(tokens, scores);
+        tokenizer = new MistralTokenizer(metadata, vocabulary);
+    }
+
+    @Test
+    void testEncodeSimpleText() {
+        List<Integer> tokens = tokenizer.encodeAsList("hello");
+        assertNotNull(tokens);
+        assertFalse(tokens.isEmpty());
+    }
+
+    @Test
+    void testRegexPatternReturnsNull() {
+        assertNull(tokenizer.regexPattern());
+    }
+
+    @Test
+    void testSpecialTokenDetection() {
+        assertTrue(tokenizer.isSpecialToken(6));
+        assertFalse(tokenizer.isSpecialToken(0));
+    }
+
+    @Test
+    void testShouldDisplayToken() {
+        assertTrue(tokenizer.shouldDisplayToken(0));
+        assertFalse(tokenizer.shouldDisplayToken(6));
+    }
+
+    @Test
+    void testDecodeSpecialByteFallbackToken() {
+        List<Integer> tokens = List.of(6); // token <0x00>
+        String result = tokenizer.decode(tokens);
+        assertEquals("\u0000", result); // byte-fallback token <0x00> decodes to NUL (U+0000)
+    }
+
+    @Test
+    void testEncodeEmptyInput() {
+        List<Integer> tokens = tokenizer.encodeAsList("");
+        assertTrue(tokens.isEmpty(), "Should return empty token list for empty input");
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/com/example/tokenizer/impl/TokenizerInterfaceTest.java b/src/test/java/com/example/tokenizer/impl/TokenizerInterfaceTest.java
new file mode 100644
index 0000000..dce2e46
--- /dev/null
+++ b/src/test/java/com/example/tokenizer/impl/TokenizerInterfaceTest.java
@@ -0,0 +1,56 @@
+package com.example.tokenizer.impl;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class TokenizerInterfaceTest {
+
+    @Test
+    void testReplaceControlCharactersWithCodePoints() {
+        int[] input = {'H', 'e', '\n', 0x07, 'l', 'o'}; // 0x07 = BEL (control character)
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals("He\n\\u0007lo", result); // '\n' is kept, BEL is escaped
+    }
+
+    @Test
+    void testReplaceControlCharactersWithString() {
+        String input = "He\n\u0007lo"; // 0x07 is the bell character (non-printable control char)
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals("He\n\\u0007lo", result);
+    }
+
+    @Test
+    void testReplaceControlCharactersWithOnlyPrintableChars() {
+        String input = "Hello, World!";
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals(input, result);
+    }
+
+    @Test
+    void testReplaceControlCharactersWithMultipleControlChars() {
+        String input = "\u0001\u0002A\nB\u0003"; // 0x01, 0x02, 0x03 are control chars
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals("\\u0001\\u0002A\nB\\u0003", result);
+    }
+
+    @Test
+    void testReplaceControlCharactersEmptyInput() {
+        String input = "";
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals("", result);
+    }
+
+    @Test
+    void testReplaceControlCharactersNullSafe() {
+        // Documents the current behaviour: a null input is rejected with a NullPointerException.
+        assertThrows(NullPointerException.class, () -> {
+            Tokenizer.replaceControlCharacters((String) null);
+        });
+    }
+}
\ No newline at end of file
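
A note on the conventions these assertions encode: the sketch below is not part of the patch and does not call the project's actual `Tokenizer` or `MistralTokenizer` code. It is a minimal, self-contained illustration of the two behaviours the expected values assume, namely that control characters other than `\n` are rendered as lowercase `\uXXXX` escapes, and that a byte-fallback token such as `<0x00>` decodes back to the character for that byte value. The class and method names (`TokenizerBehaviourSketch`, `escapeControlCharacters`, `decodeByteFallback`) are hypothetical.

```java
public class TokenizerBehaviourSketch {

    // Control characters other than '\n' are escaped as lowercase \u-style hex,
    // matching the expected strings in TokenizerInterfaceTest.
    static String escapeControlCharacters(String s) {
        StringBuilder sb = new StringBuilder();
        s.codePoints().forEach(cp -> {
            if (Character.isISOControl(cp) && cp != '\n') {
                sb.append(String.format("\\u%04x", cp)); // e.g. BEL (U+0007) becomes the six characters backslash-u-0-0-0-7
            } else {
                sb.appendCodePoint(cp);
            }
        });
        return sb.toString();
    }

    // A byte-fallback token "<0xXX>" maps back to the raw byte it encodes,
    // so "<0x00>" decodes to the NUL character (U+0000), as asserted in
    // testDecodeSpecialByteFallbackToken.
    static String decodeByteFallback(String token) {
        int byteValue = Integer.parseInt(token.substring(3, 5), 16); // the two hex digits
        return String.valueOf((char) byteValue);
    }

    public static void main(String[] args) {
        System.out.println(escapeControlCharacters("He\n\u0007lo")); // prints "He", a newline, then the escaped BEL followed by "lo"
        System.out.println(decodeByteFallback("<0x41>"));            // prints "A"
    }
}
```

With the Surefire plugin added in `pom.xml`, a single class can also be run on its own, for example `mvn -Dtest=MistralTokenizerTest test`, in addition to the plain `mvn test` documented in the README section above.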