diff --git a/README.md b/README.md
index 5e24b2d..db86f9b 100644
--- a/README.md
+++ b/README.md
@@ -464,6 +464,34 @@ Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/TORNAD
Click [here](https://github.com/beehive-lab/GPULlama3.java/tree/main/docs/GPULlama3_ROADMAP.md) to see the roadmap of the project.
+
+## Run All Tests
+
+You can run all unit tests using the following Maven command:
+
+```bash
+mvn test
+```
+
+Sample output:
+
+```
+-------------------------------------------------------
+ T E S T S
+-------------------------------------------------------
+Running com.example.tokenizer.impl.MistralTokenizerTest
+Running com.example.tokenizer.impl.TokenizerInterfaceTest
+
+Tests run: 12, Failures: 0, Errors: 0, Skipped: 0
+```
+
+To run tests inside an IDE (e.g., IntelliJ), right-click on the test classes and choose Run.
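+
+To run a single test class from the command line, Maven Surefire's `-Dtest` filter can be used, for example:
+
+```bash
+mvn -Dtest=MistralTokenizerTest test
+```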
+
+## Test Coverage
+The following tokenizer unit tests are included:
+
+| **Test Class** | **Description** |
+|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------|
+| `MistralTokenizerTest`   | Verifies Mistral tokenizer functionality, including byte fallback (`<0xXX>`), special-token handling, and encoding/decoding logic |
+| `TokenizerInterfaceTest` | Unit tests for utility methods like `replaceControlCharacters`, ensuring printable and safe token rendering |
+
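+For example, `replaceControlCharacters` (exercised by `TokenizerInterfaceTest`) keeps ordinary whitespace such as `\n` but escapes non-printable control characters:
+
+```java
+// BEL (\u0007) is escaped to its code point; '\n' passes through unchanged.
+String rendered = Tokenizer.replaceControlCharacters("He\n\u0007lo");
+// rendered: "He\n\\u0007lo"
+```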
-----------
## Acknowledgments
diff --git a/pom.xml b/pom.xml
index 216dda9..0908f89 100644
--- a/pom.xml
+++ b/pom.xml
@@ -32,6 +32,12 @@
             <artifactId>tornado-runtime</artifactId>
             <version>1.1.1-dev</version>
         </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter</artifactId>
+            <version>5.10.2</version>
+            <scope>test</scope>
+        </dependency>
@@ -68,6 +74,11 @@
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>3.2.5</version>
+            </plugin>
diff --git a/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java b/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java
new file mode 100644
index 0000000..81e365e
--- /dev/null
+++ b/src/test/java/com/example/tokenizer/impl/MistralTokenizerTest.java
@@ -0,0 +1,80 @@
+package com.example.tokenizer.impl;
+
+import com.example.tokenizer.vocabulary.Vocabulary;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.util.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class MistralTokenizerTest {
+
+    private Vocabulary vocabulary;
+    private MistralTokenizer tokenizer;
+
+    @BeforeEach
+    void setup() {
+        List<String> baseTokens = List.of("▁h", "e", "l", "o", "▁", "▁hello");
+        List<String> byteFallbackTokens = new ArrayList<>();
+
+        for (int i = 0; i < 256; i++) {
+            byteFallbackTokens.add(String.format("<0x%02X>", i));
+        }
+
+        List<String> allTokens = new ArrayList<>();
+        allTokens.addAll(baseTokens);
+        allTokens.addAll(byteFallbackTokens);
+
+        String[] tokens = allTokens.toArray(new String[0]);
+        float[] scores = new float[tokens.length];
+        Arrays.fill(scores, 0.0f); // dummy scores
+
+        int[] tokenTypes = new int[tokens.length];
+        Arrays.fill(tokenTypes, 1); // mark all tokens as normal
+        tokenTypes[baseTokens.size()] = 0; // mark <0x00> (index 6) as special
+
+        Map<String, Object> metadata = new HashMap<>();
+        metadata.put("tokenizer.ggml.token_type", tokenTypes);
+
+        vocabulary = new Vocabulary(tokens, scores);
+        tokenizer = new MistralTokenizer(metadata, vocabulary);
+    }
+
+    @Test
+    void testEncodeSimpleText() {
+        List<Integer> tokens = tokenizer.encodeAsList("hello");
+        assertNotNull(tokens);
+        assertFalse(tokens.isEmpty());
+    }
+
+    @Test
+    void testRegexPatternReturnsNull() {
+        assertNull(tokenizer.regexPattern());
+    }
+
+    @Test
+    void testSpecialTokenDetection() {
+        assertTrue(tokenizer.isSpecialToken(6));
+        assertFalse(tokenizer.isSpecialToken(0));
+    }
+
+    @Test
+    void testShouldDisplayToken() {
+        assertTrue(tokenizer.shouldDisplayToken(0));
+        assertFalse(tokenizer.shouldDisplayToken(6));
+    }
+
+    @Test
+    void testDecodeSpecialByteFallbackToken() {
+        List<Integer> tokens = List.of(6); // token <0x00>
+        String result = tokenizer.decode(tokens);
+        assertEquals("\u0000", result); // <0x00> decodes to the NUL character
+    }
+
+    @Test
+    void testEncodeEmptyInput() {
+        List<Integer> tokens = tokenizer.encodeAsList("");
+        assertTrue(tokens.isEmpty(), "Should return an empty token list for empty input");
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/com/example/tokenizer/impl/TokenizerInterfaceTest.java b/src/test/java/com/example/tokenizer/impl/TokenizerInterfaceTest.java
new file mode 100644
index 0000000..dce2e46
--- /dev/null
+++ b/src/test/java/com/example/tokenizer/impl/TokenizerInterfaceTest.java
@@ -0,0 +1,56 @@
+package com.example.tokenizer.impl;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class TokenizerInterfaceTest {
+
+    @Test
+    void testReplaceControlCharactersWithCodePoints() {
+        int[] input = {'H', 'e', '\n', 0x07, 'l', 'o'}; // 0x07 = BEL (control character)
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals("He\n\\u0007lo", result); // \n allowed, BEL escaped
+    }
+
+    @Test
+    void testReplaceControlCharactersWithString() {
+        String input = "He\n\u0007lo"; // \u0007 is a bell character (non-printable control char)
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals("He\n\\u0007lo", result);
+    }
+
+    @Test
+    void testReplaceControlCharactersWithOnlyPrintableChars() {
+        String input = "Hello, World!";
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals(input, result);
+    }
+
+    @Test
+    void testReplaceControlCharactersWithMultipleControlChars() {
+        String input = "\u0001\u0002A\nB\u0003"; // \u0001, \u0002, \u0003 are control chars
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals("\\u0001\\u0002A\nB\\u0003", result);
+    }
+
+    @Test
+    void testReplaceControlCharactersEmptyInput() {
+        String input = "";
+        String result = Tokenizer.replaceControlCharacters(input);
+
+        assertEquals("", result);
+    }
+
+    @Test
+    void testReplaceControlCharactersNullSafe() {
+        // Documents current behavior: a null input is rejected with a NullPointerException.
+        assertThrows(NullPointerException.class, () -> {
+            Tokenizer.replaceControlCharacters((String) null);
+        });
+    }
+}
\ No newline at end of file