CCExtractor · steel-bucket · Jul 13, 2025 · Jul 13, 2025 · Jul 13, 2025
diff --git a/docs/CHANGES.TXT b/docs/CHANGES.TXT
@@ -1,5 +1,6 @@
 1.0 (to be released)
 -----------------
+- New: Add Encoder Module to Rust
 - Fix: Segmentation faults on XDS files
 - Fix: Clippy Errors Based on Rust 1.88
 - IMPROVEMENT: Refactor and optimize Dockerfile

diff --git a/src/lib_ccx/ccx_encoders_common.c b/src/lib_ccx/ccx_encoders_common.c
@@ -19,6 +19,10 @@ int fsync(int fd)
 }
 #endif
 
+#ifndef DISABLE_RUST
+int ccxr_get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_subs,
+		       enum ccx_encoding_type in_enc, enum ccx_encoding_type out_enc, int max_len);
+#endif
 // These are the default settings for plain transcripts. No times, no CC or caption mode, and no XDS.
 ccx_encoders_transcript_format ccx_encoders_default_transcript_settings =
     {
@@ -293,6 +297,9 @@ int change_ascii_encoding(unsigned char *dest, unsigned char *src, int len, enum
 int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_subs,
 		  enum ccx_encoding_type in_enc, enum ccx_encoding_type out_enc, int max_len)
 {
+#ifndef DISABLE_RUST
+	return ccxr_get_str_basic(out_buffer, in_buffer, trim_subs, in_enc, out_enc, max_len);
+#else
 	int last_non_blank = -1;
 	int first_non_blank = -1;
 	int len = 0;
@@ -305,7 +312,6 @@ int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_
 		*out_buffer = 0;
 		return 0;
 	}
-
 	// change encoding only when required
 	switch (in_enc)
 	{
@@ -331,6 +337,7 @@ int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_
 		return (unsigned)len; // Return length
 
 	return 0; // Return length
+#endif
 }
 
 int write_subtitle_file_footer(struct encoder_ctx *ctx, struct ccx_s_write *out)

diff --git a/src/lib_ccx/ccx_encoders_spupng.c b/src/lib_ccx/ccx_encoders_spupng.c
@@ -29,18 +29,6 @@ FT_Face face_regular = NULL;
 FT_Face face_italics = NULL;
 FT_Face face = NULL;
 
-struct spupng_t
-{
-	FILE *fpxml;
-	FILE *fppng;
-	char *dirname;
-	char *pngfile;
-	char *relative_path_png;
-	int fileIndex;
-	int xOffset;
-	int yOffset;
-};
-
 #define CCPL (ccfont2_width / CCW * ccfont2_height / CCH)
 
 static int initialized = 0;

diff --git a/src/lib_ccx/ccx_encoders_structs.h b/src/lib_ccx/ccx_encoders_structs.h
@@ -29,4 +29,16 @@ struct ccx_s_write
 
 };
 
+struct spupng_t
+{
+	FILE *fpxml;
+	FILE *fppng;
+	char *dirname;
+	char *pngfile;
+	char *relative_path_png;
+	int fileIndex;
+	int xOffset;
+	int yOffset;
+};
+
 #endif
diff --git a/src/rust/build.rs b/src/rust/build.rs
@@ -12,6 +12,9 @@ fn main() {
         "writercwtdata",
         "version",
         "set_binary_mode",
+        "net_send_header", // shall be removed after NET
+        "write_spumux_footer",
+        "write_spumux_header",
     ]);
 
     #[cfg(feature = "hardsubx_ocr")]
@@ -39,6 +42,7 @@ fn main() {
         "ccx_encoding_type",
         "ccx_decoder_608_settings",
         "ccx_decoder_608_report",
+        "eia608_screen",
         "uint8_t",
         "word_list",
     ]);

diff --git a/src/rust/lib_ccxr/src/util/encoding.rs b/src/rust/lib_ccxr/src/util/encoding.rs
@@ -56,6 +56,8 @@ pub type Latin1Char = u8;
 
 /// Represents a character in UCS-2 encoding.
 pub type Ucs2Char = u16;
+/// Represents a character in UTF-8 encoding.
+pub type Utf8Char = u32;
 
 /// A String-like type containing a sequence of Line 21 encoded characters.
 #[derive(Clone, Debug, Eq, PartialEq, Default)]
@@ -339,7 +341,7 @@ impl From<&str> for Ucs2String {
 
 impl From<&Line21String> for String {
     fn from(value: &Line21String) -> String {
-        value.as_vec().iter().map(|&c| line21_to_utf8(c)).collect()
+        value.as_vec().iter().map(|&c| line21_to_char(c)).collect()
     }
 }
 
@@ -480,7 +482,7 @@ impl EncodedString {
     }
 
     /// Converts this [`EncodedString`] to a format provided by `encoding`, returning a new [`EncodedString`].
-    ///    
+    ///
     /// # Examples
     /// ```rust
     /// # use lib_ccxr::util::encoding::*;
@@ -653,7 +655,7 @@ fn latin1_to_line21(c: Latin1Char) -> Line21Char {
     }
 }
 
-fn line21_to_latin1(c: Line21Char) -> Latin1Char {
+pub fn line21_to_latin1(c: Line21Char) -> Latin1Char {
     if c < 0x80 {
         // Regular line-21 character set, mostly ASCII except these exceptions
         match c {
@@ -764,118 +766,143 @@ fn line21_to_latin1(c: Line21Char) -> Latin1Char {
     }
 }
 
-fn line21_to_utf8(c: Line21Char) -> char {
+pub fn line21_to_utf8(c: Line21Char) -> (u32, usize) {
     if c < 0x80 {
         // Regular line-21 character set, mostly ASCII except these exceptions
         match c {
-            0x2a => 0xe1 as char, // lowercase a, acute accent
-            0x5c => 0xe9 as char, // lowercase e, acute accent
-            0x5e => 0xed as char, // lowercase i, acute accent
-            0x5f => 0xf3 as char, // lowercase o, acute accent
-            0x60 => 0xfa as char, // lowercase u, acute accent
-            0x7b => 0xe7 as char, // lowercase c with cedilla
-            0x7c => 0xf7 as char, // division symbol
-            0x7d => 0xd1 as char, // uppercase N tilde
-            0x7e => 0xf1 as char, // lowercase n tilde
-            0x7f => '■',          // Solid block
-            _ => c as char,
+            0x2a => (0xc3a1, 2),   // lowercase a, acute accent
+            0x5c => (0xc3a9, 2),   // lowercase e, acute accent
+            0x5e => (0xc3ad, 2),   // lowercase i, acute accent
+            0x5f => (0xc3b3, 2),   // lowercase o, acute accent
+            0x60 => (0xc3ba, 2),   // lowercase u, acute accent
+            0x7b => (0xc3a7, 2),   // lowercase c with cedilla
+            0x7c => (0xc3b7, 2),   // division symbol
+            0x7d => (0xc391, 2),   // uppercase N tilde
+            0x7e => (0xc3b1, 2),   // lowercase n tilde
+            0x7f => (0xe296a0, 3), // Solid block
+            _ => (c as u32, 1),    // Default: regular ASCII
         }
     } else {
         match c {
             // THIS BLOCK INCLUDES THE 16 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS
             // THAT COME FROM HI BYTE=0x11 AND LOW BETWEEN 0x30 AND 0x3F
-            0x80 => 0xae as char, // Registered symbol (R)
-            0x81 => 0xb0 as char, // degree sign
-            0x82 => 0xbd as char, // 1/2 symbol
-            0x83 => 0xbf as char, // Inverted (open) question mark
-            0x84 => '™',          // Trademark symbol (TM)
-            0x85 => 0xa2 as char, // Cents symbol
-            0x86 => 0xa3 as char, // Pounds sterling
-            0x87 => 0xb6 as char, // Music note - Not in latin 1, so we use 'pilcrow'
-            0x88 => 0xe0 as char, // lowercase a, grave accent
-            0x89 => 0x20 as char, // transparent space, we make it regular
-            0x8a => 0xe8 as char, // lowercase e, grave accent
-            0x8b => 0xe2 as char, // lowercase a, circumflex accent
-            0x8c => 0xea as char, // lowercase e, circumflex accent
-            0x8d => 0xee as char, // lowercase i, circumflex accent
-            0x8e => 0xf4 as char, // lowercase o, circumflex accent
-            0x8f => 0xfb as char, // lowercase u, circumflex accent
+            0x80 => (0xc2ae, 2),   // Registered symbol (R)
+            0x81 => (0xc2b0, 2),   // degree sign
+            0x82 => (0xc2bd, 2),   // 1/2 symbol
+            0x83 => (0xc2bf, 2),   // Inverted (open) question mark
+            0x84 => (0xe284a2, 3), // Trademark symbol (TM)
+            0x85 => (0xc2a2, 2),   // Cents symbol
+            0x86 => (0xc2a3, 2),   // Pounds sterling
+            0x87 => (0xe299aa, 3), // Music note
+            0x88 => (0xc3a0, 2),   // lowercase a, grave accent
+            0x89 => (0x20, 1),     // transparent space, we make it regular
+            0x8a => (0xc3a8, 2),   // lowercase e, grave accent
+            0x8b => (0xc3a2, 2),   // lowercase a, circumflex accent
+            0x8c => (0xc3aa, 2),   // lowercase e, circumflex accent
+            0x8d => (0xc3ae, 2),   // lowercase i, circumflex accent
+            0x8e => (0xc3b4, 2),   // lowercase o, circumflex accent
+            0x8f => (0xc3bb, 2),   // lowercase u, circumflex accent
+
             // THIS BLOCK INCLUDES THE 32 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS
             // THAT COME FROM HI BYTE=0x12 AND LOW BETWEEN 0x20 AND 0x3F
-            0x90 => 0xc1 as char, // capital letter A with acute
-            0x91 => 0xc9 as char, // capital letter E with acute
-            0x92 => 0xd3 as char, // capital letter O with acute
-            0x93 => 0xda as char, // capital letter U with acute
-            0x94 => 0xdc as char, // capital letter U with diaeresis
-            0x95 => 0xfc as char, // lowercase letter U with diaeresis
-            0x96 => 0x27 as char, // apostrophe
-            0x97 => 0xa1 as char, // inverted exclamation mark
-            0x98 => 0x2a as char, // asterisk
-            0x99 => 0x27 as char, // apostrophe (yes, duped). See CCADI source code.
-            0x9a => 0x2d as char, // em dash
-            0x9b => 0xa9 as char, // copyright sign
-            0x9c => '℠',          // Service Mark
-            0x9d => 0x2e as char, // Full stop (.)
-            0x9e => 0x22 as char, // Quotation mark
-            0x9f => 0x22 as char, // Quotation mark
-            0xa0 => 0xc0 as char, // uppercase A, grave accent
-            0xa1 => 0xc2 as char, // uppercase A, circumflex
-            0xa2 => 0xc7 as char, // uppercase C with cedilla
-            0xa3 => 0xc8 as char, // uppercase E, grave accent
-            0xa4 => 0xca as char, // uppercase E, circumflex
-            0xa5 => 0xcb as char, // capital letter E with diaeresis
-            0xa6 => 0xeb as char, // lowercase letter e with diaeresis
-            0xa7 => 0xce as char, // uppercase I, circumflex
-            0xa8 => 0xcf as char, // uppercase I, with diaeresis
-            0xa9 => 0xef as char, // lowercase i, with diaeresis
-            0xaa => 0xd4 as char, // uppercase O, circumflex
-            0xab => 0xd9 as char, // uppercase U, grave accent
-            0xac => 0xf9 as char, // lowercase u, grave accent
-            0xad => 0xdb as char, // uppercase U, circumflex
-            0xae => 0xab as char, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
-            0xaf => 0xbb as char, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+            0x90 => (0xc381, 2),   // capital letter A with acute
+            0x91 => (0xc389, 2),   // capital letter E with acute
+            0x92 => (0xc393, 2),   // capital letter O with acute
+            0x93 => (0xc39a, 2),   // capital letter U with acute
+            0x94 => (0xc39c, 2),   // capital letter U with diaeresis
+            0x95 => (0xc3bc, 2),   // lowercase letter u with diaeresis
+            0x96 => (0x27, 1),     // apostrophe
+            0x97 => (0xc2a1, 2),   // inverted exclamation mark
+            0x98 => (0x2a, 1),     // asterisk
+            0x99 => (0x27, 1),     // Plain single quote
+            0x9a => (0xe28094, 3), // em dash
+            0x9b => (0xc2a9, 2),   // copyright sign
+            0x9c => (0xe284a0, 3), // Service mark
+            0x9d => (0xe280a2, 3), // Round bullet
+            0x9e => (0xe2809c, 3), // Opening double quotes
+            0x9f => (0xe2809d, 3), // Closing double quotes
+            0xa0 => (0xc380, 2),   // uppercase A, grave accent
+            0xa1 => (0xc382, 2),   // uppercase A, circumflex
+            0xa2 => (0xc387, 2),   // uppercase C with cedilla
+            0xa3 => (0xc388, 2),   // uppercase E, grave accent
+            0xa4 => (0xc38a, 2),   // uppercase E, circumflex
+            0xa5 => (0xc38b, 2),   // capital letter E with diaeresis
+            0xa6 => (0xc3ab, 2),   // lowercase letter e with diaeresis
+            0xa7 => (0xc38e, 2),   // uppercase I, circumflex
+            0xa8 => (0xc38f, 2),   // uppercase I, with diaeresis
+            0xa9 => (0xc3af, 2),   // lowercase i, with diaeresis
+            0xaa => (0xc394, 2),   // uppercase O, circumflex
+            0xab => (0xc399, 2),   // uppercase U, grave accent
+            0xac => (0xc3b9, 2),   // lowercase u, grave accent
+            0xad => (0xc39b, 2),   // uppercase U, circumflex
+            0xae => (0xc2ab, 2),   // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+            0xaf => (0xc2bb, 2),   // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+
             // THIS BLOCK INCLUDES THE 32 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS
             // THAT COME FROM HI BYTE=0x13 AND LOW BETWEEN 0x20 AND 0x3F
-            0xb0 => 0xc3 as char, // Uppercase A, tilde
-            0xb1 => 0xe3 as char, // Lowercase a, tilde
-            0xb2 => 0xcd as char, // Uppercase I, acute accent
-            0xb3 => 0xcc as char, // Uppercase I, grave accent
-            0xb4 => 0xec as char, // Lowercase i, grave accent
-            0xb5 => 0xd2 as char, // Uppercase O, grave accent
-            0xb6 => 0xf2 as char, // Lowercase o, grave accent
-            0xb7 => 0xd5 as char, // Uppercase O, tilde
-            0xb8 => 0xf5 as char, // Lowercase o, tilde
-            0xb9 => 0x7b as char, // Open curly brace
-            0xba => 0x7d as char, // Closing curly brace
-            0xbb => 0x5c as char, // Backslash
-            0xbc => 0x5e as char, // Caret
-            0xbd => 0x5f as char, // Underscore
-            0xbe => 0xa6 as char, // Pipe (broken bar)
-            0xbf => 0x7e as char, // Tilde
-            0xc0 => 0xc4 as char, // Uppercase A, umlaut
-            0xc1 => 0xe3 as char, // Lowercase A, umlaut
-            0xc2 => 0xd6 as char, // Uppercase O, umlaut
-            0xc3 => 0xf6 as char, // Lowercase o, umlaut
-            0xc4 => 0xdf as char, // Eszett (sharp S)
-            0xc5 => 0xa5 as char, // Yen symbol
-            0xc6 => 0xa4 as char, // Currency symbol
-            0xc7 => 0x7c as char, // Vertical bar
-            0xc8 => 0xc5 as char, // Uppercase A, ring
-            0xc9 => 0xe5 as char, // Lowercase A, ring
-            0xca => 0xd8 as char, // Uppercase O, slash
-            0xcb => 0xf8 as char, // Lowercase o, slash
-            0xcc => '⌜',          // Top left corner
-            0xcd => '⌝',          // Top right corner
-            0xce => '⌞',          // Bottom left corner
-            0xcf => '⌟',          // Bottom right corner
-            _ => UNAVAILABLE_CHAR as char, // For those that don't have representation
-                                   // I'll do it eventually, I promise
-                                   // This are weird chars anyway
+            0xb0 => (0xc383, 2),   // Uppercase A, tilde
+            0xb1 => (0xc3a3, 2),   // Lowercase a, tilde
+            0xb2 => (0xc38d, 2),   // Uppercase I, acute accent
+            0xb3 => (0xc38c, 2),   // Uppercase I, grave accent
+            0xb4 => (0xc3ac, 2),   // Lowercase i, grave accent
+            0xb5 => (0xc392, 2),   // Uppercase O, grave accent
+            0xb6 => (0xc3b2, 2),   // Lowercase o, grave accent
+            0xb7 => (0xc395, 2),   // Uppercase O, tilde
+            0xb8 => (0xc3b5, 2),   // Lowercase o, tilde
+            0xb9 => (0x7b, 1),     // Open curly brace
+            0xba => (0x7d, 1),     // Closing curly brace
+            0xbb => (0x5c, 1),     // Backslash
+            0xbc => (0x5e, 1),     // Caret
+            0xbd => (0x5f, 1),     // Underscore
+            0xbe => (0xc2a6, 2),   // Pipe (broken bar)
+            0xbf => (0x7e, 1),     // Tilde
+            0xc0 => (0xc384, 2),   // Uppercase A, umlaut
+            0xc1 => (0xc3a4, 2),   // Lowercase a, umlaut
+            0xc2 => (0xc396, 2),   // Uppercase O, umlaut
+            0xc3 => (0xc3b6, 2),   // Lowercase o, umlaut
+            0xc4 => (0xc39f, 2),   // Eszett (sharp S)
+            0xc5 => (0xc2a5, 2),   // Yen symbol
+            0xc6 => (0xc2a4, 2),   // Currency symbol
+            0xc7 => (0x7c, 1),     // Vertical bar
+            0xc8 => (0xc385, 2),   // Uppercase A, ring
+            0xc9 => (0xc3a5, 2),   // Lowercase a, ring
+            0xca => (0xc398, 2),   // Uppercase O, slash
+            0xcb => (0xc3b8, 2),   // Lowercase o, slash
+            0xcc => (0xe28c9c, 3), // Top left corner
+            0xcd => (0xe28c9d, 3), // Top right corner
+            0xce => (0xe28c9e, 3), // Bottom left corner
+            0xcf => (0xe28c9f, 3), // Bottom right corner
+            _ => (b'?' as u32, 1), // I'll do it eventually, I promise
+                                    // These are weird chars anyway
         }
     }
 }
+pub fn line21_to_char(c: Line21Char) -> char {
+    let (utf8_packed, byte_count) = line21_to_utf8(c);
+
+    // Extract bytes and create a UTF-8 string
+    let mut bytes = Vec::new();
+    match byte_count {
+        1 => bytes.push(utf8_packed as u8),
+        2 => {
+            bytes.push((utf8_packed >> 8) as u8);
+            bytes.push(utf8_packed as u8);
+        }
+        3 => {
+            bytes.push((utf8_packed >> 16) as u8);
+            bytes.push((utf8_packed >> 8) as u8);
+            bytes.push(utf8_packed as u8);
+        }
+        _ => return '?', // Invalid byte count
+    }
 
-fn line21_to_ucs2(c: Line21Char) -> Ucs2Char {
+    // Convert UTF-8 bytes to char
+    match std::str::from_utf8(&bytes) {
+        Ok(s) => s.chars().next().unwrap_or('?'),
+        Err(_) => '?',
+    }
+}
+pub fn line21_to_ucs2(c: Line21Char) -> Ucs2Char {
     match c {
         0x7f => 0x25A0,                  // Solid block
         0x84 => 0x2122,                  // Trademark symbol (TM)
@@ -905,7 +932,7 @@ fn ucs2_to_line21(c: Ucs2Char) -> Line21Char {
     }
 }
 
-fn ucs2_to_latin1(c: Ucs2Char) -> Latin1Char {
+pub fn ucs2_to_latin1(c: Ucs2Char) -> Latin1Char {
     // Code points 0 to U+00FF are the same in both.
     if c < 0xff {
         c as u8
@@ -1004,6 +1031,6 @@ fn ucs2_to_char(c: Ucs2Char) -> char {
     char::from_u32(x).unwrap_or(UNAVAILABLE_CHAR.into())
 }
 
-fn char_to_ucs2(c: char) -> Ucs2Char {
+pub fn char_to_ucs2(c: char) -> Ucs2Char {
     (c as u32).try_into().unwrap_or(UNAVAILABLE_CHAR.into())
 }