diff --git a/docs/CHANGES.TXT b/docs/CHANGES.TXT index e9dfe614c..84dd7c5d4 100644 --- a/docs/CHANGES.TXT +++ b/docs/CHANGES.TXT @@ -1,5 +1,6 @@ 1.0 (to be released) ----------------- +- New: Add Encoder Module to Rust - Fix: Segmentation faults on XDS files - Fix: Clippy Errors Based on Rust 1.88 - IMPROVEMENT: Refactor and optimize Dockerfile diff --git a/src/lib_ccx/ccx_encoders_common.c b/src/lib_ccx/ccx_encoders_common.c index c2540dd10..7ca6ce30f 100644 --- a/src/lib_ccx/ccx_encoders_common.c +++ b/src/lib_ccx/ccx_encoders_common.c @@ -19,6 +19,10 @@ int fsync(int fd) } #endif +#ifndef DISABLE_RUST +int ccxr_get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_subs, + enum ccx_encoding_type in_enc, enum ccx_encoding_type out_enc, int max_len); +#endif // These are the default settings for plain transcripts. No times, no CC or caption mode, and no XDS. ccx_encoders_transcript_format ccx_encoders_default_transcript_settings = { @@ -293,6 +297,9 @@ int change_ascii_encoding(unsigned char *dest, unsigned char *src, int len, enum int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_subs, enum ccx_encoding_type in_enc, enum ccx_encoding_type out_enc, int max_len) { +#ifndef DISABLE_RUST + return ccxr_get_str_basic(out_buffer, in_buffer, trim_subs, in_enc, out_enc, max_len); +#else int last_non_blank = -1; int first_non_blank = -1; int len = 0; @@ -305,7 +312,6 @@ int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_ *out_buffer = 0; return 0; } - // change encoding only when required switch (in_enc) { @@ -331,6 +337,7 @@ int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_ return (unsigned)len; // Return length return 0; // Return length +#endif } int write_subtitle_file_footer(struct encoder_ctx *ctx, struct ccx_s_write *out) diff --git a/src/lib_ccx/ccx_encoders_spupng.c b/src/lib_ccx/ccx_encoders_spupng.c index 815c521ed..17da7803b 100644 --- 
a/src/lib_ccx/ccx_encoders_spupng.c +++ b/src/lib_ccx/ccx_encoders_spupng.c @@ -29,18 +29,6 @@ FT_Face face_regular = NULL; FT_Face face_italics = NULL; FT_Face face = NULL; -struct spupng_t -{ - FILE *fpxml; - FILE *fppng; - char *dirname; - char *pngfile; - char *relative_path_png; - int fileIndex; - int xOffset; - int yOffset; -}; - #define CCPL (ccfont2_width / CCW * ccfont2_height / CCH) static int initialized = 0; diff --git a/src/lib_ccx/ccx_encoders_structs.h b/src/lib_ccx/ccx_encoders_structs.h index 6c4d3cdb6..83adc509c 100644 --- a/src/lib_ccx/ccx_encoders_structs.h +++ b/src/lib_ccx/ccx_encoders_structs.h @@ -29,4 +29,16 @@ struct ccx_s_write }; +struct spupng_t +{ + FILE *fpxml; + FILE *fppng; + char *dirname; + char *pngfile; + char *relative_path_png; + int fileIndex; + int xOffset; + int yOffset; +}; + #endif diff --git a/src/rust/build.rs b/src/rust/build.rs index 482694e86..a2005dd65 100644 --- a/src/rust/build.rs +++ b/src/rust/build.rs @@ -12,6 +12,9 @@ fn main() { "writercwtdata", "version", "set_binary_mode", + "net_send_header", // shall be removed after NET + "write_spumux_footer", + "write_spumux_header", ]); #[cfg(feature = "hardsubx_ocr")] @@ -39,6 +42,7 @@ fn main() { "ccx_encoding_type", "ccx_decoder_608_settings", "ccx_decoder_608_report", + "eia608_screen", "uint8_t", "word_list", ]); diff --git a/src/rust/lib_ccxr/src/util/encoding.rs b/src/rust/lib_ccxr/src/util/encoding.rs index 6f0a77a35..06f5eaa05 100644 --- a/src/rust/lib_ccxr/src/util/encoding.rs +++ b/src/rust/lib_ccxr/src/util/encoding.rs @@ -56,6 +56,8 @@ pub type Latin1Char = u8; /// Represents a character in UCS-2 encoding. pub type Ucs2Char = u16; +/// Represents a character in UTF-8 encoding. +pub type Utf8Char = u32; /// A String-like type containing a sequence of Line 21 encoded characters. 
#[derive(Clone, Debug, Eq, PartialEq, Default)] @@ -339,7 +341,7 @@ impl From<&str> for Ucs2String { impl From<&Line21String> for String { fn from(value: &Line21String) -> String { - value.as_vec().iter().map(|&c| line21_to_utf8(c)).collect() + value.as_vec().iter().map(|&c| line21_to_char(c)).collect() } } @@ -480,7 +482,7 @@ impl EncodedString { } /// Converts this [`EncodedString`] to a format provided by `encoding`, returning a new [`EncodedString`]. - /// + /// /// # Examples /// ```rust /// # use lib_ccxr::util::encoding::*; @@ -653,7 +655,7 @@ fn latin1_to_line21(c: Latin1Char) -> Line21Char { } } -fn line21_to_latin1(c: Line21Char) -> Latin1Char { +pub fn line21_to_latin1(c: Line21Char) -> Latin1Char { if c < 0x80 { // Regular line-21 character set, mostly ASCII except these exceptions match c { @@ -764,118 +766,143 @@ fn line21_to_latin1(c: Line21Char) -> Latin1Char { } } -fn line21_to_utf8(c: Line21Char) -> char { +pub fn line21_to_utf8(c: Line21Char) -> (u32, usize) { if c < 0x80 { // Regular line-21 character set, mostly ASCII except these exceptions match c { - 0x2a => 0xe1 as char, // lowercase a, acute accent - 0x5c => 0xe9 as char, // lowercase e, acute accent - 0x5e => 0xed as char, // lowercase i, acute accent - 0x5f => 0xf3 as char, // lowercase o, acute accent - 0x60 => 0xfa as char, // lowercase u, acute accent - 0x7b => 0xe7 as char, // lowercase c with cedilla - 0x7c => 0xf7 as char, // division symbol - 0x7d => 0xd1 as char, // uppercase N tilde - 0x7e => 0xf1 as char, // lowercase n tilde - 0x7f => '■', // Solid block - _ => c as char, + 0x2a => (0xc3a1, 2), // lowercase a, acute accent + 0x5c => (0xc3a9, 2), // lowercase e, acute accent + 0x5e => (0xc3ad, 2), // lowercase i, acute accent + 0x5f => (0xc3b3, 2), // lowercase o, acute accent + 0x60 => (0xc3ba, 2), // lowercase u, acute accent + 0x7b => (0xc3a7, 2), // lowercase c with cedilla + 0x7c => (0xc3b7, 2), // division symbol + 0x7d => (0xc391, 2), // uppercase N tilde + 0x7e => 
(0xc3b1, 2), // lowercase n tilde + 0x7f => (0xe296a0, 3), // Solid block + _ => (c as u32, 1), // Default: regular ASCII } } else { match c { // THIS BLOCK INCLUDES THE 16 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS // THAT COME FROM HI BYTE=0x11 AND LOW BETWEEN 0x30 AND 0x3F - 0x80 => 0xae as char, // Registered symbol (R) - 0x81 => 0xb0 as char, // degree sign - 0x82 => 0xbd as char, // 1/2 symbol - 0x83 => 0xbf as char, // Inverted (open) question mark - 0x84 => '™', // Trademark symbol (TM) - 0x85 => 0xa2 as char, // Cents symbol - 0x86 => 0xa3 as char, // Pounds sterling - 0x87 => 0xb6 as char, // Music note - Not in latin 1, so we use 'pilcrow' - 0x88 => 0xe0 as char, // lowercase a, grave accent - 0x89 => 0x20 as char, // transparent space, we make it regular - 0x8a => 0xe8 as char, // lowercase e, grave accent - 0x8b => 0xe2 as char, // lowercase a, circumflex accent - 0x8c => 0xea as char, // lowercase e, circumflex accent - 0x8d => 0xee as char, // lowercase i, circumflex accent - 0x8e => 0xf4 as char, // lowercase o, circumflex accent - 0x8f => 0xfb as char, // lowercase u, circumflex accent + 0x80 => (0xc2ae, 2), // Registered symbol (R) + 0x81 => (0xc2b0, 2), // degree sign + 0x82 => (0xc2bd, 2), // 1/2 symbol + 0x83 => (0xc2bf, 2), // Inverted (open) question mark + 0x84 => (0xe284a2, 3), // Trademark symbol (TM) + 0x85 => (0xc2a2, 2), // Cents symbol + 0x86 => (0xc2a3, 2), // Pounds sterling + 0x87 => (0xe299aa, 3), // Music note + 0x88 => (0xc3a0, 2), // lowercase a, grave accent + 0x89 => (0x20, 1), // transparent space, we make it regular + 0x8a => (0xc3a8, 2), // lowercase e, grave accent + 0x8b => (0xc3a2, 2), // lowercase a, circumflex accent + 0x8c => (0xc3aa, 2), // lowercase e, circumflex accent + 0x8d => (0xc3ae, 2), // lowercase i, circumflex accent + 0x8e => (0xc3b4, 2), // lowercase o, circumflex accent + 0x8f => (0xc3bb, 2), // lowercase u, circumflex accent + // THIS BLOCK INCLUDES THE 32 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS // THAT COME 
FROM HI BYTE=0x12 AND LOW BETWEEN 0x20 AND 0x3F - 0x90 => 0xc1 as char, // capital letter A with acute - 0x91 => 0xc9 as char, // capital letter E with acute - 0x92 => 0xd3 as char, // capital letter O with acute - 0x93 => 0xda as char, // capital letter U with acute - 0x94 => 0xdc as char, // capital letter U with diaeresis - 0x95 => 0xfc as char, // lowercase letter U with diaeresis - 0x96 => 0x27 as char, // apostrophe - 0x97 => 0xa1 as char, // inverted exclamation mark - 0x98 => 0x2a as char, // asterisk - 0x99 => 0x27 as char, // apostrophe (yes, duped). See CCADI source code. - 0x9a => 0x2d as char, // em dash - 0x9b => 0xa9 as char, // copyright sign - 0x9c => '℠', // Service Mark - 0x9d => 0x2e as char, // Full stop (.) - 0x9e => 0x22 as char, // Quotation mark - 0x9f => 0x22 as char, // Quotation mark - 0xa0 => 0xc0 as char, // uppercase A, grave accent - 0xa1 => 0xc2 as char, // uppercase A, circumflex - 0xa2 => 0xc7 as char, // uppercase C with cedilla - 0xa3 => 0xc8 as char, // uppercase E, grave accent - 0xa4 => 0xca as char, // uppercase E, circumflex - 0xa5 => 0xcb as char, // capital letter E with diaeresis - 0xa6 => 0xeb as char, // lowercase letter e with diaeresis - 0xa7 => 0xce as char, // uppercase I, circumflex - 0xa8 => 0xcf as char, // uppercase I, with diaeresis - 0xa9 => 0xef as char, // lowercase i, with diaeresis - 0xaa => 0xd4 as char, // uppercase O, circumflex - 0xab => 0xd9 as char, // uppercase U, grave accent - 0xac => 0xf9 as char, // lowercase u, grave accent - 0xad => 0xdb as char, // uppercase U, circumflex - 0xae => 0xab as char, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - 0xaf => 0xbb as char, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + 0x90 => (0xc381, 2), // capital letter A with acute + 0x91 => (0xc389, 2), // capital letter E with acute + 0x92 => (0xc393, 2), // capital letter O with acute + 0x93 => (0xc39a, 2), // capital letter U with acute + 0x94 => (0xc39c, 2), // capital letter U with diaeresis + 0x95 => 
(0xc3bc, 2), // lowercase letter U with diaeresis + 0x96 => (0x27, 1), // apostrophe + 0x97 => (0xc2a1, 2), // inverted exclamation mark + 0x98 => (0x2a, 1), // asterisk + 0x99 => (0x27, 1), // Plain single quote + 0x9a => (0xe28094, 3), // em dash + 0x9b => (0xc2a9, 2), // copyright sign + 0x9c => (0xe284a0, 3), // Service mark + 0x9d => (0xe280a2, 3), // Round bullet + 0x9e => (0xe2809c, 3), // Opening double quotes + 0x9f => (0xe2809d, 3), // Closing double quotes + 0xa0 => (0xc380, 2), // uppercase A, grave accent + 0xa1 => (0xc382, 2), // uppercase A, circumflex + 0xa2 => (0xc387, 2), // uppercase C with cedilla + 0xa3 => (0xc388, 2), // uppercase E, grave accent + 0xa4 => (0xc38a, 2), // uppercase E, circumflex + 0xa5 => (0xc38b, 2), // capital letter E with diaeresis + 0xa6 => (0xc3ab, 2), // lowercase letter e with diaeresis + 0xa7 => (0xc38e, 2), // uppercase I, circumflex + 0xa8 => (0xc38f, 2), // uppercase I, with diaeresis + 0xa9 => (0xc3af, 2), // lowercase i, with diaeresis + 0xaa => (0xc394, 2), // uppercase O, circumflex + 0xab => (0xc399, 2), // uppercase U, grave accent + 0xac => (0xc3b9, 2), // lowercase u, grave accent + 0xad => (0xc39b, 2), // uppercase U, circumflex + 0xae => (0xc2ab, 2), // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + 0xaf => (0xc2bb, 2), // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + // THIS BLOCK INCLUDES THE 32 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS // THAT COME FROM HI BYTE=0x13 AND LOW BETWEEN 0x20 AND 0x3F - 0xb0 => 0xc3 as char, // Uppercase A, tilde - 0xb1 => 0xe3 as char, // Lowercase a, tilde - 0xb2 => 0xcd as char, // Uppercase I, acute accent - 0xb3 => 0xcc as char, // Uppercase I, grave accent - 0xb4 => 0xec as char, // Lowercase i, grave accent - 0xb5 => 0xd2 as char, // Uppercase O, grave accent - 0xb6 => 0xf2 as char, // Lowercase o, grave accent - 0xb7 => 0xd5 as char, // Uppercase O, tilde - 0xb8 => 0xf5 as char, // Lowercase o, tilde - 0xb9 => 0x7b as char, // Open curly brace - 0xba => 0x7d as char, // 
Closing curly brace - 0xbb => 0x5c as char, // Backslash - 0xbc => 0x5e as char, // Caret - 0xbd => 0x5f as char, // Underscore - 0xbe => 0xa6 as char, // Pipe (broken bar) - 0xbf => 0x7e as char, // Tilde - 0xc0 => 0xc4 as char, // Uppercase A, umlaut - 0xc1 => 0xe3 as char, // Lowercase A, umlaut - 0xc2 => 0xd6 as char, // Uppercase O, umlaut - 0xc3 => 0xf6 as char, // Lowercase o, umlaut - 0xc4 => 0xdf as char, // Eszett (sharp S) - 0xc5 => 0xa5 as char, // Yen symbol - 0xc6 => 0xa4 as char, // Currency symbol - 0xc7 => 0x7c as char, // Vertical bar - 0xc8 => 0xc5 as char, // Uppercase A, ring - 0xc9 => 0xe5 as char, // Lowercase A, ring - 0xca => 0xd8 as char, // Uppercase O, slash - 0xcb => 0xf8 as char, // Lowercase o, slash - 0xcc => '⌜', // Top left corner - 0xcd => '⌝', // Top right corner - 0xce => '⌞', // Bottom left corner - 0xcf => '⌟', // Bottom right corner - _ => UNAVAILABLE_CHAR as char, // For those that don't have representation - // I'll do it eventually, I promise - // This are weird chars anyway + 0xb0 => (0xc383, 2), // Uppercase A, tilde + 0xb1 => (0xc3a3, 2), // Lowercase a, tilde + 0xb2 => (0xc38d, 2), // Uppercase I, acute accent + 0xb3 => (0xc38c, 2), // Uppercase I, grave accent + 0xb4 => (0xc3ac, 2), // Lowercase i, grave accent + 0xb5 => (0xc392, 2), // Uppercase O, grave accent + 0xb6 => (0xc3b2, 2), // Lowercase o, grave accent + 0xb7 => (0xc395, 2), // Uppercase O, tilde + 0xb8 => (0xc3b5, 2), // Lowercase o, tilde + 0xb9 => (0x7b, 1), // Open curly brace + 0xba => (0x7d, 1), // Closing curly brace + 0xbb => (0x5c, 1), // Backslash + 0xbc => (0x5e, 1), // Caret + 0xbd => (0x5f, 1), // Underscore + 0xbe => (0xc2a6, 2), // Pipe (broken bar) + 0xbf => (0x7e, 1), // Tilde + 0xc0 => (0xc384, 2), // Uppercase A, umlaut + 0xc1 => (0xc3a4, 2), // Lowercase A, umlaut + 0xc2 => (0xc396, 2), // Uppercase O, umlaut + 0xc3 => (0xc3b6, 2), // Lowercase o, umlaut + 0xc4 => (0xc39f, 2), // Esszett (sharp S) + 0xc5 => (0xc2a5, 2), // Yen symbol + 
0xc6 => (0xc2a4, 2), // Currency symbol + 0xc7 => (0x7c, 1), // Vertical bar + 0xc8 => (0xc385, 2), // Uppercase A, ring + 0xc9 => (0xc3a5, 2), // Lowercase A, ring + 0xca => (0xc398, 2), // Uppercase O, slash + 0xcb => (0xc3b8, 2), // Lowercase o, slash + 0xcc => (0xe28c9c, 3), // Top left corner + 0xcd => (0xe28c9d, 3), // Top right corner + 0xce => (0xe28c9e, 3), // Bottom left corner + 0xcf => (0xe28c9f, 3), // Bottom right corner + _ => (b'?' as u32, 1), // I'll do it eventually, I promise + // This are weird chars anyway } } } +pub fn line21_to_char(c: Line21Char) -> char { + let (utf8_packed, byte_count) = line21_to_utf8(c); + + // Extract bytes and create a UTF-8 string + let mut bytes = Vec::new(); + match byte_count { + 1 => bytes.push(utf8_packed as u8), + 2 => { + bytes.push((utf8_packed >> 8) as u8); + bytes.push(utf8_packed as u8); + } + 3 => { + bytes.push((utf8_packed >> 16) as u8); + bytes.push((utf8_packed >> 8) as u8); + bytes.push(utf8_packed as u8); + } + _ => return '?', // Invalid byte count + } -fn line21_to_ucs2(c: Line21Char) -> Ucs2Char { + // Convert UTF-8 bytes to char + match std::str::from_utf8(&bytes) { + Ok(s) => s.chars().next().unwrap_or('?'), + Err(_) => '?', + } +} +pub fn line21_to_ucs2(c: Line21Char) -> Ucs2Char { match c { 0x7f => 0x25A0, // Solid block 0x84 => 0x2122, // Trademark symbol (TM) @@ -905,7 +932,7 @@ fn ucs2_to_line21(c: Ucs2Char) -> Line21Char { } } -fn ucs2_to_latin1(c: Ucs2Char) -> Latin1Char { +pub fn ucs2_to_latin1(c: Ucs2Char) -> Latin1Char { // Code points 0 to U+00FF are the same in both. 
/// Locates the first and last non-blank bytes of `line`.
///
/// A byte is blank when it is an ASCII space or `0x89`. At most `max_len`
/// bytes are examined (never more than `line.len()`), and the scan stops after
/// the first NUL, `\n` or `\r` byte.
///
/// Results are written through the two out-parameters as indices into `line`;
/// both are `-1` when no non-blank byte was seen. Note that the terminating
/// byte itself is examined before the scan stops, so a NUL, `\n` or `\r`
/// inside the scanned range is recorded as non-blank.
fn find_limit_characters(
    line: &[u8],
    first_non_blank: &mut i32,
    last_non_blank: &mut i32,
    max_len: usize,
) {
    // -1 means "nothing found yet"
    let mut first: i32 = -1;
    let mut last: i32 = -1;

    // never look past the end of the slice
    let scan_end = cmp::min(max_len, line.len());

    let mut pos = 0;
    while pos < scan_end {
        let byte = line[pos];

        let is_blank = matches!(byte, b' ' | 0x89);
        if !is_blank {
            if first < 0 {
                first = pos as i32;
            }
            last = pos as i32;
        }

        // end of string or end of line: nothing further is inspected
        if matches!(byte, b'\0' | b'\n' | b'\r') {
            break;
        }
        pos += 1;
    }

    *first_non_blank = first;
    *last_non_blank = last;
}
dest[..to_copy].copy_from_slice(&src[..to_copy]); + return to_copy as i32; + } else { + return EncoderError::Unsupported as i32; + } + } + Encoding::Latin1 => { + if c_len == 1 { + dest[dest_idx] = src[src_idx]; + dest_idx += 1; + } else if c_len == 2 { + if (src[src_idx + 1] & 0x40) == 0 { + let cp = utf8_to_latin1_map( + (((src[src_idx] & 0x1F) as u32) << 6) + | ((src[src_idx + 1] & 0x3F) as u32), + ) as u16; + if cp <= 255 { + dest[dest_idx] = cp as u8; + } else { + dest[dest_idx] = b'?'; + } + dest_idx += 1; + } else { + dest[dest_idx] = b'?'; + dest_idx += 1; + } + } else if c_len == 3 { + if (src[src_idx + 1] & 0x40) == 0 && (src[src_idx + 2] & 0x40) == 0 { + let cp = utf8_to_latin1_map( + (((src[src_idx] & 0x0F) as u32) << 12) + | (((src[src_idx + 1] & 0x3F) as u32) << 6) + | ((src[src_idx + 2] & 0x3F) as u32), + ) as u16; + if cp <= 255 { + dest[dest_idx] = cp as u8; + } else { + dest[dest_idx] = b'?'; + } + dest_idx += 1; + } else { + dest[dest_idx] = b'?'; + dest_idx += 1; + } + } else if c_len == 4 { + if (src[src_idx + 1] & 0x40) == 0 + && (src[src_idx + 2] & 0x40) == 0 + && (src[src_idx + 3] & 0x40) == 0 + { + let cp = utf8_to_latin1_map( + (((src[src_idx] & 0x07) as u32) << 18) + | (((src[src_idx + 1] & 0x3F) as u32) << 12) + | (((src[src_idx + 2] & 0x3F) as u32) << 6) + | ((src[src_idx + 3] & 0x3F) as u32), + ) as u16; + if cp <= 255 { + dest[dest_idx] = cp as u8; + } else { + dest[dest_idx] = b'?'; + } + dest_idx += 1; + } else { + dest[dest_idx] = b'?'; + dest_idx += 1; + } + } else if c_len == 5 { + if (src[src_idx + 1] & 0x40) == 0 + && (src[src_idx + 2] & 0x40) == 0 + && (src[src_idx + 3] & 0x40) == 0 + && (src[src_idx + 4] & 0x40) == 0 + { + let cp = utf8_to_latin1_map( + (((src[src_idx] & 0x03) as u32) << 24u32) + | (((src[src_idx + 1] & 0x3F) as u32) << 18u32) + | (((src[src_idx + 2] & 0x3F) as u32) << 12u32) + | (((src[src_idx + 3] & 0x3F) as u32) << 6u32) + | ((src[src_idx + 4] & 0x3F) as u32), + ) as u16; + if cp <= 255 { + dest[dest_idx] 
= cp as u8; + } else { + dest[dest_idx] = b'?'; + } + dest_idx += 1; + } else { + dest[dest_idx] = b'?'; + dest_idx += 1; + } + } else { + dest[dest_idx] = b'?'; + dest_idx += 1; + } + } + Encoding::Ucs2 => { + return EncoderError::Unsupported as i32; + } + Encoding::Line21 => { + if c_len == 1 { + dest[dest_idx] = src[src_idx]; + dest_idx += 1; + } else { + dest[dest_idx] = b'?'; + dest_idx += 1; + } + } + } + src_idx += c_len; + } + + if dest_idx < dest.len() { + dest[dest_idx] = 0; + } + dest_idx as i32 +} +#[allow(unused_variables)] +fn change_latin1_encoding(dest: &mut [u8], src: &[u8], len: i32, out_enc: Encoding) -> i32 { + EncoderError::Unsupported as i32 +} + +#[allow(unused_variables)] +fn change_unicode_encoding(dest: &mut [u8], src: &[u8], len: i32, out_enc: Encoding) -> i32 { + EncoderError::Unsupported as i32 +} + +pub fn change_ascii_encoding( + dest: &mut Vec, + src: &[u8], + out_enc: Encoding, +) -> Result { + dest.clear(); + + for &c in src { + match out_enc { + Encoding::Utf8 => { + let (utf8_packed, byte_count) = line21_to_utf8(c); + + // Extract bytes based on count (big-endian storage) + match byte_count { + 1 => dest.push(utf8_packed as u8), + 2 => { + dest.push((utf8_packed >> 8) as u8); + dest.push(utf8_packed as u8); + } + 3 => { + dest.push((utf8_packed >> 16) as u8); + dest.push((utf8_packed >> 8) as u8); + dest.push(utf8_packed as u8); + } + _ => return Err(-1), // Invalid byte count + } + } + Encoding::Latin1 => { + let latin1_char = line21_to_latin1(c); + dest.push(latin1_char); + } + Encoding::Ucs2 => { + let ucs2_char = line21_to_ucs2(c); + // UCS-2 is 2 bytes, little-endian + dest.extend_from_slice(&ucs2_char.to_le_bytes()); + } + Encoding::Line21 => { + dest.extend_from_slice(src); + return Ok(src.len()); + } + } + } + + // Add null terminator + dest.push(0); + + Ok(dest.len() - 1) // Return length without null terminator +} + +fn utf8_to_latin1_map(character: u32) -> Latin1Char { + 
ucs2_to_latin1(char_to_ucs2(char::try_from(character).unwrap())) +} +pub fn get_str_basic( + out_buffer: &mut Vec, + in_buffer: &[u8], + trim_subs: bool, + in_enc: Encoding, + out_enc: Encoding, + max_len: i32, +) -> i32 { + let mut last_non_blank: i32 = -1; + let mut first_non_blank: i32 = -1; + let len; + + find_limit_characters( + in_buffer, + &mut first_non_blank, + &mut last_non_blank, + max_len as usize, + ); + + if first_non_blank == -1 { + // No non-blank characters found, return empty + out_buffer.clear(); + out_buffer.push(0); // null terminator only + return 0; + } + + // Calculate the actual content length (without trailing spaces) + let mut content_length = last_non_blank - first_non_blank + 1; + + if !trim_subs { + first_non_blank = 0; + // If not trimming, we need to recalculate to include leading content + content_length = last_non_blank + 1; + } + + if (first_non_blank + content_length) as usize > in_buffer.len() { + out_buffer.clear(); + out_buffer.push(0); + return 0; + } + + // Clear the output buffer + out_buffer.clear(); + + // change encoding only when required + match in_enc { + Encoding::Utf8 => { + let mut temp_buffer = vec![0u8; (content_length * 4) as usize]; // Allow extra space for multi-byte encodings + len = change_utf8_encoding( + &mut temp_buffer, + &in_buffer[first_non_blank as usize..], + content_length, + out_enc, + ); + if len > 0 { + out_buffer.extend_from_slice(&temp_buffer[..len as usize]); + } + } + Encoding::Latin1 => { + let mut temp_buffer = vec![0u8; (content_length * 4) as usize]; + len = change_latin1_encoding( + &mut temp_buffer, + &in_buffer[first_non_blank as usize..], + content_length, + out_enc, + ); + if len > 0 { + out_buffer.extend_from_slice(&temp_buffer[..len as usize]); + } + } + Encoding::Ucs2 => { + let mut temp_buffer = vec![0u8; (content_length * 4) as usize]; + len = change_unicode_encoding( + &mut temp_buffer, + &in_buffer[first_non_blank as usize..], + content_length, + out_enc, + ); + if len > 0 { + 
out_buffer.extend_from_slice(&temp_buffer[..len as usize]); + } + } + Encoding::Line21 => { + let input_slice = + &in_buffer[first_non_blank as usize..(first_non_blank + content_length) as usize]; + len = change_ascii_encoding(out_buffer, input_slice, out_enc) + .unwrap_or(EncoderError::Retry as usize) as i32; + } + } + + if len < 0 { + info!("WARNING: Could not encode in specified format\n"); + out_buffer.clear(); + out_buffer.push(0); + 0 + } else if len == EncoderError::Unsupported as i32 { + info!("WARNING: Encoding is not yet supported\n"); + out_buffer.clear(); + out_buffer.push(0); + return 0; + } else { + // Add null terminator + out_buffer.push(0); + return len; // Return actual content length, not max_len + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_line_with_content() { + let line = b" hello world \n"; + let mut first_non_blank = 0; + let mut last_non_blank = 0; + + find_limit_characters(line, &mut first_non_blank, &mut last_non_blank, 15); + + assert_eq!(first_non_blank, 2); // 'h' is at index 2 + assert_eq!(last_non_blank, 12); // 'd' is at index 12 + } + + #[test] + fn test_line_with_special_chars() { + let line = b" \x89 abc \x89 def \r"; + let mut first_non_blank = 0; + let mut last_non_blank = 0; + + find_limit_characters(line, &mut first_non_blank, &mut last_non_blank, 20); + + assert_eq!(first_non_blank, 3); // 'a' is at index 3 + } + #[test] + fn test_utf8_to_utf8() { + let src = b"Hello, \xC3\xA9world!"; // "Hello, éworld!" + let mut dest = [0u8; 20]; + + let result = change_utf8_encoding(&mut dest, src, src.len() as i32, Encoding::Utf8); + + assert_eq!(result, src.len() as i32); + assert_eq!(&dest[..src.len()], src); + } + + #[test] + fn test_utf8_to_ascii() { + let src = b"Hello, \xC3\xA9world!"; // "Hello, éworld!" + let mut dest = [0u8; 20]; + + let result = change_utf8_encoding(&mut dest, src, src.len() as i32, Encoding::Line21); + + assert_eq!(result, 14); // "Hello, ?world!" 
(14 chars) + assert_eq!(&dest[..14], b"Hello, ?world!"); + } + + #[test] + fn test_unsupported_encoding() { + let src = b"Hello"; + let mut dest = [0u8; 10]; + + let result = change_utf8_encoding(&mut dest, src, src.len() as i32, Encoding::Ucs2); + + assert_eq!(result, EncoderError::Unsupported as i32); + } + #[test] + fn test_ascii_to_ascii() { + let src = b"Hello World!"; + let mut dest = Vec::with_capacity(20); + let result = change_ascii_encoding(&mut dest, src, Encoding::Line21); + + assert_eq!(result.unwrap(), src.len()); + assert_eq!(&dest[..src.len()], src); + } + + #[test] + fn test_ascii_to_utf8() { + let src = b"Hello"; + let mut dest = Vec::with_capacity(20); + + let result = change_ascii_encoding(&mut dest, src, Encoding::Utf8); + + assert_eq!(result.unwrap(), 5); // Each ASCII char becomes 1 UTF-8 byte + assert_eq!(&dest[..5], b"Hello"); + assert_eq!(dest[5], 0); // Null terminator + } + + #[test] + fn test_ascii_to_latin1() { + let src = b"Test"; + let mut dest = Vec::with_capacity(20); + + let result = change_ascii_encoding(&mut dest, src, Encoding::Latin1); + + assert_eq!(result.unwrap(), 4); + assert_eq!(&dest[..4], b"Test"); + assert_eq!(dest[4], 0); // Null terminator + } + + #[test] + fn test_ascii_to_unicode() { + let src = b"Hi"; + let mut dest = Vec::with_capacity(20); + + let result = change_ascii_encoding(&mut dest, src, Encoding::Ucs2); + + assert_eq!(result.unwrap(), 4); // Each ASCII char becomes 2 Unicode bytes + assert_eq!(dest[4], 0); // Null terminator + } + + #[test] + fn test_get_str_basic_with_trim() { + let in_buffer = b" Hello \0"; + let mut out_buffer = Vec::with_capacity(20); + + let result = get_str_basic( + &mut out_buffer, + in_buffer, + true, + Encoding::Line21, + Encoding::Line21, + 10, + ); + + assert!(result > 0); + } + + #[test] + fn test_get_str_basic_without_trim() { + let in_buffer = b" Hello \0"; + let mut out_buffer = Vec::with_capacity(20); + + let result = get_str_basic( + &mut out_buffer, + in_buffer, + false, 
+ Encoding::Line21, + Encoding::Line21, + 10, + ); + + assert!(result > 0); + } +} diff --git a/src/rust/src/encoder/g608.rs b/src/rust/src/encoder/g608.rs new file mode 100644 index 000000000..6f886a11d --- /dev/null +++ b/src/rust/src/encoder/g608.rs @@ -0,0 +1,311 @@ +use crate::bindings::{ + ccx_encoding_type_CCX_ENC_ASCII, ccx_encoding_type_CCX_ENC_LATIN_1, + ccx_encoding_type_CCX_ENC_UNICODE, ccx_encoding_type_CCX_ENC_UTF_8, eia608_screen, encoder_ctx, + font_bits_FONT_ITALICS, font_bits_FONT_REGULAR, font_bits_FONT_UNDERLINED, + font_bits_FONT_UNDERLINED_ITALICS, +}; +use crate::encoder::headers_and_footers::{encode_line, write_raw}; +use crate::libccxr_exports::time::ccxr_millis_to_time; +use lib_ccxr::util::encoding::{line21_to_latin1, line21_to_ucs2, line21_to_utf8}; +use std::io; +use std::os::raw::{c_int, c_uchar, c_uint, c_void}; + +/// Write data to file descriptor with retry logic +pub fn write_wrapped(fd: c_int, buf: &[u8]) -> Result<(), io::Error> { + let mut remaining = buf.len(); + let mut current_buf = buf.as_ptr(); + + while remaining > 0 { + let written = write_raw(fd, current_buf as *const c_void, remaining); + if written == -1 { + return Err(io::Error::last_os_error()); + } + let bytes_written = written as usize; + unsafe { + current_buf = current_buf.add(bytes_written); + } + remaining -= bytes_written; + } + + Ok(()) +} + +/// Get line characters encoded according to context encoding +pub fn get_line_encoded( + ctx: &encoder_ctx, + buffer: &mut [c_uchar], + line_num: usize, + data: &eia608_screen, +) -> c_uint { + let mut buffer_pos = 0; + let line = &data.characters[line_num]; + + for i in line.iter().take(33) { + let bytes_written = match ctx.encoding { + ccx_encoding_type_CCX_ENC_UTF_8 => { + let (utf8_packed, byte_count) = line21_to_utf8(*i); + + // Extract bytes based on count (big-endian storage) + match byte_count { + 1 => { + if buffer_pos < buffer.len() { + buffer[buffer_pos] = utf8_packed as c_uchar; + 1 + } else { + 0 + } + } + 
2 => { + if buffer_pos + 1 < buffer.len() { + buffer[buffer_pos] = (utf8_packed >> 8) as c_uchar; + buffer[buffer_pos + 1] = utf8_packed as c_uchar; + 2 + } else { + 0 + } + } + 3 => { + if buffer_pos + 2 < buffer.len() { + buffer[buffer_pos] = (utf8_packed >> 16) as c_uchar; + buffer[buffer_pos + 1] = (utf8_packed >> 8) as c_uchar; + buffer[buffer_pos + 2] = utf8_packed as c_uchar; + 3 + } else { + 0 + } + } + _ => 0, // Invalid byte count + } + } + ccx_encoding_type_CCX_ENC_LATIN_1 => { + if buffer_pos < buffer.len() { + let latin1_char = line21_to_latin1(*i); + buffer[buffer_pos] = latin1_char as c_uchar; + 1 + } else { + 0 + } + } + ccx_encoding_type_CCX_ENC_UNICODE => { + if buffer_pos + 1 < buffer.len() { + let ucs2_char = line21_to_ucs2(*i); + // UCS-2 is 2 bytes, little-endian + let bytes = ucs2_char.to_le_bytes(); + buffer[buffer_pos] = bytes[0] as c_uchar; + buffer[buffer_pos + 1] = bytes[1] as c_uchar; + 2 + } else { + 0 + } + } + ccx_encoding_type_CCX_ENC_ASCII => { + if buffer_pos < buffer.len() { + buffer[buffer_pos] = *i; + 1 + } else { + 0 + } + } + _ => { + 0 // This should never be reached + } + }; + + buffer_pos += bytes_written; + + // Break if we've run out of buffer space + if bytes_written == 0 { + break; + } + } + + buffer_pos as c_uint +} +/// Get color information encoded as characters +pub fn get_color_encoded( + _ctx: &encoder_ctx, + buffer: &mut [c_uchar], + line_num: usize, + data: &eia608_screen, +) -> c_uint { + let mut buffer_pos = 0; + for i in 0..32 { + if buffer_pos >= buffer.len() { + break; + } + let color_val = data.colors[line_num][i] as u8; + buffer[buffer_pos] = if color_val < 10 { + color_val + b'0' + } else { + b'E' + }; + buffer_pos += 1; + } + if buffer_pos < buffer.len() { + buffer[buffer_pos] = 0; + } + buffer_pos as c_uint +} + +/// Get font information encoded as characters +pub fn get_font_encoded( + _ctx: &encoder_ctx, + buffer: &mut [c_uchar], + line_num: usize, + data: &eia608_screen, +) -> c_uint { + let mut 
buffer_pos = 0; + + for i in 0..32 { + if buffer_pos >= buffer.len() { + break; + } + + let font_val = data.fonts[line_num][i]; + buffer[buffer_pos] = match font_val { + font_bits_FONT_REGULAR => b'R', + font_bits_FONT_UNDERLINED_ITALICS => b'B', + font_bits_FONT_UNDERLINED => b'U', + font_bits_FONT_ITALICS => b'I', + _ => b'E', + }; + buffer_pos += 1; + } + + buffer_pos as c_uint +} + +/// Write CC buffer in G608 format +pub fn write_cc_buffer_as_g608(data: &eia608_screen, context: &mut encoder_ctx) -> c_int { + let mut wrote_something = 0; + + // Convert start and end times + let mut h1: u32 = 0; + let mut m1: u32 = 0; + let mut s1: u32 = 0; + let mut ms1: u32 = 0; + + let mut h2: u32 = 0; + let mut m2: u32 = 0; + let mut s2: u32 = 0; + let mut ms2: u32 = 0; + + unsafe { + ccxr_millis_to_time(data.start_time, &mut h1, &mut m1, &mut s1, &mut ms1); + ccxr_millis_to_time(data.end_time - 1, &mut h2, &mut m2, &mut s2, &mut ms2); + } + + // Increment counter + context.srt_counter += 1; + + // Create timeline string for counter + let counter_line = format!("{}{}", context.srt_counter, unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts( + context.encoded_crlf.as_ptr(), + context.encoded_crlf_length as usize, + )) + }); + + // Encode and write counter line + let buffer_slice = + unsafe { std::slice::from_raw_parts_mut(context.buffer, context.capacity as usize) }; + let used = encode_line(context, buffer_slice, counter_line.as_bytes()); + + if write_wrapped(unsafe { (*context.out).fh }, &buffer_slice[..used as usize]).is_err() { + return 0; + } + + // Create timeline string for timestamps + let timestamp_line = format!( + "{:02}:{:02}:{:02},{:03} --> {:02}:{:02}:{:02},{:03}{}", + h1, + m1, + s1, + ms1, + h2, + m2, + s2, + ms2, + unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts( + context.encoded_crlf.as_ptr(), + context.encoded_crlf_length as usize, + )) + } + ); + + // Encode and write timestamp line + let used = encode_line(context, 
buffer_slice, timestamp_line.as_bytes()); + + if write_wrapped(unsafe { (*context.out).fh }, &buffer_slice[..used as usize]).is_err() { + return 0; + } + + // Write all 15 lines with their encoding information + for i in 0..15 { + let subline_slice = unsafe { + std::slice::from_raw_parts_mut(context.subline, 1024) // temporary, should be inputted after encoder_ctx => EncoderCtx + }; + + // Get line encoded + let length = get_line_encoded(context, subline_slice, i, data); + if write_wrapped( + unsafe { (*context.out).fh }, + &subline_slice[..length as usize], + ) + .is_err() + { + return 0; + } + + // Get color encoded + let length = get_color_encoded(context, subline_slice, i, data); + if write_wrapped( + unsafe { (*context.out).fh }, + &subline_slice[..length as usize], + ) + .is_err() + { + return 0; + } + + // Get font encoded + let length = get_font_encoded(context, subline_slice, i, data); + if write_wrapped( + unsafe { (*context.out).fh }, + &subline_slice[..length as usize], + ) + .is_err() + { + return 0; + } + + // Write CRLF + let crlf_slice = unsafe { + std::slice::from_raw_parts( + context.encoded_crlf.as_ptr(), + context.encoded_crlf_length as usize, + ) + }; + + if write_wrapped(unsafe { (*context.out).fh }, crlf_slice).is_err() { + return 0; + } + + wrote_something = 1; + } + + // Write final CRLF + let crlf_slice = unsafe { + std::slice::from_raw_parts( + context.encoded_crlf.as_ptr(), + context.encoded_crlf_length as usize, + ) + }; + + if write_wrapped(unsafe { (*context.out).fh }, crlf_slice).is_err() { + return 0; + } + + wrote_something +} diff --git a/src/rust/src/encoder/headers_and_footers.rs b/src/rust/src/encoder/headers_and_footers.rs new file mode 100644 index 000000000..09d9147a6 --- /dev/null +++ b/src/rust/src/encoder/headers_and_footers.rs @@ -0,0 +1,545 @@ +#![allow(dead_code)] +use crate::bindings::{ + ccx_encoding_type_CCX_ENC_LATIN_1, ccx_encoding_type_CCX_ENC_UNICODE, + ccx_encoding_type_CCX_ENC_UTF_8, ccx_output_format, 
ccx_s_write, encoder_ctx, net_send_header, + write_spumux_footer, write_spumux_header, +}; +use crate::ccx_options; +use lib_ccxr::common::{BROADCAST_HEADER, LITTLE_ENDIAN_BOM, UTF8_BOM}; +use lib_ccxr::util::log::DebugMessageFlag; +use lib_ccxr::{debug, info}; +use std::alloc::{alloc, dealloc, Layout}; +use std::fs::File; +use std::io::Write; +#[cfg(unix)] +use std::os::fd::FromRawFd; +use std::os::raw::{c_int, c_uchar, c_uint, c_void}; +#[cfg(windows)] +use std::os::windows::io::FromRawHandle; +use std::ptr; +const CCD_HEADER: &[u8] = b"SCC_disassembly V1.2"; +const SCC_HEADER: &[u8] = b"Scenarist_SCC V1.0"; + +const SSA_HEADER: &str = "[Script Info]\n\ +Title: Default file\n\ +ScriptType: v4.00+\n\ +\n\ +[V4+ Styles]\n\ +Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n\ +Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,0\n\ +\n\ +[Events]\n\ +Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\ +\n"; +const SAMI_HEADER: &str = "\n\ +\n\ +\n\ +\n\n\ +\n"; +const SMPTETT_HEADER: &str = "\n \n \n \n