Skip to content

[Rust] Added Encoder Module #1710

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/CHANGES.TXT
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
1.0 (to be released)
-----------------
- New: Add Encoder Module to Rust
- Fix: Segmentation faults on XDS files
- Fix: Clippy Errors Based on Rust 1.88
- IMPROVEMENT: Refactor and optimize Dockerfile
Expand Down
9 changes: 8 additions & 1 deletion src/lib_ccx/ccx_encoders_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ int fsync(int fd)
}
#endif

#ifndef DISABLE_RUST
// Rust-side counterpart of get_str_basic(). When DISABLE_RUST is not defined,
// get_str_basic() in this file forwards all of its arguments to this function
// unchanged and returns its result, so both must keep the same signature.
int ccxr_get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_subs,
enum ccx_encoding_type in_enc, enum ccx_encoding_type out_enc, int max_len);
#endif
// These are the default settings for plain transcripts. No times, no CC or caption mode, and no XDS.
ccx_encoders_transcript_format ccx_encoders_default_transcript_settings =
{
Expand Down Expand Up @@ -293,6 +297,9 @@ int change_ascii_encoding(unsigned char *dest, unsigned char *src, int len, enum
int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_subs,
enum ccx_encoding_type in_enc, enum ccx_encoding_type out_enc, int max_len)
{
#ifndef DISABLE_RUST
return ccxr_get_str_basic(out_buffer, in_buffer, trim_subs, in_enc, out_enc, max_len);
#else
int last_non_blank = -1;
int first_non_blank = -1;
int len = 0;
Expand All @@ -305,7 +312,6 @@ int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_
*out_buffer = 0;
return 0;
}

// change encoding only when required
switch (in_enc)
{
Expand All @@ -331,6 +337,7 @@ int get_str_basic(unsigned char *out_buffer, unsigned char *in_buffer, int trim_
return (unsigned)len; // Return length

return 0; // Return length
#endif
}

int write_subtitle_file_footer(struct encoder_ctx *ctx, struct ccx_s_write *out)
Expand Down
12 changes: 0 additions & 12 deletions src/lib_ccx/ccx_encoders_spupng.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,6 @@ FT_Face face_regular = NULL;
FT_Face face_italics = NULL;
FT_Face face = NULL;

struct spupng_t
{
FILE *fpxml;
FILE *fppng;
char *dirname;
char *pngfile;
char *relative_path_png;
int fileIndex;
int xOffset;
int yOffset;
};

#define CCPL (ccfont2_width / CCW * ccfont2_height / CCH)

static int initialized = 0;
Expand Down
12 changes: 12 additions & 0 deletions src/lib_ccx/ccx_encoders_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,16 @@ struct ccx_s_write

};

// State record for SPUPNG output. The field notes below are inferred from the
// field names only — NOTE(review): confirm against ccx_encoders_spupng.c.
struct spupng_t
{
FILE *fpxml; // presumably the stream for the XML file — confirm
FILE *fppng; // presumably the stream for the PNG currently being written — confirm
char *dirname; // presumably the directory that receives the PNG files — confirm
char *pngfile; // presumably the path of the current PNG file — confirm
char *relative_path_png; // presumably the PNG path as written into the XML — confirm
int fileIndex; // presumably a running counter used to name PNG files — confirm
int xOffset; // horizontal offset; units and reference frame not visible here — TODO confirm
int yOffset; // vertical offset; units and reference frame not visible here — TODO confirm
};

#endif
4 changes: 4 additions & 0 deletions src/rust/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ fn main() {
"writercwtdata",
"version",
"set_binary_mode",
"net_send_header", // shall be removed after NET
"write_spumux_footer",
"write_spumux_header",
]);

#[cfg(feature = "hardsubx_ocr")]
Expand Down Expand Up @@ -39,6 +42,7 @@ fn main() {
"ccx_encoding_type",
"ccx_decoder_608_settings",
"ccx_decoder_608_report",
"eia608_screen",
"uint8_t",
"word_list",
]);
Expand Down
229 changes: 128 additions & 101 deletions src/rust/lib_ccxr/src/util/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ pub type Latin1Char = u8;

/// Represents a character in UCS-2 encoding.
pub type Ucs2Char = u16;
/// Represents a character in UTF-8 encoding.
pub type Utf8Char = u32;

/// A String-like type containing a sequence of Line 21 encoded characters.
#[derive(Clone, Debug, Eq, PartialEq, Default)]
Expand Down Expand Up @@ -339,7 +341,7 @@ impl From<&str> for Ucs2String {

impl From<&Line21String> for String {
    /// Builds a `String` by decoding every Line 21 character with [`line21_to_char`].
    ///
    /// `line21_to_char` is used rather than `line21_to_utf8` because the latter
    /// returns packed UTF-8 bytes plus a length, which cannot be collected into a
    /// `String` directly.
    fn from(value: &Line21String) -> String {
        value.as_vec().iter().map(|&c| line21_to_char(c)).collect()
    }
}

Expand Down Expand Up @@ -480,7 +482,7 @@ impl EncodedString {
}

/// Converts this [`EncodedString`] to a format provided by `encoding`, returning a new [`EncodedString`].
///
///
/// # Examples
/// ```rust
/// # use lib_ccxr::util::encoding::*;
Expand Down Expand Up @@ -653,7 +655,7 @@ fn latin1_to_line21(c: Latin1Char) -> Line21Char {
}
}

fn line21_to_latin1(c: Line21Char) -> Latin1Char {
pub fn line21_to_latin1(c: Line21Char) -> Latin1Char {
if c < 0x80 {
// Regular line-21 character set, mostly ASCII except these exceptions
match c {
Expand Down Expand Up @@ -764,118 +766,143 @@ fn line21_to_latin1(c: Line21Char) -> Latin1Char {
}
}

fn line21_to_utf8(c: Line21Char) -> char {
pub fn line21_to_utf8(c: Line21Char) -> (u32, usize) {
if c < 0x80 {
// Regular line-21 character set, mostly ASCII except these exceptions
match c {
0x2a => 0xe1 as char, // lowercase a, acute accent
0x5c => 0xe9 as char, // lowercase e, acute accent
0x5e => 0xed as char, // lowercase i, acute accent
0x5f => 0xf3 as char, // lowercase o, acute accent
0x60 => 0xfa as char, // lowercase u, acute accent
0x7b => 0xe7 as char, // lowercase c with cedilla
0x7c => 0xf7 as char, // division symbol
0x7d => 0xd1 as char, // uppercase N tilde
0x7e => 0xf1 as char, // lowercase n tilde
0x7f => '■', // Solid block
_ => c as char,
0x2a => (0xc3a1, 2), // lowercase a, acute accent
0x5c => (0xc3a9, 2), // lowercase e, acute accent
0x5e => (0xc3ad, 2), // lowercase i, acute accent
0x5f => (0xc3b3, 2), // lowercase o, acute accent
0x60 => (0xc3ba, 2), // lowercase u, acute accent
0x7b => (0xc3a7, 2), // lowercase c with cedilla
0x7c => (0xc3b7, 2), // division symbol
0x7d => (0xc391, 2), // uppercase N tilde
0x7e => (0xc3b1, 2), // lowercase n tilde
0x7f => (0xe296a0, 3), // Solid block
_ => (c as u32, 1), // Default: regular ASCII
}
} else {
match c {
// THIS BLOCK INCLUDES THE 16 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS
// THAT COME FROM HI BYTE=0x11 AND LOW BETWEEN 0x30 AND 0x3F
0x80 => 0xae as char, // Registered symbol (R)
0x81 => 0xb0 as char, // degree sign
0x82 => 0xbd as char, // 1/2 symbol
0x83 => 0xbf as char, // Inverted (open) question mark
0x84 => '™', // Trademark symbol (TM)
0x85 => 0xa2 as char, // Cents symbol
0x86 => 0xa3 as char, // Pounds sterling
0x87 => 0xb6 as char, // Music note - Not in latin 1, so we use 'pilcrow'
0x88 => 0xe0 as char, // lowercase a, grave accent
0x89 => 0x20 as char, // transparent space, we make it regular
0x8a => 0xe8 as char, // lowercase e, grave accent
0x8b => 0xe2 as char, // lowercase a, circumflex accent
0x8c => 0xea as char, // lowercase e, circumflex accent
0x8d => 0xee as char, // lowercase i, circumflex accent
0x8e => 0xf4 as char, // lowercase o, circumflex accent
0x8f => 0xfb as char, // lowercase u, circumflex accent
0x80 => (0xc2ae, 2), // Registered symbol (R)
0x81 => (0xc2b0, 2), // degree sign
0x82 => (0xc2bd, 2), // 1/2 symbol
0x83 => (0xc2bf, 2), // Inverted (open) question mark
0x84 => (0xe284a2, 3), // Trademark symbol (TM)
0x85 => (0xc2a2, 2), // Cents symbol
0x86 => (0xc2a3, 2), // Pounds sterling
0x87 => (0xe299aa, 3), // Music note
0x88 => (0xc3a0, 2), // lowercase a, grave accent
0x89 => (0x20, 1), // transparent space, we make it regular
0x8a => (0xc3a8, 2), // lowercase e, grave accent
0x8b => (0xc3a2, 2), // lowercase a, circumflex accent
0x8c => (0xc3aa, 2), // lowercase e, circumflex accent
0x8d => (0xc3ae, 2), // lowercase i, circumflex accent
0x8e => (0xc3b4, 2), // lowercase o, circumflex accent
0x8f => (0xc3bb, 2), // lowercase u, circumflex accent

// THIS BLOCK INCLUDES THE 32 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS
// THAT COME FROM HI BYTE=0x12 AND LOW BETWEEN 0x20 AND 0x3F
0x90 => 0xc1 as char, // capital letter A with acute
0x91 => 0xc9 as char, // capital letter E with acute
0x92 => 0xd3 as char, // capital letter O with acute
0x93 => 0xda as char, // capital letter U with acute
0x94 => 0xdc as char, // capital letter U with diaeresis
0x95 => 0xfc as char, // lowercase letter U with diaeresis
0x96 => 0x27 as char, // apostrophe
0x97 => 0xa1 as char, // inverted exclamation mark
0x98 => 0x2a as char, // asterisk
0x99 => 0x27 as char, // apostrophe (yes, duped). See CCADI source code.
0x9a => 0x2d as char, // em dash
0x9b => 0xa9 as char, // copyright sign
0x9c => '℠', // Service Mark
0x9d => 0x2e as char, // Full stop (.)
0x9e => 0x22 as char, // Quotation mark
0x9f => 0x22 as char, // Quotation mark
0xa0 => 0xc0 as char, // uppercase A, grave accent
0xa1 => 0xc2 as char, // uppercase A, circumflex
0xa2 => 0xc7 as char, // uppercase C with cedilla
0xa3 => 0xc8 as char, // uppercase E, grave accent
0xa4 => 0xca as char, // uppercase E, circumflex
0xa5 => 0xcb as char, // capital letter E with diaeresis
0xa6 => 0xeb as char, // lowercase letter e with diaeresis
0xa7 => 0xce as char, // uppercase I, circumflex
0xa8 => 0xcf as char, // uppercase I, with diaeresis
0xa9 => 0xef as char, // lowercase i, with diaeresis
0xaa => 0xd4 as char, // uppercase O, circumflex
0xab => 0xd9 as char, // uppercase U, grave accent
0xac => 0xf9 as char, // lowercase u, grave accent
0xad => 0xdb as char, // uppercase U, circumflex
0xae => 0xab as char, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
0xaf => 0xbb as char, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
0x90 => (0xc381, 2), // capital letter A with acute
0x91 => (0xc389, 2), // capital letter E with acute
0x92 => (0xc393, 2), // capital letter O with acute
0x93 => (0xc39a, 2), // capital letter U with acute
0x94 => (0xc39c, 2), // capital letter U with diaeresis
0x95 => (0xc3bc, 2), // lowercase letter U with diaeresis
0x96 => (0x27, 1), // apostrophe
0x97 => (0xc2a1, 2), // inverted exclamation mark
0x98 => (0x2a, 1), // asterisk
0x99 => (0x27, 1), // Plain single quote
0x9a => (0xe28094, 3), // em dash
0x9b => (0xc2a9, 2), // copyright sign
0x9c => (0xe284a0, 3), // Service mark
0x9d => (0xe280a2, 3), // Round bullet
0x9e => (0xe2809c, 3), // Opening double quotes
0x9f => (0xe2809d, 3), // Closing double quotes
0xa0 => (0xc380, 2), // uppercase A, grave accent
0xa1 => (0xc382, 2), // uppercase A, circumflex
0xa2 => (0xc387, 2), // uppercase C with cedilla
0xa3 => (0xc388, 2), // uppercase E, grave accent
0xa4 => (0xc38a, 2), // uppercase E, circumflex
0xa5 => (0xc38b, 2), // capital letter E with diaeresis
0xa6 => (0xc3ab, 2), // lowercase letter e with diaeresis
0xa7 => (0xc38e, 2), // uppercase I, circumflex
0xa8 => (0xc38f, 2), // uppercase I, with diaeresis
0xa9 => (0xc3af, 2), // lowercase i, with diaeresis
0xaa => (0xc394, 2), // uppercase O, circumflex
0xab => (0xc399, 2), // uppercase U, grave accent
0xac => (0xc3b9, 2), // lowercase u, grave accent
0xad => (0xc39b, 2), // uppercase U, circumflex
0xae => (0xc2ab, 2), // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
0xaf => (0xc2bb, 2), // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK

// THIS BLOCK INCLUDES THE 32 EXTENDED (TWO-BYTE) LINE 21 CHARACTERS
// THAT COME FROM HI BYTE=0x13 AND LOW BETWEEN 0x20 AND 0x3F
0xb0 => 0xc3 as char, // Uppercase A, tilde
0xb1 => 0xe3 as char, // Lowercase a, tilde
0xb2 => 0xcd as char, // Uppercase I, acute accent
0xb3 => 0xcc as char, // Uppercase I, grave accent
0xb4 => 0xec as char, // Lowercase i, grave accent
0xb5 => 0xd2 as char, // Uppercase O, grave accent
0xb6 => 0xf2 as char, // Lowercase o, grave accent
0xb7 => 0xd5 as char, // Uppercase O, tilde
0xb8 => 0xf5 as char, // Lowercase o, tilde
0xb9 => 0x7b as char, // Open curly brace
0xba => 0x7d as char, // Closing curly brace
0xbb => 0x5c as char, // Backslash
0xbc => 0x5e as char, // Caret
0xbd => 0x5f as char, // Underscore
0xbe => 0xa6 as char, // Pipe (broken bar)
0xbf => 0x7e as char, // Tilde
0xc0 => 0xc4 as char, // Uppercase A, umlaut
0xc1 => 0xe3 as char, // Lowercase A, umlaut
0xc2 => 0xd6 as char, // Uppercase O, umlaut
0xc3 => 0xf6 as char, // Lowercase o, umlaut
0xc4 => 0xdf as char, // Eszett (sharp S)
0xc5 => 0xa5 as char, // Yen symbol
0xc6 => 0xa4 as char, // Currency symbol
0xc7 => 0x7c as char, // Vertical bar
0xc8 => 0xc5 as char, // Uppercase A, ring
0xc9 => 0xe5 as char, // Lowercase A, ring
0xca => 0xd8 as char, // Uppercase O, slash
0xcb => 0xf8 as char, // Lowercase o, slash
0xcc => '⌜', // Top left corner
0xcd => '⌝', // Top right corner
0xce => '⌞', // Bottom left corner
0xcf => '⌟', // Bottom right corner
_ => UNAVAILABLE_CHAR as char, // For those that don't have representation
// I'll do it eventually, I promise
// This are weird chars anyway
0xb0 => (0xc383, 2), // Uppercase A, tilde
0xb1 => (0xc3a3, 2), // Lowercase a, tilde
0xb2 => (0xc38d, 2), // Uppercase I, acute accent
0xb3 => (0xc38c, 2), // Uppercase I, grave accent
0xb4 => (0xc3ac, 2), // Lowercase i, grave accent
0xb5 => (0xc392, 2), // Uppercase O, grave accent
0xb6 => (0xc3b2, 2), // Lowercase o, grave accent
0xb7 => (0xc395, 2), // Uppercase O, tilde
0xb8 => (0xc3b5, 2), // Lowercase o, tilde
0xb9 => (0x7b, 1), // Open curly brace
0xba => (0x7d, 1), // Closing curly brace
0xbb => (0x5c, 1), // Backslash
0xbc => (0x5e, 1), // Caret
0xbd => (0x5f, 1), // Underscore
0xbe => (0xc2a6, 2), // Pipe (broken bar)
0xbf => (0x7e, 1), // Tilde
0xc0 => (0xc384, 2), // Uppercase A, umlaut
0xc1 => (0xc3a4, 2), // Lowercase A, umlaut
0xc2 => (0xc396, 2), // Uppercase O, umlaut
0xc3 => (0xc3b6, 2), // Lowercase o, umlaut
0xc4 => (0xc39f, 2), // Esszett (sharp S)
0xc5 => (0xc2a5, 2), // Yen symbol
0xc6 => (0xc2a4, 2), // Currency symbol
0xc7 => (0x7c, 1), // Vertical bar
0xc8 => (0xc385, 2), // Uppercase A, ring
0xc9 => (0xc3a5, 2), // Lowercase A, ring
0xca => (0xc398, 2), // Uppercase O, slash
0xcb => (0xc3b8, 2), // Lowercase o, slash
0xcc => (0xe28c9c, 3), // Top left corner
0xcd => (0xe28c9d, 3), // Top right corner
0xce => (0xe28c9e, 3), // Bottom left corner
0xcf => (0xe28c9f, 3), // Bottom right corner
_ => (b'?' as u32, 1), // I'll do it eventually, I promise
// This are weird chars anyway
}
}
}
pub fn line21_to_char(c: Line21Char) -> char {
let (utf8_packed, byte_count) = line21_to_utf8(c);

// Extract bytes and create a UTF-8 string
let mut bytes = Vec::new();
match byte_count {
1 => bytes.push(utf8_packed as u8),
2 => {
bytes.push((utf8_packed >> 8) as u8);
bytes.push(utf8_packed as u8);
}
3 => {
bytes.push((utf8_packed >> 16) as u8);
bytes.push((utf8_packed >> 8) as u8);
bytes.push(utf8_packed as u8);
}
_ => return '?', // Invalid byte count
}

fn line21_to_ucs2(c: Line21Char) -> Ucs2Char {
// Convert UTF-8 bytes to char
match std::str::from_utf8(&bytes) {
Ok(s) => s.chars().next().unwrap_or('?'),
Err(_) => '?',
}
}
pub fn line21_to_ucs2(c: Line21Char) -> Ucs2Char {
match c {
0x7f => 0x25A0, // Solid block
0x84 => 0x2122, // Trademark symbol (TM)
Expand Down Expand Up @@ -905,7 +932,7 @@ fn ucs2_to_line21(c: Ucs2Char) -> Line21Char {
}
}

fn ucs2_to_latin1(c: Ucs2Char) -> Latin1Char {
pub fn ucs2_to_latin1(c: Ucs2Char) -> Latin1Char {
// Code points 0 to U+00FF are the same in both.
if c < 0xff {
c as u8
Expand Down Expand Up @@ -1004,6 +1031,6 @@ fn ucs2_to_char(c: Ucs2Char) -> char {
char::from_u32(x).unwrap_or(UNAVAILABLE_CHAR.into())
}

fn char_to_ucs2(c: char) -> Ucs2Char {
pub fn char_to_ucs2(c: char) -> Ucs2Char {
(c as u32).try_into().unwrap_or(UNAVAILABLE_CHAR.into())
}
Loading
Loading