Skip to content

Commit ff16f3f

Browse files
authored
Refine linebreak algorithm for better Chinese justification (typst#701)
1 parent 03d2ec9 commit ff16f3f

File tree

8 files changed

+187
-63
lines changed

8 files changed

+187
-63
lines changed

library/src/layout/par.rs

Lines changed: 83 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -457,22 +457,35 @@ impl<'a> Line<'a> {
457457
self.items().skip(start).take(end - start)
458458
}
459459

460-
/// How many justifiable glyphs the line contains.
460+
/// How many glyphs are in the text where we can insert additional
461+
/// space when encountering underfull lines.
461462
fn justifiables(&self) -> usize {
462463
let mut count = 0;
463464
for shaped in self.items().filter_map(Item::text) {
464465
count += shaped.justifiables();
465466
}
467+
// CJK character at line end should not be adjusted.
468+
if self
469+
.items()
470+
.last()
471+
.and_then(Item::text)
472+
.map(|s| s.cjk_justifiable_at_last())
473+
.unwrap_or(false)
474+
{
475+
count -= 1;
476+
}
477+
466478
count
467479
}
468480

469-
/// How much of the line is stretchable spaces.
470-
fn stretch(&self) -> Abs {
471-
let mut stretch = Abs::zero();
472-
for shaped in self.items().filter_map(Item::text) {
473-
stretch += shaped.stretch();
474-
}
475-
stretch
481+
/// How much the line can stretch.
482+
fn stretchability(&self) -> Abs {
483+
self.items().filter_map(Item::text).map(|s| s.stretchability()).sum()
484+
}
485+
486+
/// How much the line can shrink.
487+
fn shrinkability(&self) -> Abs {
488+
self.items().filter_map(Item::text).map(|s| s.shrinkability()).sum()
476489
}
477490

478491
/// The sum of fractions in the line.
@@ -835,10 +848,9 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
835848

836849
// Cost parameters.
837850
const HYPH_COST: Cost = 0.5;
838-
const CONSECUTIVE_DASH_COST: Cost = 30.0;
851+
const CONSECUTIVE_DASH_COST: Cost = 300.0;
839852
const MAX_COST: Cost = 1_000_000.0;
840-
const MIN_COST: Cost = -MAX_COST;
841-
const MIN_RATIO: f64 = -0.15;
853+
const MIN_RATIO: f64 = -1.0;
842854

843855
// Dynamic programming table.
844856
let mut active = 0;
@@ -864,14 +876,31 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
864876
// Determine how much the line's spaces would need to be stretched
865877
// to make it the desired width.
866878
let delta = width - attempt.width;
867-
let mut ratio = delta / attempt.stretch();
879+
// Determine how much stretch or shrink is permitted.
880+
let adjust = if delta >= Abs::zero() {
881+
attempt.stretchability()
882+
} else {
883+
attempt.shrinkability()
884+
};
885+
// Ideally, the ratio should be between -1.0 and 1.0, but sometimes a value above 1.0
886+
// is possible, in which case the line is underfull.
887+
let mut ratio = delta / adjust;
888+
if ratio.is_nan() {
889+
// The line is not stretchable, but it just fits.
890+
// This often happens with monospace fonts and CJK texts.
891+
ratio = 0.0;
892+
}
868893
if ratio.is_infinite() {
894+
// The line's not stretchable, we calculate the ratio in another way...
869895
ratio = delta / (em / 2.0);
896+
// ...and because it is underfull/overfull, make sure the ratio is at least 1.0.
897+
if ratio > 0.0 {
898+
ratio += 1.0;
899+
} else {
900+
ratio -= 1.0;
901+
}
870902
}
871903

872-
// At some point, it doesn't matter any more.
873-
ratio = ratio.min(10.0);
874-
875904
// Determine the cost of the line.
876905
let min_ratio = if attempt.justify { MIN_RATIO } else { 0.0 };
877906
let mut cost = if ratio < min_ratio {
@@ -883,11 +912,15 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
883912
active = i + 1;
884913
MAX_COST
885914
} else if mandatory || eof {
886-
// This is a mandatory break and the line is not overfull, so it
887-
// has minimum cost. All breakpoints before this one become
888-
// inactive since no line can span above the mandatory break.
915+
// This is a mandatory break and the line is not overfull, so
916+
// all breakpoints before this one become inactive since no line
917+
// can span above the mandatory break.
889918
active = k;
890-
MIN_COST + if attempt.justify { ratio.powi(3).abs() } else { 0.0 }
919+
if attempt.justify {
920+
ratio.powi(3).abs()
921+
} else {
922+
0.0
923+
}
891924
} else {
892925
// Normal line with cost of |ratio^3|.
893926
ratio.powi(3).abs()
@@ -898,6 +931,12 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
898931
cost += HYPH_COST;
899932
}
900933

934+
// In the Knuth-Plass paper, cost = (1 + 100|r|^3 + p)^2 + a,
935+
// where r is the ratio, p=50 is the penalty, and a=3000 is the consecutive penalty.
936+
// We divide the whole formula by 10, resulting in (0.01 + |r|^3 + p)^2 + a,
937+
// where p=0.5 and a=300
938+
cost = (0.01 + cost).powi(2);
939+
901940
// Penalize two consecutive dashes (not necessarily hyphens) extra.
902941
if attempt.dash && pred.line.dash {
903942
cost += CONSECUTIVE_DASH_COST;
@@ -1233,13 +1272,32 @@ fn commit(
12331272
}
12341273
}
12351274

1236-
// Determine how much to justify each space.
1275+
// Determine how much additional space is needed.
1276+
// The justification_ratio is for the first step justification,
1277+
// extra_justification is for the last step.
1278+
// For more info on multi-step justification, see Procedures for Inter-
1279+
// Character Space Expansion in W3C document Chinese Layout Requirements.
12371280
let fr = line.fr();
1238-
let mut justification = Abs::zero();
1239-
if remaining < Abs::zero() || (line.justify && fr.is_zero()) {
1281+
let mut justification_ratio = 0.0;
1282+
let mut extra_justification = Abs::zero();
1283+
1284+
let shrink = line.shrinkability();
1285+
let stretch = line.stretchability();
1286+
if remaining < Abs::zero() && shrink > Abs::zero() {
1287+
// Attempt to reduce the length of the line, using shrinkability.
1288+
justification_ratio = (remaining / shrink).max(-1.0);
1289+
remaining = (remaining + shrink).min(Abs::zero());
1290+
} else if line.justify && fr.is_zero() {
1291+
// Attempt to increase the length of the line, using stretchability.
1292+
if stretch > Abs::zero() {
1293+
justification_ratio = (remaining / stretch).min(1.0);
1294+
remaining = (remaining - stretch).max(Abs::zero());
1295+
}
1296+
12401297
let justifiables = line.justifiables();
1241-
if justifiables > 0 {
1242-
justification = remaining / justifiables as f64;
1298+
if justifiables > 0 && remaining > Abs::zero() {
1299+
// Underfull line, distribute the extra space.
1300+
extra_justification = remaining / justifiables as f64;
12431301
remaining = Abs::zero();
12441302
}
12451303
}
@@ -1275,7 +1333,7 @@ fn commit(
12751333
}
12761334
}
12771335
Item::Text(shaped) => {
1278-
let frame = shaped.build(vt, justification);
1336+
let frame = shaped.build(vt, justification_ratio, extra_justification);
12791337
push(&mut offset, frame);
12801338
}
12811339
Item::Frame(frame) => {

library/src/text/shaping.rs

Lines changed: 81 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -70,22 +70,42 @@ impl ShapedGlyph {
7070
}
7171

7272
/// Whether the glyph is justifiable.
73-
///
74-
/// Typst's basic justification strategy is to stretch all the spaces
75-
/// in a line until the line fills the available width. However, some
76-
/// scripts (notably Chinese and Japanese) don't use spaces.
77-
///
78-
/// In Japanese typography, the convention is to insert space evenly
79-
/// between all glyphs. I assume it's the same in Chinese.
8073
pub fn is_justifiable(&self) -> bool {
81-
self.is_space() || is_spaceless(self.c.script())
74+
self.is_space() || self.is_cjk() || self.is_cjk_punctuation()
75+
}
76+
77+
pub fn is_cjk(&self) -> bool {
78+
use Script::*;
79+
matches!(self.c.script(), Hiragana | Katakana | Han)
80+
}
81+
82+
pub fn is_cjk_punctuation(&self) -> bool {
83+
matches!(self.c, ',' | '。' | '、' | ':' | ';')
84+
}
85+
86+
/// The stretchability of the character.
87+
pub fn stretchability(&self) -> Em {
88+
let width = self.x_advance;
89+
if self.is_space() {
90+
// The number for spaces is from Knuth-Plass' paper
91+
width / 2.0
92+
} else {
93+
Em::zero()
94+
}
8295
}
83-
}
8496

85-
/// Does this script separate its words using spaces?
86-
fn is_spaceless(script: Script) -> bool {
87-
use Script::*;
88-
matches!(script, Hiragana | Katakana | Han)
97+
/// The shrinkability of the character.
98+
pub fn shrinkability(&self) -> Em {
99+
let width = self.x_advance;
100+
if self.is_space() {
101+
// The number for spaces is from Knuth-Plass' paper
102+
width / 3.0
103+
} else if self.is_cjk_punctuation() {
104+
width / 2.0
105+
} else {
106+
Em::zero()
107+
}
108+
}
89109
}
90110

91111
/// A side you can go toward.
@@ -101,7 +121,12 @@ impl<'a> ShapedText<'a> {
101121
///
102122
/// The `justification` defines how much extra advance width each
103123
/// [justifiable glyph](ShapedGlyph::is_justifiable) will get.
104-
pub fn build(&self, vt: &Vt, justification: Abs) -> Frame {
124+
pub fn build(
125+
&self,
126+
vt: &Vt,
127+
justification_ratio: f64,
128+
extra_justification: Abs,
129+
) -> Frame {
105130
let (top, bottom) = self.measure(vt);
106131
let size = Size::new(self.width, top + bottom);
107132

@@ -120,19 +145,25 @@ impl<'a> ShapedText<'a> {
120145
let pos = Point::new(offset, top + shift - y_offset.at(self.size));
121146
let glyphs = group
122147
.iter()
123-
.map(|glyph| Glyph {
124-
id: glyph.glyph_id,
125-
x_advance: glyph.x_advance
126-
+ if glyph.is_justifiable() {
127-
frame.size_mut().x += justification;
128-
Em::from_length(justification, self.size)
129-
} else {
130-
Em::zero()
131-
},
132-
x_offset: glyph.x_offset,
133-
c: glyph.c,
134-
span: glyph.span,
135-
offset: glyph.offset,
148+
.map(|glyph| {
149+
let mut justification = Em::zero();
150+
if justification_ratio < 0.0 {
151+
justification += glyph.shrinkability() * justification_ratio
152+
} else {
153+
justification += glyph.stretchability() * justification_ratio
154+
}
155+
if glyph.is_justifiable() {
156+
justification += Em::from_length(extra_justification, self.size)
157+
}
158+
frame.size_mut().x += justification.at(self.size);
159+
Glyph {
160+
id: glyph.glyph_id,
161+
x_advance: glyph.x_advance + justification,
162+
x_offset: glyph.x_offset,
163+
c: glyph.c,
164+
span: glyph.span,
165+
offset: glyph.offset,
166+
}
136167
})
137168
.collect();
138169

@@ -200,17 +231,35 @@ impl<'a> ShapedText<'a> {
200231
(top, bottom)
201232
}
202233

203-
/// How many justifiable glyphs the text contains.
234+
/// How many glyphs are in the text where we can insert additional
235+
/// space when encountering underfull lines.
204236
pub fn justifiables(&self) -> usize {
205237
self.glyphs.iter().filter(|g| g.is_justifiable()).count()
206238
}
207239

208-
/// The width of the spaces in the text.
209-
pub fn stretch(&self) -> Abs {
240+
/// Whether the last glyph is a CJK character which should not be justified
241+
/// on line end.
242+
pub fn cjk_justifiable_at_last(&self) -> bool {
243+
self.glyphs
244+
.last()
245+
.map(|g| g.is_cjk() || g.is_cjk_punctuation())
246+
.unwrap_or(false)
247+
}
248+
249+
/// The stretchability of the text.
250+
pub fn stretchability(&self) -> Abs {
251+
self.glyphs
252+
.iter()
253+
.map(|g| g.stretchability())
254+
.sum::<Em>()
255+
.at(self.size)
256+
}
257+
258+
/// The shrinkability of the text
259+
pub fn shrinkability(&self) -> Abs {
210260
self.glyphs
211261
.iter()
212-
.filter(|g| g.is_justifiable())
213-
.map(|g| g.x_advance)
262+
.map(|g| g.shrinkability())
214263
.sum::<Em>()
215264
.at(self.size)
216265
}

tests/ref/layout/par-justify-cjk.png

42.9 KB
Loading

tests/ref/layout/par-justify.png

-16.8 KB
Loading

tests/ref/layout/par-knuth.png

-683 Bytes
Loading

tests/ref/text/linebreak.png

-169 Bytes
Loading

tests/typ/layout/par-justify-cjk.typ

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// Test Chinese text in narrow lines.
2+
3+
// In Chinese typography, line length should be multiples of the character size
4+
// and the line ends should be aligned with each other.
5+
// Most Chinese publications do not use hanging punctuation at line end.
6+
#set page(width: auto)
7+
#set par(justify: true)
8+
#set text(overhang: false, lang: "zh")
9+
10+
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
11+
中文维基百科使用汉字书写,汉字是汉族或华人的共同文字,是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%,而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
12+
]
13+
14+
---
15+
// Japanese typography is more complex, make sure it is at least a bit sensible.
16+
#set page(width: auto)
17+
#set par(justify: true)
18+
#set text(lang: "jp")
19+
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
20+
ウィキペディア(英: Wikipedia)は、世界中のボランティアの共同作業によって執筆及び作成されるフリーの多言語インターネット百科事典である。主に寄付に依って活動している非営利団体「ウィキメディア財団」が所有・運営している。
21+
22+
専門家によるオンライン百科事典プロジェクトNupedia(ヌーペディア)を前身として、2001年1月、ラリー・サンガーとジミー・ウェールズ(英: Jimmy Donal "Jimbo" Wales)により英語でプロジェクトが開始された。
23+
]

tests/typ/layout/par-justify.typ

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,6 @@ D
2020
A B C #linebreak(justify: true)
2121
D E F #linebreak(justify: true)
2222

23-
---
24-
// Test that justificating chinese text is at least a bit sensible.
25-
#set page(width: 200pt)
26-
#set par(justify: true)
27-
中文维基百科使用汉字书写,汉字是汉族或华人的共同文字,是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%,而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
28-
2923
---
3024
// Test that there are no hick-ups with justification enabled and
3125
// basically empty paragraph.

0 commit comments

Comments
 (0)