Skip to content

Commit a812f03

Browse files
authored
Define textwidth for overlong chars (#58602)
Previously, this would error. There is no guarantee of how terminals render overlong UTF-8 encodings: some terminals do not print them at all, and some print "�". Here, we conservatively set a textwidth of 1. Refs #58593
1 parent 7865349 commit a812f03

File tree

2 files changed

+9
-9
lines changed

2 files changed

+9
-9
lines changed

base/strings/unicode.jl

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ module Unicode
66
import Base: show, ==, hash, string, Symbol, isless, length, eltype,
77
convert, isvalid, ismalformed, isoverlong, iterate,
88
AnnotatedString, AnnotatedChar, annotated_chartransform,
9-
@assume_effects, annotations
9+
@assume_effects, annotations, is_overlong_enc
1010

1111
# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
1212

@@ -262,17 +262,15 @@ julia> textwidth('⛵')
262262
2
263263
```
264264
"""
265-
function textwidth(c::AbstractChar)
266-
ismalformed(c) && return 1
267-
i = codepoint(c)
268-
i < 0x7f && return Int(i >= 0x20) # ASCII fast path
269-
Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i))
270-
end
265+
textwidth(c::AbstractChar) = textwidth(Char(c)::Char)
271266

272267
function textwidth(c::Char)
273-
b = bswap(reinterpret(UInt32, c)) # from isascii(c)
268+
u = reinterpret(UInt32, c)
269+
b = bswap(u) # from isascii(c)
274270
b < 0x7f && return Int(b >= 0x20) # ASCII fast path
275-
ismalformed(c) && return 1
271+
# We can't know a priori how terminals will render invalid UTF8 chars,
272+
# so we conservatively decide a width of 1.
273+
(ismalformed(c) || is_overlong_enc(u)) && return 1
276274
Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
277275
end
278276

test/strings/util.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ SubStr(s) = SubString("abc$(s)de", firstindex(s) + 3, lastindex(s) + 3)
88
@test textwidth(c^3) == w*3
99
@test w == @invoke textwidth(c::AbstractChar)
1010
end
11+
@test textwidth('\xc0\xa0') == 1 # overlong
12+
@test textwidth('\xf0\x80\x80') == 1 # malformed
1113
for i in 0x00:0x7f # test all ASCII chars (which have fast path)
1214
w = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i))
1315
c = Char(i)

0 commit comments

Comments
 (0)