Skip to content

Commit 6367aa6

Browse files
committed
AGL support and fix for bad CMaps
1 parent 7a78780 commit 6367aa6

File tree

4 files changed

+125
-26
lines changed

4 files changed

+125
-26
lines changed

src/PDFontTables.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,17 @@ const GlyphName_to_ZAPEncoding = reverse_dict(ZAPEncoding_to_GlyphName)
5555

5656
using AdobeGlyphList
5757

58-
function agl_mapping_to_dict(m)
58+
function agl_mapping_to_dict(m; fn=false)
5959
dict = Dict{CosName, Char}()
60-
map((@view m[:,1]), (@view m[:,2])) do x, y
60+
v1, v2 = fn ? (2, 1) : (1, 2)
61+
map((@view m[:,v1]), (@view m[:,v2])) do x, y
6162
dict[CosName(strip(x))] = y
6263
end
6364
return dict
6465
end
6566

6667
const AGL_Glyph_to_Unicode = agl_mapping_to_dict(agl())
68+
const AGLFN_Glyph_to_Unicode = agl_mapping_to_dict(aglfn(), fn=true)
6769
const AGL_ZAP_to_Unicode = agl_mapping_to_dict(zapfdingbats())
6870
const AGL_Unicode_to_Glyph = reverse_dict(AGL_Glyph_to_Unicode)
6971
const AGL_Unicode_to_ZAP = reverse_dict(AGL_ZAP_to_Unicode)

src/PDFonts.jl

Lines changed: 99 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ function show(io::IO, cmap::CMap)
4545
end
4646

4747

48-
const FontUnicodeMapping = Union{Dict{UInt8, Char}, CMap, Nothing}
48+
const FontUnicodeMapping = Union{Dict{UInt8, Vector{Char}}, CMap, Nothing}
4949

5050
#=
5151
mutable struct FontUnicodeMapping
@@ -56,7 +56,56 @@ mutable struct FontUnicodeMapping
5656
end
5757
=#
5858

59-
function merge_encoding!(fum::Dict{UInt8, Char}, encoding::CosName,
59+
"""
    merge!(fum::Dict{UInt8, Vector{Char}}, enc::Dict{UInt8, Char})

Copy every entry of the single-character encoding table `enc` into the
multi-character mapping `fum`, wrapping each `Char` in a one-element vector.
Existing keys in `fum` are overwritten.

Returns `fum`, as every other `merge!` method does, so the call can be chained
or used as the final expression of a function.
"""
function Base.merge!(fum::Dict{UInt8, Vector{Char}}, enc::Dict{UInt8, Char})
    # NOTE(review): both argument types and the function belong to Base, so this
    # method is type piracy. A package-local helper would be safer - confirm
    # whether any caller depends on the generic `merge!` name before renaming.
    for (k, v) in enc
        fum[k] = [v]
    end
    return fum
end
64+
65+
"""
    get_agl_unicode(g::AbstractString) -> Union{Vector{Char}, Char}

Map a single glyph-name component `g` to Unicode.

The algorithmic name forms are tried first:

- `uni` followed by one or more groups of exactly four hexadecimal digits:
  each group is one character. The result is a `Vector{Char}`. If any group
  lies in the surrogate range `D800`-`DFFF` the whole name is rejected.
- `u` followed by four to six hexadecimal digits: a single code point. The
  result is a `Char`, provided the value is a valid scalar value
  (`0000`-`D7FF` or `E000`-`10FFFF`).

Any name that does not match, or is rejected by, these forms is looked up in
`AGL_Glyph_to_Unicode` and then `AGLFN_Glyph_to_Unicode`. `zero(Char)` is
returned when the name is unknown to both tables.
"""
function get_agl_unicode(g::AbstractString)::Union{Vector{Char}, Char}
    # Anchored at both ends so that an ordinary name which merely contains
    # "u" followed by hex digits is not mistaken for an algorithmic name.
    r = r"^(?:uni(?'uni'(?:[[:xdigit:]]{4})+)|u(?'u'[[:xdigit:]]{4,6}))$"
    m = match(r, g)
    if m !== nothing
        uni, u = m["uni"], m["u"]
        if uni !== nothing
            l = length(uni)     # hex digits are ASCII: length == ncodeunits
            ret = Char[]
            for i in 1:4:l
                c = parse(UInt16, SubString(uni, i, i + 3), base=16)
                # Surrogate halves are not characters; reject the name.
                0xD800 <= c <= 0xDFFF && break
                push!(ret, Char(c))
            end
            # Only accept the result when every group was converted.
            length(ret) * 4 == l && return ret
        else
            c = parse(UInt32, u, base=16)
            # Valid scalar value: below the surrogates OR above them.
            (c <= 0xD7FF || 0xE000 <= c <= 0x10FFFF) && return Char(c)
        end
    end
    cg = CosName(g)
    return get(AGL_Glyph_to_Unicode, cg, get(AGLFN_Glyph_to_Unicode, cg, zero(Char)))
end
89+
90+
"""
    get_unicodes_from_glyph_name(s::String) -> Vector{Char}

Translate a complete glyph name into the characters it stands for.

Everything from the first `.` onwards is dropped. The remaining base name is
split on `_` and each component is resolved with `get_agl_unicode`; the
results are concatenated in order. When the base name is empty (for example
`".notdef"` or `""`) a one-element vector holding `zero(Char)` is returned.
"""
function get_unicodes_from_glyph_name(s::String)
    base = first(split(s, "."))
    isempty(base) && return [zero(Char)]
    chars = Char[]
    foreach(split(base, "_")) do component
        append!(chars, get_agl_unicode(component))
    end
    return chars
end
101+
102+
"""
    merge_agl!(fum::Dict{UInt8, Vector{Char}}, d::Dict{UInt8, CosName})

For every `code => glyph name` pair in `d`, store in `fum[code]` the
characters produced by `get_unicodes_from_glyph_name` for that glyph name.
Existing entries of `fum` with the same code are overwritten.

Returns `fum`, matching the `merge_encoding!` methods that call it.
"""
function merge_agl!(fum::Dict{UInt8, Vector{Char}}, d::Dict{UInt8, CosName})
    for (k, v) in d
        fum[k] = get_unicodes_from_glyph_name(String(v))
    end
    return fum
end
107+
108+
function merge_encoding!(fum::Dict{UInt8, Vector{Char}}, encoding::CosName,
60109
doc::CosDoc, font::IDDRef{CosDict})
61110
encoding_mapping =
62111
encoding == cn"WinAnsiEncoding" ? WINEncoding_to_Unicode :
@@ -82,10 +131,12 @@ function FontType(subtype::CosName)
82131
return FontDefType()
83132
end
84133

134+
# Entry point if someone wants to handle encoding based on subtype
135+
# By default maps to the default font unicode mapping.
85136
merge_encoding!(fum::FontUnicodeMapping, ftype::FontType,
86137
doc::CosDoc, font::IDDRef{CosDict}) = fum
87138

88-
function merge_encoding!(fum::Dict{UInt8, Char},
139+
function merge_encoding!(fum::Dict{UInt8, Vector{Char}},
89140
ftype::Union{FontType1, FontMMType1},
90141
doc::CosDoc, font::IDDRef{CosDict})
91142
basefont = cosDocGetObject(doc, font, cn"BaseFont")
@@ -104,14 +155,14 @@ end
104155
# Reading the encoding from the font files in the case of Symbolic fonts is not
105156
# supported.
106157
# Font subset is addressed with font name identification.
107-
function merge_encoding!(fum::Dict{UInt8, Char}, encoding::CosNullType,
158+
function merge_encoding!(fum::Dict{UInt8, Vector{Char}}, encoding::CosNullType,
108159
doc::CosDoc, font::IDDRef{CosDict})
109160
subtype = cosDocGetObject(doc, font, cn"Subtype")
110161
subtype === CosNull && return fum
111162
return merge_encoding!(fum, FontType(subtype), doc, font)
112163
end
113164

114-
function merge_encoding!(fum::Dict{UInt8, Char},
165+
function merge_encoding!(fum::Dict{UInt8, Vector{Char}},
115166
encoding::IDD{CosDict},
116167
doc::CosDoc, font::IDDRef{CosDict})
117168
baseenc = cosDocGetObject(doc, encoding, cn"BaseEncoding")
@@ -133,8 +184,7 @@ function merge_encoding!(fum::Dict{UInt8, Char},
133184
end
134185
end
135186

136-
dict_to_unicode = dict_remap(d, AGL_Glyph_to_Unicode)
137-
merge!(fum, dict_to_unicode)
187+
merge_agl!(fum, d)
138188
return fum
139189
end
140190

@@ -143,7 +193,7 @@ function get_unicode_mapping(doc::CosDoc, font::IDDRef{CosDict})
143193
toUnicode !== CosNull &&
144194
return get_unicode_mapping(toUnicode)
145195
encoding = cosDocGetObject(doc, font, cn"Encoding")
146-
d = merge_encoding!(Dict{UInt8, Char}(), encoding, doc, font)
196+
d = merge_encoding!(Dict{UInt8, Vector{Char}}(), encoding, doc, font)
147197
return length(d) == 0 ? nothing : d
148198
end
149199

@@ -218,11 +268,11 @@ function get_glyph_id_mapping(cosdoc::CosDoc, cosfont::IDD{CosDict})
218268
return glyph_name_to_cid, cid_to_glyph_name
219269
end
220270

221-
get_encoded_string(s::CosString, fum::Union{Dict{UInt8, Char}, CMap}) =
271+
get_encoded_string(s::CosString, fum::FontUnicodeMapping) =
222272
get_encoded_string(Vector{UInt8}(s), fum)
223273

224274
function get_encoded_string(v::Union{Vector{UInt8}, NTuple{N, UInt8}},
225-
fum::Dict{UInt8, Char}) where N
275+
fum::Dict{UInt8, Vector{Char}}) where N
226276
length(v) == 0 && return ""
227277
return String(NativeEncodingToUnicode(v, fum))
228278
end
@@ -334,6 +384,15 @@ cmap_command(b::Vector{UInt8}) =
334384
length(b), b != beginbfchar && b != beginbfrange && b != begincodespacerange ?
335385
nothing : Symbol(String(b))
336386

387+
# Return a new CosXString built from `obj` with `offset` added to the 16-bit
# big-endian value held in its first two bytes; any further bytes are kept.
# The UInt16/UInt8 conversions throw if the sum does not fit in 16 bits.
# NOTE(review): assumes `Vector{UInt8}(obj)` yields the decoded bytes and that
# there are at least two of them - confirm against CosXString.
function _offset(obj::CosXString, offset)
    bytes = Vector{UInt8}(obj)
    shifted = UInt16(bytes[1]*256 + bytes[2] + offset)
    hi, lo = divrem(shifted, 256)
    bytes[1] = UInt8(hi)
    bytes[2] = UInt8(lo)
    # CosXString is constructed from the hexadecimal text of the bytes.
    buf = IOBuffer()
    bytes2hex(buf, bytes)
    return CosXString(take!(buf))
end
395+
337396
function on_cmap_command!(stm::IO, command::Symbol,
338397
params::Vector{CosInt}, cmap::CMap)
339398
n = get(pop!(params))
@@ -352,17 +411,36 @@ function on_cmap_command!(stm::IO, command::Symbol,
352411
if l == 1
353412
cmap.range_map[Interval(d1[1], d2[1])] = o3
354413
else
355-
imap = get!(cmap.range_map, Interval(d1[1], d2[1]),
356-
IntervalTree{UInt8, CosObject}())
357-
imap[Interval(d1[2], d2[2])] = o3
414+
if d1[2] <= d2[2]
415+
imap = get!(cmap.range_map, Interval(d1[1], d2[1]),
416+
IntervalTree{UInt8, CosObject}())
417+
imap[Interval(d1[2], d2[2])] = o3
418+
else
419+
@warn "Corrupt CMap file. Repairing... Some encodings may not map properly."
420+
imap = get!(cmap.range_map, Interval(d1[1], d1[1]),
421+
IntervalTree{UInt8, CosObject}())
422+
imap[Interval(d1[2], 0xff)] = o3
423+
o3 = _offset(o3, 0xff - d1[2] + 1)
424+
425+
if d2[1] - d1[1] > 1
426+
i1, i2 = d1[1]+0x1, d2[1]-0x1
427+
imap = get!(cmap.range_map, Interval(i1, i2),
428+
IntervalTree{UInt8, CosObject}())
429+
imap[Interval(0x00, 0xff)] = o3
430+
o3 = _offset(o3, (d2[1] - d1[1] - 1)*0x100)
431+
end
432+
imap = get!(cmap.range_map, Interval(d2[1], d2[1]),
433+
IntervalTree{UInt8, CosObject}())
434+
imap[Interval(0x00, d2[2])] = o3
435+
end
358436
end
359437
else
360438
l = length(d1)
361439
@assert (d1[1] <= d2[1]) E_INVALID_CODESPACERANGE
362440
if l == 1
363441
cmap.code_space[Interval(d1[1], d2[1])] = CosNull
364442
else
365-
if d1[2] < d2[2]
443+
if d1[2] <= d2[2]
366444
imap = IntervalTree{UInt8, CosNullType}()
367445
imap[Interval(d1[2], d2[2])] = CosNull
368446
cmap.code_space[Interval(d1[1], d2[1])] = imap
@@ -372,14 +450,16 @@ function on_cmap_command!(stm::IO, command::Symbol,
372450
imap[Interval(d1[2], 0xff)] = CosNull
373451
cmap.code_space[Interval(d1[1], d1[1])] = imap
374452

375-
imap = IntervalTree{UInt8, CosNullType}()
453+
imap = get!(cmap.code_space, Interval(d1[1], d1[1]), IntervalTree{UInt8, CosNullType}())
454+
imap[Interval(d1[2], 0xff)] = CosNull
455+
456+
imap = get!(cmap.code_space, Interval(d2[1], d2[1]), IntervalTree{UInt8, CosNullType}())
376457
imap[Interval(0x00, d2[2])] = CosNull
377-
cmap.code_space[Interval(d2[1], d2[1])] = imap
378458

379459
if d2[1] - d1[1] > 1
380-
imap = IntervalTree{UInt8, CosNullType}()
381-
imap[Interval(0x00, 0xff)] = CosNull
382-
cmap.code_space[Interval(d1[1]+1, d2[1]-1)] = imap
460+
i1, i2 = d1[1]+0x1, d2[1]-0x1
461+
imap = get!(cmap.code_space, Interval(i1, i2), IntervalTree{UInt8, CosNullType}())
462+
imap[Interval(0x00, 0xff)] = CosNull
383463
end
384464
end
385465
end

src/Utils.jl

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,9 @@ end
8080
const Unicode_to_PDFEncoding = reverse_dict(PDFEncoding_to_Unicode)
8181

8282
function NativeEncodingToUnicode(barr, mapping::Dict)
83-
l = length(barr)
84-
carr = Vector{Char}(undef, l)
85-
for i = 1:l
86-
carr[i] = get(mapping, barr[i], zero(Char))
83+
carr = Vector{Char}()
84+
for b in barr
85+
append!(carr, get(mapping, b, zero(Char)))
8786
end
8887
return carr
8988
end

test/runtests.jl

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ using PDFIO.Common: read_pkcs12
1414

1515
include("debugIO.jl")
1616

17-
pdftest_ver = "0.0.10"
17+
pdftest_ver = "0.0.11"
1818
pdftest_link = "https://github.com/sambitdash/PDFTest/archive/v"*pdftest_ver
1919

2020
zipfile = "pdftest-"*pdftest_ver
@@ -436,6 +436,24 @@ local_files(filename, filesdir="files") = joinpath(@__DIR__, pdftest_dir, filesd
436436
@test all(get_encoded_string(UInt8[0x01, 0x08], cmap).== [Char(0x0110)])
437437
all(get_encoded_string(UInt8[0x00, 0xfb, 0x00, 0xfe], cmap) .== [Char(0x0106), Char(0x010D)])
438438
end
439+
440+
@test begin
441+
filename="bad1.cmap"
442+
DEBUG && println(filename)
443+
path = local_files(filename)
444+
io = util_open(path, "r")
445+
cmap = nothing
446+
try
447+
cmap = read_cmap(io)
448+
finally
449+
util_close(io)
450+
end
451+
@test cmap !== nothing
452+
453+
@test all(get_encoded_string(UInt8[0x00, 0xff], cmap).== [Char(0x013F)])
454+
@test all(get_encoded_string(UInt8[0x03, 0x00], cmap).== [Char(0x0340)])
455+
all(get_encoded_string(UInt8[0x00, 0xfb, 0x00, 0xfe], cmap) .== [Char(0x013B), Char(0x013E)])
456+
end
439457
end
440458

441459
@testset "Corrupt File" begin

0 commit comments

Comments
 (0)