Skip to content

Commit 0e0e661

Browse files
committed
Type 0 fonts with toUnicode CMap supported
1. Also fixes a bug for multiple content streams per page
1 parent a4fd6d6 commit 0e0e661

File tree

4 files changed

+120
-76
lines changed

4 files changed

+120
-76
lines changed

REQUIRE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ TimeZones
66
Documenter
77
LabelNumerals
88
IntervalTrees
9-
agl_aglfn
9+
StringEncodings

src/PDFontTables.jl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,13 @@ const Glyphname_to_MEXEncoding = reverse_dict(MEXEncoding_to_GlyphName)
5555
const GlyphName_to_SYMEncoding = reverse_dict(SYMEncoding_to_GlyphName)
5656
const GlyphName_to_ZAPEncoding = reverse_dict(ZAPEncoding_to_GlyphName)
5757

58-
if Pkg.installed("agl_aglfn") !== nothing
58+
# To be deleted when AGL gets registered.
5959

60-
using agl_aglfn
60+
if Pkg.installed("AdobeGlyphList") === nothing
61+
Pkg.clone("https://github.com/sambitdash/AdobeGlyphList.jl.git")
62+
end
63+
64+
using AdobeGlyphList
6165

6266
function agl_mapping_to_dict(m)
6367
dict = Dict{CosName, Char}()
@@ -79,5 +83,3 @@ const WINEncoding_to_Unicode = dict_remap(WINEncoding_to_GlyphName, AGL_Glyph_to
7983
const MEXEncoding_to_Unicode = dict_remap(MEXEncoding_to_GlyphName, AGL_Glyph_to_Unicode)
8084
const SYMEncoding_to_Unicode = dict_remap(SYMEncoding_to_GlyphName, AGL_Glyph_to_Unicode)
8185
const ZAPEncoding_to_Unicode = dict_remap(ZAPEncoding_to_GlyphName, AGL_ZAP_to_Unicode)
82-
83-
end

src/PDFonts.jl

Lines changed: 110 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,28 @@ const endbfrange = b"endbfrange"
1515
const begincodespacerange = b"begincodespacerange"
1616
const endcodespacerange = b"endcodespacerange"
1717

18+
1819
"""
    CMap()

Character-code map built while reading a font's CMap stream. A new `CMap`
starts with both interval maps empty.

- `code_space`: keyed by intervals of first bytes. The value is either the
  `CosNull`-typed marker or a nested interval map keyed by a second byte.
- `range_map`: keyed by intervals of first bytes. The value is either the
  mapped `CosObject` or a nested interval map, keyed by a second byte, that
  holds the mapped `CosObject`s.
"""
mutable struct CMap
    code_space::IntervalMap{UInt8, Union{CosNullType, IntervalMap{UInt8, CosNullType}}}
    range_map::IntervalMap{UInt8, Union{CosObject, IntervalMap{UInt8, CosObject}}}
    function CMap()
        code_space =
            IntervalMap{UInt8, Union{CosNullType, IntervalMap{UInt8, CosNullType}}}()
        range_map =
            IntervalMap{UInt8, Union{CosObject, IntervalMap{UInt8, CosObject}}}()
        return new(code_space, range_map)
    end
end
2425

26+
#=
27+
mutable struct CMap
28+
code_space::Array{UInt8,2}
29+
CMap() = new(zeros(UInt8, 256, 3))
30+
end
31+
=#
32+
33+
"""
    show(io::IO, cmap::CMap)

Write a two-section dump of `cmap` to `io`: the `code_space` map under a
"Code Space:" heading, then the `range_map` map under a "Range Map:" heading.
"""
function show(io::IO, cmap::CMap)
    # The headings go through `print`, not `show`: `show` on a String writes
    # its quoted, escaped representation ("Code Space:\n" with the quotes and
    # a literal backslash-n) instead of the text followed by a line break.
    print(io, "Code Space:\n")
    show(io, cmap.code_space)
    # Start the second heading on its own line rather than glued to the end
    # of the code-space dump.
    print(io, "\nRange Map:\n")
    show(io, cmap.range_map)
    return nothing
end
39+
2540
mutable struct FontUnicodeMapping
2641
encoding::Dict
2742
cmap::CMap
@@ -52,7 +67,7 @@ function merge_encoding!(fum::FontUnicodeMapping, encoding::CosNullType,
5267
basefont_with_subset = CDTextString(basefont)
5368
basefont_str = rsplit(basefont_with_subset, '+';limit=2)[end]
5469
enc = (basefont_str == "Symbol") ? SYMEncoding_to_Unicode :
55-
(basefont_str == "ZapfDigbats") ? ZAPEncoding_to_Unicode :
70+
(basefont_str == "ZapfDingbats") ? ZAPEncoding_to_Unicode :
5671
STDEncoding_to_Unicode
5772
merge!(fum.encoding, enc)
5873
return fum
@@ -86,14 +101,15 @@ end
86101
function merge_encoding!(fum::FontUnicodeMapping, doc::CosDoc, font::CosObject)
87102
encoding = cosDocGetObject(doc, font, cn"Encoding")
88103
merge_encoding!(fum, encoding, doc, font)
89-
# toUnicode = cosDocGetObject(doc, font, cn"ToUnicode")
90-
# toUnicode == CosNull && return fum
91-
# merge_encoding!(fum, toUnicode, doc, font)
104+
toUnicode = cosDocGetObject(doc, font, cn"ToUnicode")
105+
toUnicode == CosNull && return fum
106+
merge_encoding!(fum, toUnicode, doc, font)
92107
end
93108

94109
function merge_encoding!(fum::FontUnicodeMapping, cmap::CosIndirectObject{CosStream},
95110
doc::CosDoc, font::CosObject)
96-
fum.toUnicode = read_cmap(get(cmap))
111+
fum.cmap = read_cmap(get(cmap))
112+
fum.hasCMap = true
97113
return fum
98114
end
99115

@@ -105,18 +121,95 @@ function get_encoded_string(s::CosString, fum::FontUnicodeMapping)
105121
return String(carr)
106122
end
107123

124+
"""
    get_unicode_chars(b::UInt8, itv::IntervalValue{UInt8})

Return the characters that code byte `b` maps to, given the range-map entry
`itv` whose interval contains `b`.

The value stored in `itv` is handled in one of two ways:
- `CosXString`: the string is decoded as the mapping of the first code of the
  interval, and the offset `b - first(itv)` is added to its last character.
- `CosArray`: element `b - first(itv) + 1` must be a `CosXString`, which is
  decoded as-is.

Any other stored value fails the `@assert`s.
"""
function get_unicode_chars(b::UInt8, itv::IntervalValue{UInt8})
    # The value parameter of `itv` is deliberately left open. Entries taken
    # straight from `CMap.range_map` carry the map's `Union` value type, and
    # because parametric types are invariant they would not match a
    # signature pinned to `IntervalValue{UInt8, CosObject}`.
    f = first(itv)
    v = value(itv)
    if v isa CosXString
        carr = get_unicode_chars(Vector{UInt8}(v))
        # A range with a string destination maps consecutive codes to
        # consecutive values by incrementing the END of the destination
        # string, so the offset belongs on the last character.
        carr[end] += (b - f)
    else
        @assert v isa CosArray
        xstr = get(v)[b - f + 1]
        @assert xstr isa CosXString
        carr = get_unicode_chars(Vector{UInt8}(xstr))
    end
    return carr
end
142+
143+
"""
    get_unicode_chars(barr::Vector{UInt8})

Decode `barr` as big-endian UTF-16 and return the resulting `Vector{Char}`.

Bytes are consumed two at a time as 16-bit code units. A high surrogate
(`0xD800`-`0xDBFF`) immediately followed by a low surrogate
(`0xDC00`-`0xDFFF`) is combined into one supplementary-plane character. A
high surrogate that is not followed by a low surrogate is emitted on its own
and the following unit is decoded normally.

An empty input yields an empty vector. An input with an odd number of bytes
raises a `BoundsError` when the incomplete final unit is read.
"""
function get_unicode_chars(barr::Vector{UInt8})
    l = length(barr)
    nb = 0                      # number of bytes consumed so far
    retarr = Vector{Char}()
    while nb < l
        # Read the unit at the current offset, widened to UInt32 so the
        # shift cannot overflow the 8-bit operand.
        unit = (UInt32(barr[nb + 1]) << 8) | barr[nb + 2]
        nb += 2
        c = unit
        if 0xD800 <= unit <= 0xDBFF && nb + 2 <= l
            low = (UInt32(barr[nb + 1]) << 8) | barr[nb + 2]
            if 0xDC00 <= low <= 0xDFFF
                # Surrogate pair: 10 payload bits from each unit, offset by
                # 0x10000. Only now is the second unit consumed.
                c = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00)
                nb += 2
            end
        end
        push!(retarr, Char(c))
    end
    return retarr
end
175+
108176
# Decode a string through the font's CMap (code space + range map)
109-
get_encoded_string(s::CosString, cmap::CMap) = CDTextString(s)
177+
"""
    get_encoded_string(s::CosString, cmap::CMap)

Translate the bytes of `s` into characters using `cmap` and return them as a
`Vector{Char}`.

Each lead byte is looked up in `cmap.code_space`:
- If the stored value is `CosNull`, the byte is a complete code and is
  resolved through `cmap.range_map` directly.
- Otherwise one more byte is consumed and resolved through the nested map
  that `cmap.range_map` stores for the lead byte.

Lead bytes not covered by `cmap.code_space` are skipped without output.
"""
function get_encoded_string(s::CosString, cmap::CMap)
    codespace = cmap.code_space
    ranges = cmap.range_map
    bytes = Vector{UInt8}(s)
    nbytes = length(bytes)
    decoded = Vector{Char}()
    pos = 0
    while pos < nbytes
        pos += 1
        lead = bytes[pos]
        hasintersection(codespace, lead) || continue
        subspace = value(collect(intersect(codespace, (lead, lead)))[1])
        # NOTE(review): the `[1]` lookups below assume the range map has an
        # entry for every code inside the code space, and that a two-byte
        # code is never cut off at the end of the string. Otherwise they
        # presumably raise instead of skipping - confirm this is intended.
        if subspace === CosNull
            hit = collect(intersect(ranges, (lead, lead)))[1]
            chars = get_unicode_chars(lead, hit)
        else
            pos += 1
            trail = bytes[pos]
            subranges = value(collect(intersect(ranges, (lead, lead)))[1])
            hit = collect(intersect(subranges, (trail, trail)))[1]
            chars = get_unicode_chars(trail, hit)
        end
        append!(decoded, chars)
    end
    return decoded
end
110204

111205
"""
    cmap_command(b::Vector{UInt8})

Return `b` as a `Symbol` when it equals one of the `beginbfchar`,
`beginbfrange` or `begincodespacerange` byte strings. Return `nothing` for
any other input.
"""
function cmap_command(b::Vector{UInt8})
    if b == beginbfchar || b == beginbfrange || b == begincodespacerange
        return Symbol(String(b))
    end
    return nothing
end
115209

116-
function on_cmap_command(stm::BufferedInputStream, command::Symbol,
210+
function on_cmap_command!(stm::BufferedInputStream, command::Symbol,
117211
params::Vector{CosInt}, cmap::CMap)
118212
n = get(pop!(params))
119-
println(n)
120213
o1, o2, o3 = CosNull, CosNull, CosNull
121214
for i = 1:n
122215
o1 = parse_value(stm)
@@ -128,14 +221,17 @@ function on_cmap_command(stm::BufferedInputStream, command::Symbol,
128221
if (command != :begincodespacerange)
129222
o3 = parse_value(stm)
130223
@assert isa(o3, CosXString) || isa(o3, CosArray)
131-
println(d1)
132224
l = length(d1)
133225
if l == 1
134226
cmap.range_map[(d1[1],d2[1])] = o3
135227
else
136-
imap = IntervalMap{UInt8, CosObject}()
228+
if hasintersection(cmap.range_map, d1[1])
229+
imap = value(collect(intersect(cmap.range_map, (d1[1], d2[1])))[1])
230+
else
231+
imap = IntervalMap{UInt8, CosObject}()
232+
cmap.range_map[(d1[1],d2[1])] = imap
233+
end
137234
imap[(d1[2], d2[2])] = o3
138-
cmap.range_map[(d1[1],d2[1])] = imap
139235
end
140236
else
141237
l = length(d1)
@@ -148,9 +244,10 @@ function on_cmap_command(stm::BufferedInputStream, command::Symbol,
148244
end
149245
end
150246
end
247+
return cmap
151248
end
152249

153-
on_cmap_command(stm::BufferedInputStream, command::CosObject,
250+
# Fallback method: a `command` that is a `CosObject` rather than a `Symbol`
# is not a CMap command, so nothing is read from `stm` and `cmap` is left
# untouched.
function on_cmap_command!(stm::BufferedInputStream, command::CosObject,
                          params::Vector{CosInt}, cmap::CMap)
    return nothing
end
155252

156253
function read_cmap(stm::BufferedInputStream)
@@ -162,62 +259,7 @@ function read_cmap(stm::BufferedInputStream)
162259
push!(params, obj)
163260
end
164261
(obj == :beginbfchar || obj == :beginbfrange || obj == :begincodespacerange) &&
165-
on_cmap_command(stm, obj, params, tcmap)
262+
on_cmap_command!(stm, obj, params, tcmap)
166263
end
167264
return tcmap
168265
end
169-
170-
#=
171-
function get_encoded_string(s::CosXString, cmap::CosObject)
172-
cmap_vec = read_cmap(cmap)
173-
hexbytes = get(s)
174-
data = hexbytes |> String |> hex2bytes
175-
176-
cmap_len = length(cmap_vec)
177-
178-
for i = 1:cmap_len
179-
nb = cmap_vec[i][1]
180-
end
181-
182-
for b in data
183-
#if b in
184-
end
185-
state = start(cmap_vec)
186-
nbytes = []
187-
while !done(cmap_vec, state)
188-
(r, state) = next(cmap_vec, state)
189-
isa(r[2], CosInt) && push!(nbytes, Int(r[2]))
190-
end
191-
for r in cmap_vec
192-
if isa(r[1], CosInt)
193-
end
194-
i = 1
195-
len = length(data)
196-
retval = UInt16[]
197-
while i < len
198-
c = parse(UInt16, String(data[i:i+3]), 16)
199-
for r in cmap_range
200-
range = r[1]
201-
if c in range
202-
incr = c - range[1]
203-
v = r[2]
204-
if isa(v, CosXString)
205-
data2 = get(v)
206-
c2 = parse(UInt16, String(data2), 16)
207-
c2 += incr
208-
push!(retval, c2)
209-
elseif isa(v, CosArray)
210-
data2 = get(v)[incr+1]
211-
j = 1
212-
while j < length(data2)
213-
c2 = parse(UInt16, String(data2[j:j+3]), 16)
214-
push!(retval, c2)
215-
j += 4
216-
end
217-
end
218-
end
219-
end
220-
i += 4
221-
end
222-
end
223-
=#

src/PDPage.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,10 @@ get_page_content_ref(page::PDPageImpl) = get(page.cospage, cn"Contents")
103103
function get_page_contents(page::PDPageImpl, contents::CosArray)
104104
len = length(contents)
105105
for i = 1:len
106-
ref = splice!(contents, 1)
107-
cosstm = get_page_contents(page.doc.cosDoc,ref)
106+
ref = splice!(get(contents), 1)
107+
cosstm = get_page_contents(page,ref)
108108
if (cosstm != CosNull)
109-
push!(contents,cosstm)
109+
push!(get(contents),cosstm)
110110
end
111111
end
112112
return contents

0 commit comments

Comments
 (0)