Skip to content

Commit a4fd6d6

Browse files
committed
Refactoring code and moving all encoding methods
into PDFonts.jl
1 parent 2b9a0df commit a4fd6d6

File tree

5 files changed

+192
-178
lines changed

5 files changed

+192
-178
lines changed

431.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/PDFonts.jl

Lines changed: 88 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -2,108 +2,11 @@ import ..Cos: CosXString
22

33
using IntervalTrees
44

5-
# This is a very crude method to read a CMap. A standards-compliant CMap reader can be very
6-
# involved. This is a quick and dirty way to extract the encoding information.
7-
#= Sample ToUnicode C-Map
8-
9-
/CIDInit /ProcSet findresource begin
10-
18 dict begin
11-
begincmap
12-
/CIDSystemInfo
13-
<< /Registry (Adobe)
14-
/Ordering (UCS)
15-
/Supplement 0
16-
>> def
17-
/CMapName /Adobe-Identity-UCS def
18-
/CMapType 2 def
19-
1 begincodespacerange
20-
<0000> <FFFF>
21-
endcodespacerange
22-
1 beginbfchar
23-
<0003> <0020>
24-
endbfchar
25-
1 beginbfrange
26-
<000B> <000C> <0028>
27-
endbfrange
28-
2 beginbfchar
29-
<000F> <002C>
30-
<0011> <002E>
31-
endbfchar
32-
3 beginbfrange
33-
<0013> <001C> <0030>
34-
<0024> <0027> <0041>
35-
<0029> <002A> <0046>
36-
endbfrange
37-
1 beginbfchar
38-
<002C> <0049>
39-
endbfchar
40-
2 beginbfrange
41-
<0031> <0033> <004E>
42-
<0035> <0037> <0052>
43-
endbfrange
44-
1 beginbfchar
45-
<0039> <0056>
46-
endbfchar
47-
4 beginbfrange
48-
<0044> <0053> <0061>
49-
<0055> <005C> <0072>
50-
<00B2> <00B2> [<2014>]
51-
<00B3> <00B4> <201C>
52-
endbfrange
53-
1 beginbfchar
54-
<00B6> <2019>
55-
endbfchar
56-
endcmap
57-
CMapName currentdict /CMap defineresource pop
58-
end
59-
end
60-
61-
Single byte ToUnicode CMap
62-
63-
/CIDInit /ProcSet findresource begin 12 dict begin begincmap /CIDSystemInfo <<
64-
/Registry (F15+0) /Ordering (T1UV) /Supplement 0 >> def
65-
/CMapName /F15+0 def
66-
/CMapType 2 def
67-
1 begincodespacerange <01> <c9> endcodespacerange
68-
18 beginbfchar
69-
<05> <260E>
70-
<0a> <261B>
71-
<0b> <261E>
72-
<20> <0020>
73-
<29> <2605>
74-
<4d> <25CF>
75-
<4e> <274D>
76-
<4f> <25A0>
77-
<54> <25B2>
78-
<55> <25BC>
79-
<56> <25C6>
80-
<57> <2756>
81-
<58> <25D7>
82-
<75> <2663>
83-
<76> <2666>
84-
<77> <2665>
85-
<78> <2660>
86-
<a2> <2192>
87-
endbfchar
88-
15 beginbfrange
89-
<01> <04> <2701>
90-
<06> <09> <2706>
91-
<0c> <1f> <270C>
92-
<21> <28> <2720>
93-
<2a> <4c> <2729>
94-
<50> <53> <274F>
95-
<59> <5f> <2758>
96-
<60> <6d> <F8D7>
97-
<6e> <74> <2761>
98-
<79> <82> <2460>
99-
<83> <a1> <2776>
100-
<a3> <a4> <2194>
101-
<a5> <a7> <2798>
102-
<a8> <bb> <279C>
103-
<bc> <c9> <27B1>
104-
endbfrange
105-
endcmap CMapName currentdict /CMap defineresource pop end end
5+
#=
6+
Sample CMaps are now available as 8.cmap and 16.cmap in the test/files directory for 8- and
7+
16-bit toUnicode CMaps.
1068
9+
A single CMap file can also contain both 8-bit and 16-bit ranges.
10710
=#
10811
const beginbfchar = b"beginbfchar"
10912
const endbfchar = b"endbfchar"
@@ -119,12 +22,92 @@ mutable struct CMap
11922
IntervalMap{UInt8, Union{CosObject, IntervalMap{UInt8, CosObject}}}())
12023
end
12124

122-
mutable struct PDFont
25+
mutable struct FontUnicodeMapping
12326
encoding::Dict
124-
toUnicode::CMap
125-
PDFont() = new(Dict(), CMap())
27+
cmap::CMap
28+
hasCMap::Bool
29+
FontUnicodeMapping() = new(Dict(), CMap(), false)
30+
end
31+
32+
function merge_encoding!(fum::FontUnicodeMapping, encoding::CosName,
33+
doc::CosDoc, font::CosObject)
34+
encoding_mapping = encoding == cn"WinAnsiEncoding" ? WINEncoding_to_Unicode :
35+
encoding == cn"MacRomanEncoding" ? MACEncoding_to_Unicode :
36+
encoding == cn"MacExpertEncoding" ? MEXEncoding_to_Unicode :
37+
STDEncoding_to_Unicode
38+
merge!(fum.encoding, encoding_mapping)
39+
return fum
12640
end
12741

42+
# for type 0 use cmap.
43+
# for symbol and zapfdingbats - use font encoding
44+
# for others use STD Encoding
45+
# Reading the encoding from the font files is not supported for symbolic fonts.
46+
# Font subsets are handled by stripping the subset prefix (the part before '+') from the font name.
47+
function merge_encoding!(fum::FontUnicodeMapping, encoding::CosNullType,
48+
doc::CosDoc, font::CosObject)
49+
subtype = cosDocGetObject(doc, font, cn"Subtype")
50+
(subtype != cn"Type1") && (subtype != cn"MMType1") && return fum
51+
basefont = cosDocGetObject(doc, font, cn"BaseFont")
52+
basefont_with_subset = CDTextString(basefont)
53+
basefont_str = rsplit(basefont_with_subset, '+';limit=2)[end]
54+
enc = (basefont_str == "Symbol") ? SYMEncoding_to_Unicode :
55+
(basefont_str == "ZapfDigbats") ? ZAPEncoding_to_Unicode :
56+
STDEncoding_to_Unicode
57+
merge!(fum.encoding, enc)
58+
return fum
59+
end
60+
61+
function merge_encoding!(fum::FontUnicodeMapping,
62+
encoding::Union{CosDict, CosIndirectObject{CosDict}},
63+
doc::CosDoc, font::CosObject)
64+
baseenc = cosDocGetObject(doc, encoding, cn"BaseEncoding")
65+
baseenc !== CosNull && merge_encoding!(fum, baseenc, doc, font)
66+
# Add the Differences
67+
diff = cosDocGetObject(doc, encoding, cn"Differences")
68+
diff === CosNull && return fum
69+
values = get(diff)
70+
d = Dict()
71+
cid = 0
72+
for v in values
73+
if v isa CosInt
74+
cid = get(v)
75+
else
76+
@assert cid != 0
77+
d[cid] = v
78+
cid += 1
79+
end
80+
end
81+
dict_to_unicode = dict_remap(d, AGL_Glyph_to_Unicode)
82+
merge!(fum.encoding, dict_to_unicode)
83+
return fum
84+
end
85+
86+
function merge_encoding!(fum::FontUnicodeMapping, doc::CosDoc, font::CosObject)
87+
encoding = cosDocGetObject(doc, font, cn"Encoding")
88+
merge_encoding!(fum, encoding, doc, font)
89+
# toUnicode = cosDocGetObject(doc, font, cn"ToUnicode")
90+
# toUnicode == CosNull && return fum
91+
# merge_encoding!(fum, toUnicode, doc, font)
92+
end
93+
94+
function merge_encoding!(fum::FontUnicodeMapping, cmap::CosIndirectObject{CosStream},
95+
doc::CosDoc, font::CosObject)
96+
fum.toUnicode = read_cmap(get(cmap))
97+
return fum
98+
end
99+
100+
get_encoded_string(s::CosString, fum::Void) = CDTextString(s)
101+
102+
function get_encoded_string(s::CosString, fum::FontUnicodeMapping)
103+
fum.hasCMap && return get_encoded_string(s, fum.cmap)
104+
carr = NativeEncodingToUnicode(Vector{UInt8}(s), fum.encoding)
105+
return String(carr)
106+
end
107+
108+
# Placeholder only
109+
get_encoded_string(s::CosString, cmap::CMap) = CDTextString(s)
110+
128111
function cmap_command(b::Vector{UInt8})
129112
b != beginbfchar && b != beginbfrange && b != begincodespacerange && return nothing
130113
return Symbol(String(b))
@@ -170,11 +153,9 @@ end
170153
on_cmap_command(stm::BufferedInputStream, command::CosObject,
171154
params::Vector{CosInt}, cmap::CMap) = nothing
172155

173-
function read_cmap(cmap::CosObject)
174-
cmap === CosNull && return CosNull
156+
function read_cmap(stm::BufferedInputStream)
175157
tcmap = CMap()
176158
params = Vector{CosInt}()
177-
stm = get(cmap)
178159
while !eof(stm)
179160
obj = parse_value(stm, cmap_command)
180161
if isa(obj, CosInt)

src/PDPage.jl

Lines changed: 10 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ mutable struct PDPageImpl <: PDPage
8080
cospage::CosObject
8181
contents::CosObject
8282
content_objects::Nullable{PDPageObjectGroup}
83-
fonts::Dict
83+
fums::Dict{CosName, FontUnicodeMapping}
8484
PDPageImpl(doc,cospage,contents)=
8585
new(doc, cospage, contents, Nullable{PDPageObjectGroup}(), Dict())
8686
end
@@ -145,68 +145,12 @@ function load_page_objects(page::PDPageImpl, stm::CosArray)
145145
end
146146
end
147147

148-
function merge_encoding!(pdfont::PDFont, encoding::CosName, page::PDPage, font::CosObject)
149-
encoding_mapping = encoding == cn"WinAnsiEncoding" ? WINEncoding_to_Unicode :
150-
encoding == cn"MacRomanEncoding" ? MACEncoding_to_Unicode :
151-
encoding == cn"MacExpertEncoding" ? MEXEncoding_to_Unicode :
152-
STDEncoding_to_Unicode
153-
merge!(pdfont.encoding, encoding_mapping)
154-
return pdfont
155-
end
156-
157-
# for type 0 use cmap.
158-
# for symbol and zapfdingbats - use font encoding
159-
# for others use STD Encoding
160-
# Reading encoding from the font files in case of Symbolic fonts are not supported.
161-
# Font subset is addressed with font name identification.
162-
function merge_encoding!(pdfont::PDFont, encoding::CosNullType,
163-
page::PDPage, font::CosObject)
164-
subtype = cosDocGetObject(page.doc.cosDoc, font, cn"Subtype")
165-
(subtype != cn"Type1") && (subtype != cn"MMType1") && return pdfont
166-
basefont = cosDocGetObject(page.doc.cosDoc, font, cn"BaseFont")
167-
basefont_with_subset = CDTextString(basefont)
168-
basefont_str = rsplit(basefont_with_subset, '+';limit=2)[end]
169-
enc = (basefont_str == "Symbol") ? SYMEncoding_to_Unicode :
170-
(basefont_str == "ZapfDigbats") ? ZAPEncoding_to_Unicode :
171-
STDEncoding_to_Unicode
172-
merge!(pdfont.encoding, enc)
173-
return pdfont
174-
end
175-
176-
function merge_encoding!(pdfont::PDFont,
177-
encoding::Union{CosDict, CosIndirectObject{CosDict}},
178-
page::PDPage, font::CosObject)
179-
baseenc = cosDocGetObject(page.doc.cosDoc, get(encoding, cn"BaseEncoding"))
180-
baseenc !== CosNull && merge_encoding!(pdfont, baseenc, page, font)
181-
# Add the Differences
182-
diff = cosDocGetObject(page.doc.cosDoc, get(encoding, cn"Differences"))
183-
diff === CosNull && return pdfont
184-
values = get(diff)
185-
d = Dict()
186-
cid = 0
187-
for v in values
188-
if v isa CosInt
189-
cid = get(v)
190-
else
191-
@assert cid != 0
192-
d[cid] = v
193-
cid += 1
194-
end
195-
end
196-
dict_to_unicode = dict_remap(d, AGL_Glyph_to_Unicode)
197-
merge!(pdfont.encoding, dict_to_unicode)
198-
return pdfont
199-
end
200148

201149
function populate_font_encoding(page, font, fontname)
202-
if get(page.fonts, fontname, CosNull) == CosNull
203-
pdfont = PDFont()
204-
encoding = cosDocGetObject(page.doc.cosDoc, get(font, cn"Encoding"))
205-
#diff = cosDocGetObject(page.doc.cosDoc, get(font, cn"Differences"))
206-
toUnicode = cosDocGetObject(page.doc.cosDoc, get(font, cn"ToUnicode"))
207-
#pdfont.toUnicode = read_cmap(toUnicode)
208-
merge_encoding!(pdfont, encoding, page, font)
209-
page.fonts[fontname] = pdfont
150+
if get(page.fums, fontname, CosNull) == CosNull
151+
fum = FontUnicodeMapping()
152+
merge_encoding!(fum, page.doc.cosDoc, font)
153+
page.fums[fontname] = fum
210154
end
211155
end
212156

@@ -219,21 +163,17 @@ function page_find_font(page::PDPageImpl, fontname::CosName)
219163
resref = get(pgnode, cn"Resources")
220164
resources = cosDocGetObject(cosdoc, resref)
221165
if resources !== CosNull
222-
fonts = cosDocGetObject(cosdoc, get(resources, cn"Font"))
166+
fonts = cosDocGetObject(cosdoc, resources, cn"Font")
223167
if fonts !== CosNull
224-
font = cosDocGetObject(cosdoc, get(fonts, fontname))
168+
font = cosDocGetObject(cosdoc, fonts, fontname)
225169
font !== CosNull && break
226170
end
227171
end
228-
pgnode = cosDocGetObject(cosdoc, get(pgnode, cn"Parent"))
172+
pgnode = cosDocGetObject(cosdoc, pgnode, cn"Parent")
229173
end
230174
populate_font_encoding(page, font, fontname)
231175
return font
232176
end
233177

234-
function get_encoded_string(s::CosString, fontname::CosName, page::PDPage)
235-
pdfont = get(page.fonts, fontname, nothing)
236-
pdfont == nothing && return CDTextString(s)
237-
carr = NativeEncodingToUnicode(Vector{UInt8}(s), pdfont.encoding)
238-
return String(carr)
239-
end
178+
get_encoded_string(s::CosString, fontname::CosName, page::PDPage) =
179+
get_encoded_string(s, get(page.fums, fontname, nothing))

test/files/16.cmap

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/CIDInit /ProcSet findresource begin
2+
18 dict begin
3+
begincmap
4+
/CIDSystemInfo
5+
<< /Registry (Adobe)
6+
/Ordering (UCS)
7+
/Supplement 0
8+
>> def
9+
/CMapName /Adobe-Identity-UCS def
10+
/CMapType 2 def
11+
1 begincodespacerange
12+
<0000> <FFFF>
13+
endcodespacerange
14+
1 beginbfchar
15+
<0003> <0020>
16+
endbfchar
17+
1 beginbfrange
18+
<000B> <000C> <0028>
19+
endbfrange
20+
2 beginbfchar
21+
<000F> <002C>
22+
<0011> <002E>
23+
endbfchar
24+
3 beginbfrange
25+
<0013> <001C> <0030>
26+
<0024> <0027> <0041>
27+
<0029> <002A> <0046>
28+
endbfrange
29+
1 beginbfchar
30+
<002C> <0049>
31+
endbfchar
32+
2 beginbfrange
33+
<0031> <0033> <004E>
34+
<0035> <0037> <0052>
35+
endbfrange
36+
1 beginbfchar
37+
<0039> <0056>
38+
endbfchar
39+
4 beginbfrange
40+
<0044> <0053> <0061>
41+
<0055> <005C> <0072>
42+
<00B2> <00B2> [<2014>]
43+
<00B3> <00B4> <201C>
44+
endbfrange
45+
1 beginbfchar
46+
<00B6> <2019>
47+
endbfchar
48+
endcmap
49+
CMapName currentdict /CMap defineresource pop
50+
end
51+
end

0 commit comments

Comments
 (0)