Skip to content

Commit 7c4c103

Browse files
authored
Enhancing the font unicode mapping framework. (#71)
* Adding better fonts and encoding support.
1 parent 708f841 commit 7c4c103

File tree

7 files changed

+175
-91
lines changed

7 files changed

+175
-91
lines changed

src/CosObject.jl

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,19 @@ show(io::IO, o::CosName) = print(io, "/", String(o))
529529

530530
show(io::IO, o::CosXString) = print(io, "<", String(copy(o.val)), ">")
531531

532-
show(io::IO, o::CosLiteralString) = print(io, "(", String(copy(o.val)), ")")
532+
function show(io::IO, o::CosLiteralString)
533+
print(io, '(')
534+
for b in o.val
535+
c = Char(b)
536+
if isprint(c)
537+
print(io, c)
538+
else
539+
print(io, '\\')
540+
print(io, string(b, base=8, pad=3))
541+
end
542+
end
543+
print(io, ')')
544+
end
533545

534546
function show(io::IO, o::CosArray)
535547
print(io, '[')

src/CosReader.jl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,20 @@ function parse_pdfOpsOrConst(ps::IO, fparse_more::Function)
8585
push!(b, c)
8686
end
8787
ns = chomp_space!(ps)
88-
obj = get_pdfconstant(b)
89-
obj !== nothing && return obj
90-
nused, ret = fparse_more(b)
91-
if nused < length(b)
88+
# It's ok to skip all spaces till eof
89+
# but if eof is not reached read only one space
90+
# If no space is found then do not advance
91+
reset_marker = ns > 0 && !eof(ps)
92+
nused, ret = length(b)+1, get_pdfconstant(b)
93+
if ret === nothing
94+
nused, ret = fparse_more(b)
95+
if nused < length(b)
96+
reset_marker = true
97+
else
98+
nused += 1
99+
end
100+
end
101+
if reset_marker
92102
reset(ps)
93103
skip(ps, nused)
94104
end

src/PDFonts.jl

Lines changed: 82 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
import ..Cos: CosXString
22

3-
export pdFontIsBold,
3+
export
4+
pdFontIsBold,
45
pdFontIsItalic,
56
pdFontIsFixedW,
67
pdFontIsAllCap,
78
pdFontIsSmallCap
89

910
using Rectangle
1011

11-
1212
#=
1313
Sample CMaps are available now as 8.cmap and 16.cmap in the test/files directory
1414
for 8 and 16-bit toUnicode CMaps.
@@ -44,50 +44,81 @@ function show(io::IO, cmap::CMap)
4444
show(io, cmap.range_map)
4545
end
4646

47+
48+
const FontUnicodeMapping = Union{Dict{UInt8, Char}, CMap, Nothing}
49+
50+
#=
4751
mutable struct FontUnicodeMapping
4852
encoding::Dict{UInt8, Char}
4953
cmap::CMap
5054
hasCMap::Bool
5155
FontUnicodeMapping() = new(Dict{UInt8, Char}(), CMap(), false)
5256
end
57+
=#
5358

54-
function merge_encoding!(fum::FontUnicodeMapping, encoding::CosName,
59+
function merge_encoding!(fum::Dict{UInt8, Char}, encoding::CosName,
5560
doc::CosDoc, font::IDDRef{CosDict})
5661
encoding_mapping =
5762
encoding == cn"WinAnsiEncoding" ? WINEncoding_to_Unicode :
5863
encoding == cn"MacRomanEncoding" ? MACEncoding_to_Unicode :
5964
encoding == cn"MacExpertEncoding" ? MEXEncoding_to_Unicode :
6065
STDEncoding_to_Unicode
61-
merge!(fum.encoding, encoding_mapping)
66+
merge!(fum, encoding_mapping)
6267
return fum
6368
end
6469

65-
# for type 0 use cmap.
66-
# for symbol and zapfdingbats - use font encoding
67-
# for others use STD Encoding
68-
# Reading encoding from the font files in case of Symbolic fonts are not
69-
# supported.
70-
# Font subset is addressed with font name identification.
71-
function merge_encoding!(fum::FontUnicodeMapping, encoding::CosNullType,
72-
doc::CosDoc, font::IDDRef{CosDict})
73-
subtype = cosDocGetObject(doc, font, cn"Subtype")
74-
subtype !== cn"Type1" && subtype !== cn"MMType1" && return fum
70+
abstract type FontType end
71+
struct FontType1 <: FontType end
72+
struct FontType3 <: FontType end
73+
struct FontMMType1 <: FontType end
74+
struct FontTrueType <: FontType end
75+
struct FontDefType <: FontType end
76+
77+
function FontType(subtype::CosName)
78+
subtype === cn"Type1" && return FontType1()
79+
subtype === cn"Type3" && return FontType3()
80+
subtype === cn"TrueType" && return FontTrueType()
81+
subtype === cn"MMType1" && return FontMMType1()
82+
return FontDefType()
83+
end
84+
85+
merge_encoding!(fum::FontUnicodeMapping, ftype::FontType,
86+
doc::CosDoc, font::IDDRef{CosDict}) = fum
87+
88+
function merge_encoding!(fum::Dict{UInt8, Char},
89+
ftype::Union{FontType1, FontMMType1},
90+
doc::CosDoc, font::IDDRef{CosDict})
7591
basefont = cosDocGetObject(doc, font, cn"BaseFont")
7692
basefont_with_subset = CDTextString(basefont)
7793
basefont_str = rsplit(basefont_with_subset, '+';limit=2)[end]
7894
enc = basefont_str == "Symbol" ? SYMEncoding_to_Unicode :
7995
basefont_str == "ZapfDingbats" ? ZAPEncoding_to_Unicode :
8096
STDEncoding_to_Unicode
81-
merge!(fum.encoding, enc)
97+
merge!(fum, enc)
8298
return fum
99+
end
100+
101+
# for type 0 use cmap.
102+
# for symbol and zapfdingbats - use font encoding
103+
# for others use STD Encoding
104+
# Reading encoding from the font files in case of Symbolic fonts are not
105+
# supported.
106+
# Font subset is addressed with font name identification.
107+
function merge_encoding!(fum::Dict{UInt8, Char}, encoding::CosNullType,
108+
doc::CosDoc, font::IDDRef{CosDict})
109+
subtype = cosDocGetObject(doc, font, cn"Subtype")
110+
subtype === CosNull && return fum
111+
return merge_encoding!(fum, FontType(subtype), doc, font)
83112
end
84113

85-
function merge_encoding!(fum::FontUnicodeMapping,
114+
function merge_encoding!(fum::Dict{UInt8, Char},
86115
encoding::IDD{CosDict},
87116
doc::CosDoc, font::IDDRef{CosDict})
88117
baseenc = cosDocGetObject(doc, encoding, cn"BaseEncoding")
89118
merge_encoding!(fum, baseenc, doc, font)
90119
# Add the Differences
120+
subtype = cosDocGetObject(doc, font, cn"Subtype")
121+
subtype === cn"Type3" && return fum
91122
diff = cosDocGetObject(doc, encoding, cn"Differences")
92123
diff === CosNull && return fum
93124
values = get(diff)
@@ -101,31 +132,28 @@ function merge_encoding!(fum::FontUnicodeMapping,
101132
cid += 1
102133
end
103134
end
135+
104136
dict_to_unicode = dict_remap(d, AGL_Glyph_to_Unicode)
105-
merge!(fum.encoding, dict_to_unicode)
137+
merge!(fum, dict_to_unicode)
106138
return fum
107139
end
108140

109-
function merge_encoding!(fum::FontUnicodeMapping, doc::CosDoc,
110-
font::IDDRef{CosDict})
111-
encoding = cosDocGetObject(doc, font, cn"Encoding")
112-
merge_encoding!(fum, encoding, doc, font)
141+
function get_unicode_mapping(doc::CosDoc, font::IDDRef{CosDict})
113142
toUnicode = cosDocGetObject(doc, font, cn"ToUnicode")
114-
toUnicode == CosNull && return fum
115-
merge_encoding!(fum, toUnicode, doc, font)
143+
toUnicode !== CosNull &&
144+
return get_unicode_mapping(toUnicode)
145+
encoding = cosDocGetObject(doc, font, cn"Encoding")
146+
d = merge_encoding!(Dict{UInt8, Char}(), encoding, doc, font)
147+
return length(d) == 0 ? nothing : d
116148
end
117149

118-
function merge_encoding!(fum::FontUnicodeMapping,
119-
cmap::CosIndirectObject{CosStream},
120-
doc::CosDoc, font::IDDRef{CosDict})
121-
stm_cmap = get(cmap)
150+
function get_unicode_mapping(cmap_stm::CosIndirectObject{CosStream})
151+
io = get(cmap_stm)
122152
try
123-
fum.cmap = read_cmap(stm_cmap)
124-
fum.hasCMap = true
153+
return read_cmap(io)
125154
finally
126-
util_close(stm_cmap)
155+
util_close(io)
127156
end
128-
return fum
129157
end
130158

131159
function update_glyph_id_std_14(cosdoc, cosfont,
@@ -185,22 +213,19 @@ function get_glyph_id_mapping(cosdoc::CosDoc, cosfont::IDD{CosDict})
185213
else
186214
glyph_name_to_cid[v] = cid
187215
cid_to_glyph_name[cid] = v
188-
cid += 1
216+
cid += 0x1
189217
end
190218
end
191219
return glyph_name_to_cid, cid_to_glyph_name
192220
end
193221

194-
get_encoded_string(s::CosString, fum::Nothing) = CDTextString(s)
195-
196-
get_encoded_string(s::CosString, fum::FontUnicodeMapping) =
222+
get_encoded_string(s::CosString, fum::Union{Dict{UInt8, Char}, CMap}) =
197223
get_encoded_string(Vector{UInt8}(s), fum)
198224

199-
@inline function get_encoded_string(v::Union{Vector{UInt8}, NTuple{N, UInt8}},
200-
fum::FontUnicodeMapping) where N
225+
function get_encoded_string(v::Union{Vector{UInt8}, NTuple{N, UInt8}},
226+
fum::Dict{UInt8, Char}) where N
201227
length(v) == 0 && return ""
202-
fum.hasCMap && return get_encoded_string(v, fum.cmap)
203-
return String(NativeEncodingToUnicode(v, fum.encoding))
228+
return String(NativeEncodingToUnicode(v, fum))
204229
end
205230

206231
function get_unicode_chars(b::UInt8, i::Interval, v::Union{CosXString, CosArray})
@@ -225,28 +250,26 @@ function get_unicode_chars(barr::Vector{UInt8})
225250
nb = 0
226251
retarr = Vector{Char}()
227252
while nb < l
228-
b1 = barr[1]
229-
b2 = barr[2]
253+
b1, b2 = barr[1], barr[2]
230254
nb += 2
231-
c::UInt32 = 0
255+
c = 0
232256
if 0xD8 <= b1 <= 0xDB
233257
# UTF-16 Supplementary plane = 4 bytes
234258
b1 -= 0xD8
235259
c = b1
236-
c = (c << 8) + b2
260+
c = c*256 + b2
237261
b3 = barr[3]
238262
b4 = barr[4]
239263
nb += 2
240264
if 0xDC <= b3 <= 0xDF
241265
b3 -= 0xDC
242266
c1 = b3
243-
c1 = (c1 << 8) + b4
244-
c = (c << 10) + c1
267+
c1 = c1*256 + b4
268+
c = c*1024 + c1
245269
c += 0x10000
246270
end
247271
else
248-
c = b1
249-
c = (c << 8) + b2
272+
c = b1*256 + b2
250273
end
251274
push!(retarr, Char(c))
252275
end
@@ -385,16 +408,17 @@ mutable struct PDFont
385408
cid_to_glyph_name::Dict{UInt8, CosName}
386409
flags::UInt32
387410
fontname::CosName
411+
props::Dict
388412
@inline function PDFont(doc::PDDoc, cosfont::IDD{CosDict})
389-
fum = FontUnicodeMapping()
390-
merge_encoding!(fum, doc.cosDoc, cosfont)
413+
fum = get_unicode_mapping(doc.cosDoc, cosfont)
391414
widths = get_font_widths(doc.cosDoc, cosfont)
392415
glyph_name_to_cid, cid_to_glyph_name =
393416
get_glyph_id_mapping(doc.cosDoc, cosfont)
394417
flags = get_font_flags(doc, cosfont, widths)
395418
fontname = get_font_name(doc, cosfont, widths)
419+
props = Dict()
396420
return new(doc, cosfont, widths, fum, glyph_name_to_cid,
397-
cid_to_glyph_name, flags, fontname)
421+
cid_to_glyph_name, flags, fontname, props)
398422
end
399423
end
400424

@@ -403,6 +427,12 @@ SPACE_CODE(w::CIDWidth) = get_character_code(cn"space", w)
403427
INIT_CODE(x) = 0x00
404428
SPACE_CODE(x) = get_character_code(cn"space", x)
405429

430+
function FontType(font::PDFont)
431+
subtype = cosDocGetObject(font.doc.cosDoc, font.obj, cn"Subtype")
432+
subtype === CosNull && return FontDefType()
433+
return FontType(subtype)
434+
end
435+
406436
"""
407437
```
408438
pdFontIsBold(pdfont::PDFont) ->Bool
@@ -495,6 +525,8 @@ get_character_code(name::CosName, w) =
495525

496526
get_encoded_string(s, pdfont::PDFont) = get_encoded_string(s, pdfont.fum)
497527

528+
get_encoded_string(s, pdfont::Nothing) = CDTextString(s)
529+
498530
get_char(barr, w) = iterate(barr)
499531
function get_char(barr, w::CIDWidth)
500532
next = iterate(barr)

src/PDPage.jl

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,11 @@ export PDPage,
77
pdPageGetMediaBox,
88
pdPageGetCropBox,
99
pdPageExtractText,
10-
pdPageGetPageNumber
10+
pdPageGetPageNumber,
11+
pdPageEvalContent
1112

1213
using ..Cos
1314

14-
abstract type PDPage end
15-
1615
"""
1716
```
1817
pdPageGetCosObject(page::PDPage) -> CosObject
@@ -270,14 +269,6 @@ end
270269
return load_page_objects(page, stm)
271270
end
272271

273-
function populate_font_encoding(page, font, fontname)
274-
if get(page.fums, fontname, CosNull) === CosNull
275-
fum = FontUnicodeMapping()
276-
merge_encoding!(fum, page.doc.cosDoc, font)
277-
page.fums[fontname] = fum
278-
end
279-
end
280-
281272
function find_resource(page::PDPageImpl,
282273
restype::CosName,
283274
fontname::Union{CosName, CosNullType})
@@ -344,3 +335,4 @@ get_encoded_string(s::CosString, fontname::CosNullType, page::PDPage) =
344335

345336
get_encoded_string(s::CosString, fontname::CosName, page::PDPage) =
346337
get_encoded_string(s, get(page.fonts, fontname, nothing))
338+

0 commit comments

Comments
 (0)