Skip to content

Commit 7e9fb95

Browse files
committed
Support for Symbol and ZapfDingbats
Also fixes bugs related to octal character parsing in CosLiteralString
1 parent 0607ffa commit 7e9fb95

File tree

7 files changed

+91
-26
lines changed

7 files changed

+91
-26
lines changed

431.txt

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

src/CosReader.jl

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -114,18 +114,14 @@ function parse_array(ps::BufferedInputStream)
114114
end
115115

116116
# Decode a backslash octal escape (`\d`, `\dd` or `\ddd`) of a literal string.
#
# `c` is the first octal digit, which the caller has already taken off the
# stream `ps`. Up to two further octal digits are consumed from `ps`; the first
# non-octal byte is left in place. The result is accumulated in a `UInt8`, so
# any bits above the low eight are dropped.
function read_octal_escape!(c, ps)
    value::UInt8 = getnumval(c)
    remaining = 2
    while remaining > 0
        next = peek(ps)
        # Stop at the first byte that is not an octal digit, without consuming it.
        ispdfodigit(next) || return value
        value = (value << 3) + getnumval(next)
        skip(ps, 1)
        remaining -= 1
    end
    return value
end
130126

131127

@@ -139,7 +135,7 @@ function parse_string(ps::BufferedInputStream)
139135
if c == BACKSLASH
140136
c = advance!(ps)
141137
if ispdfodigit(c) #Read octal digits
142-
append!(b, Vector{UInt8}(string(read_octal_escape!(c,ps))))
138+
append!(b, read_octal_escape!(c,ps))
143139
elseif is_crorlf(c) #ignore the solidus, EOLs and move on
144140
chomp_space!(ps)
145141
else

src/PDFonts.jl

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,53 @@ endcmap
5757
CMapName currentdict /CMap defineresource pop
5858
end
5959
end
60+
61+
Single byte ToUnicode CMap
62+
63+
/CIDInit /ProcSet findresource begin 12 dict begin begincmap /CIDSystemInfo <<
64+
/Registry (F15+0) /Ordering (T1UV) /Supplement 0 >> def
65+
/CMapName /F15+0 def
66+
/CMapType 2 def
67+
1 begincodespacerange <01> <c9> endcodespacerange
68+
18 beginbfchar
69+
<05> <260E>
70+
<0a> <261B>
71+
<0b> <261E>
72+
<20> <0020>
73+
<29> <2605>
74+
<4d> <25CF>
75+
<4e> <274D>
76+
<4f> <25A0>
77+
<54> <25B2>
78+
<55> <25BC>
79+
<56> <25C6>
80+
<57> <2756>
81+
<58> <25D7>
82+
<75> <2663>
83+
<76> <2666>
84+
<77> <2665>
85+
<78> <2660>
86+
<a2> <2192>
87+
endbfchar
88+
15 beginbfrange
89+
<01> <04> <2701>
90+
<06> <09> <2706>
91+
<0c> <1f> <270C>
92+
<21> <28> <2720>
93+
<2a> <4c> <2729>
94+
<50> <53> <274F>
95+
<59> <5f> <2758>
96+
<60> <6d> <F8D7>
97+
<6e> <74> <2761>
98+
<79> <82> <2460>
99+
<83> <a1> <2776>
100+
<a3> <a4> <2194>
101+
<a5> <a7> <2798>
102+
<a8> <bb> <279C>
103+
<bc> <c9> <27B1>
104+
endbfrange
105+
endcmap CMapName currentdict /CMap defineresource pop end end
106+
60107
=#
61108
const beginbfchar = b"beginbfchar"
62109
const endbfchar = b"endbfchar"
@@ -139,18 +186,6 @@ function read_cmap(cmap::CosObject)
139186
return tcmap
140187
end
141188

142-
get_encoded_string(s, font::CosNullType, page) = CDTextString(s)
143-
144-
get_encoded_string(s, font, page::CosNullType) = CDTextString(s)
145-
146-
# Simply applying ISO_8859-1. Not correct actually encoding tables to be consulted.
147-
# like: WinAnsiEncoding, MacRomanEncoding, MacExpertEncoding or PDFDocEncoding
148-
get_encoded_string(s::CosString, encoding::CosName) = CDTextString(s)
149-
150-
# Differences should be specifically mapped.
151-
152-
get_encoded_string(s::CosString, encoding::CosDict) = CDTextString(s)
153-
154189
#=
155190
function get_encoded_string(s::CosXString, cmap::CosObject)
156191
cmap_vec = read_cmap(cmap)

src/PDPage.jl

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ end
6565
pdPageExtractText(io::IO, page::PDPage) -> IO
6666
```
6767
Extracts the text from the `page`. This extraction works best for tagged PDF files only.
68-
For PDFs not tagged, some line and word breaks will not be extracted properly.
68+
For PDFs not tagged, some line and word breaks will not be extracted properly.
6969
"""
7070
function pdPageExtractText(io::IO, page::PDPage)
7171
# page.doc.isTagged != :tagged && throw(ErrorException(E_NOT_TAGGED_PDF))
@@ -157,9 +157,19 @@ end
157157
# for type 0 use cmap.
158158
# for symbol and zapfdingbats - use font encoding
159159
# for others use STD Encoding
160+
# Reading the encoding from the embedded font file is not supported for symbolic fonts.
161+
# Font subsets are handled by identifying the font from the name that follows the subset prefix ('+').
160162
# Fallback used when the font dictionary has no encoding (`encoding` is the
# PDF null object): merge a built-in byte-to-Unicode table into
# `pdfont.encoding` and return `pdfont`.
#
# Only fonts whose /Subtype is Type1 or MMType1 receive a table; any other
# subtype is returned unchanged. The table is chosen from /BaseFont:
#   "Symbol"       -> SYMEncoding_to_Unicode
#   "ZapfDingbats" -> ZAPEncoding_to_Unicode
#   anything else  -> STDEncoding_to_Unicode
function merge_encoding!(pdfont::PDFont, encoding::CosNullType,
                         page::PDPage, font::CosObject)
    subtype = cosDocGetObject(page.doc.cosDoc, font, cn"Subtype")
    (subtype != cn"Type1") && (subtype != cn"MMType1") && return pdfont
    basefont = cosDocGetObject(page.doc.cosDoc, font, cn"BaseFont")
    basefont_with_subset = CDTextString(basefont)
    # A subset font carries a tag before '+'; keep only the part after the
    # last '+' so that the comparison sees the plain font name.
    basefont_str = rsplit(basefont_with_subset, '+'; limit=2)[end]
    # The name must be spelt exactly "ZapfDingbats"; a misspelt literal here
    # makes such fonts fall through to the standard table.
    enc = (basefont_str == "Symbol") ? SYMEncoding_to_Unicode :
          (basefont_str == "ZapfDingbats") ? ZAPEncoding_to_Unicode :
          STDEncoding_to_Unicode
    merge!(pdfont.encoding, enc)
    return pdfont
end
165175

src/Utils.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ function NativeEncodingToUnicode(barr::Vector{UInt8}, mapping::Dict)
6262
l = length(barr)
6363
carr = Vector{Char}(l)
6464
for i = 1:l
65-
carr[i] = get(mapping, barr[i], Char(0x0))
65+
carr[i] = get(mapping, barr[i], zero(Char))
6666
end
6767
return carr
6868
end

src/bytes.jl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ export BACKSPACE, NULL,TAB, LINE_FEED,NEWLINE, FORM_FEED, RETURN,
99
PERCENT,PERIOD, NUMBER_SIGN, BANG, TILDE, LATIN_UPPER_D,STREAM,ENDSTREAM,
1010
LATIN_UPPER_E,LATIN_UPPER_F, LATIN_UPPER_O, LATIN_UPPER_P,
1111
LATIN_UPPER_R,XREF, TRAILER, STARTXREF, EOF, OBJ, ENDOBJ, ispdfspace,
12-
ispdfdelimiter,ispdfdigit, ispdfodigit, ispdfxdigit, gethexval, is_crorlf
12+
ispdfdelimiter, ispdfdigit, ispdfodigit, ispdfxdigit, gethexval, getnumval, is_crorlf
1313

1414

1515

@@ -146,4 +146,7 @@ gethexval(b::UInt8) = (DIGIT_ZERO <= b <= DIGIT_NINE) ? b - DIGIT_ZERO :
146146
(LATIN_A <= b <= LATIN_F) ? b - LATIN_A + 0xa :
147147
throw(ErrorException(E_BAD_NUMBER))
148148

149+
# Return the numeric value (0-9) of the ASCII decimal digit byte `b`.
# Throws `ErrorException(E_BAD_NUMBER)` when `b` lies outside
# `DIGIT_ZERO`..`DIGIT_NINE`.
function getnumval(b::UInt8)
    DIGIT_ZERO <= b <= DIGIT_NINE || throw(ErrorException(E_BAD_NUMBER))
    return b - DIGIT_ZERO
end
151+
149152
is_crorlf(b::UInt8) = ((b == RETURN) ||(b == LINE_FEED))

test/runtests.jl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,26 @@ include("debugIO.jl")
171171
end
172172
end
173173

174+
# Opens a sample document (downloaded on first use), walks every non-empty
# page and extracts its content objects and text. Checks the page count and
# that no files are left open afterwards.
@testset "Symbol Fonts test" begin
    filename = "431.pdf"
    DEBUG && println(filename)
    isfile(filename) ||
        download("http://www.stillhq.com/pdfdb/000431/data.pdf", filename)
    doc = pdDocOpen(filename)
    try
        npage = pdDocGetPageCount(doc)
        # The comparison has to be recorded as a test; evaluated on its own,
        # its result is simply discarded.
        @test npage == 54
        for i = 1:npage
            page = pdDocGetPage(doc, i)
            if pdPageIsEmpty(page) == false
                pdPageGetContentObjects(page)
                pdPageExtractText(IOBuffer(), page)
            end
        end
    finally
        # Close the document even when extraction throws, so a failure here
        # does not leave the file open.
        pdDocClose(doc)
    end
    @test length(utilPrintOpenFiles()) == 0
end
193+
174194
files=readdir(get_tempdir())
175195
@assert length(files) == 0
176196
end

0 commit comments

Comments
 (0)