Skip to content

Commit 0607ffa

Browse files
committed
A few code cleanups and font encoding differences.
1 parent e34ab90 commit 0607ffa

File tree

7 files changed

+59
-20
lines changed

7 files changed

+59
-20
lines changed

REQUIRE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ TimeZones
66
Documenter
77
LabelNumerals
88
IntervalTrees
9+
agl_aglfn

docs/src/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ pdPageGetContents
6666
pdPageIsEmpty
6767
pdPageGetCosObject
6868
pdPageGetContentObjects
69+
pdPageExtractText
6970
```
7071
## PDF Page objects
7172
```@docs

src/CosDoc.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,20 @@ to go through checking the type of the objects before accessing the contents.
126126
"""
127127
cosDocGetObject(doc::CosDoc, obj::CosObject) = CosNull
128128

129+
"""
130+
```
131+
cosDocGetObject(doc::CosDoc, dict::CosObject, key::CosName) -> CosObject
132+
```
133+
Returns the object referenced by `key` inside the `dict` dictionary. `dict` can be a PDF dictionary
134+
object reference or an indirect object or a direct `CosDict` object.
135+
"""
136+
function cosDocGetObject(doc::CosDoc, dict::CosObject, key::CosName)
    # A reference is resolved through the two-argument method first, so that the
    # key lookup below is performed on the referenced object rather than on the
    # reference itself.
    target = dict isa CosIndirectObjectRef ? cosDocGetObject(doc, dict) : dict

    # Nothing to look up when the dictionary is (or resolved to) `CosNull`.
    if target === CosNull
        return CosNull
    end

    # The entry stored under `key` is itself passed through `cosDocGetObject`
    # before being handed back to the caller.
    entry = get(target, key)
    return cosDocGetObject(doc, entry)
end
129143

130144
function cosDocGetRoot(doc::CosDocImpl)
131145
root = doc.hasNativeXRefStm ? get(doc.xrefstm[1], CosName("Root")) :

src/CosObject.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import Base:get, length, show
33
export CosDict, CosString, CosNumeric, CosBoolean, CosTrue, CosFalse,
44
CosObject, CosNull, CosNullType,CosFloat, CosInt, CosArray, CosName,
55
CosDict, CosIndirectObjectRef, CosStream, get, set!, @cn_str,
6-
createTreeNode, CosTreeNode
6+
createTreeNode, CosTreeNode, CosIndirectObject
77

88
"""
99
```

src/PDPage.jl

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,11 @@ end
6464
```
6565
pdPageExtractText(io::IO, page::PDPage) -> IO
6666
```
67-
Extracts the text from the `page`. This extraction works only for tagged PDF files only.
67+
Extracts the text from the `page`. This extraction works best for tagged PDF files.
68+
For PDFs not tagged, some line and word breaks will not be extracted properly.
6869
"""
6970
function pdPageExtractText(io::IO, page::PDPage)
70-
page.doc.isTagged != :tagged && throw(ErrorException(E_NOT_TAGGED_PDF))
71+
# page.doc.isTagged != :tagged && throw(ErrorException(E_NOT_TAGGED_PDF))
7172
state = Dict()
7273
state[:page] = page
7374
showtext(io, pdPageGetContentObjects(page), state)
@@ -144,16 +145,47 @@ function load_page_objects(page::PDPageImpl, stm::CosArray)
144145
end
145146
end
146147

147-
function merge_encoding(pdfont::PDFont, encoding::CosName, page::PDPage, font::CosObject)
148-
pdfont.encoding = encoding == cn"WinAnsiEncoding" ? WINEncoding_to_Unicode :
149-
encoding == cn"MacRomanEncoding" ? MACEncoding_to_Unicode :
150-
encoding == cn"MacExpertEncoding" ? MEXEncoding_to_Unicode :
151-
STDEncoding_to_Unicode
148+
function merge_encoding!(pdfont::PDFont, encoding::CosName, page::PDPage, font::CosObject)
149+
encoding_mapping = encoding == cn"WinAnsiEncoding" ? WINEncoding_to_Unicode :
150+
encoding == cn"MacRomanEncoding" ? MACEncoding_to_Unicode :
151+
encoding == cn"MacExpertEncoding" ? MEXEncoding_to_Unicode :
152+
STDEncoding_to_Unicode
153+
merge!(pdfont.encoding, encoding_mapping)
154+
return pdfont
152155
end
153156

154-
function merge_encoding(pdfont::PDFont, encoding::CosNullType,
157+
# for type 0 use cmap.
158+
# for symbol and zapfdingbats - use font encoding
159+
# for others use STD Encoding
160+
function merge_encoding!(pdfont::PDFont, encoding::CosNullType,
155161
page::PDPage, font::CosObject)
156-
pdfont.encoding = STDEncoding_to_Unicode
162+
merge!(pdfont.encoding, STDEncoding_to_Unicode)
163+
return pdfont
164+
end
165+
166+
# Merge an encoding dictionary into `pdfont.encoding` and return `pdfont`.
#
# Steps:
#   1. If the dictionary has a `BaseEncoding` entry, it is merged first by
#      dispatching to the matching `merge_encoding!` method.
#   2. If the dictionary has a `Differences` entry, it is walked to build a
#      code => glyph-name table. An integer element sets the current code; every
#      following non-integer element is assigned to the current code, which is
#      then incremented.
#   3. The table is remapped through `AGL_Glyph_to_Unicode` with `dict_remap`
#      and merged on top of whatever `pdfont.encoding` already holds.
#
# Throws an `ErrorException` when a glyph name appears in `Differences` before
# any character code has been given.
function merge_encoding!(pdfont::PDFont,
                         encoding::Union{CosDict, CosIndirectObject{CosDict}},
                         page::PDPage, font::CosObject)
    cosdoc = page.doc.cosDoc

    baseenc = cosDocGetObject(cosdoc, get(encoding, cn"BaseEncoding"))
    baseenc !== CosNull && merge_encoding!(pdfont, baseenc, page, font)

    # Add the Differences
    differences = cosDocGetObject(cosdoc, get(encoding, cn"Differences"))
    differences === CosNull && return pdfont

    d = Dict()
    cid = 0
    # Track "a code has been seen" separately from the code value: 0 is a
    # legitimate character code, so it cannot double as the "unset" sentinel.
    seen_code = false
    for v in get(differences)
        if v isa CosInt
            cid = get(v)
            seen_code = true
        else
            # Explicit check instead of `@assert`: this validates document
            # content and must not depend on assertions being enabled.
            seen_code || throw(ErrorException(
                "Invalid Differences array: glyph name found before any character code"))
            d[cid] = v
            cid += 1
        end
    end

    merge!(pdfont.encoding, dict_remap(d, AGL_Glyph_to_Unicode))
    return pdfont
end
158190

159191
function populate_font_encoding(page, font, fontname)
@@ -163,7 +195,7 @@ function populate_font_encoding(page, font, fontname)
163195
#diff = cosDocGetObject(page.doc.cosDoc, get(font, cn"Differences"))
164196
toUnicode = cosDocGetObject(page.doc.cosDoc, get(font, cn"ToUnicode"))
165197
#pdfont.toUnicode = read_cmap(toUnicode)
166-
merge_encoding(pdfont, encoding, page, font)
198+
merge_encoding!(pdfont, encoding, page, font)
167199
page.fonts[fontname] = pdfont
168200
end
169201
end

src/Utils.jl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,6 @@ function dict_remap(ab, bc)
4242
d = Dict()
4343
for (a, b) in ab
4444
c = get(bc, b, zero(valtype(bc)))
45-
if a == 0x20
46-
println("a $a: b $b :c -$c-")
47-
end
4845
d[a] = c
4946
end
5047
return d

test/runtests.jl

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,6 @@ include("debugIO.jl")
3030
pdDocClose(doc)
3131
length(utilPrintOpenFiles()) == 0
3232
end
33-
@test_throws ErrorException begin
34-
doc = pdDocOpen("files/1.pdf")
35-
page = pdDocGetPage(doc, 1)
36-
pdPageExtractText(IOBuffer(), page)
37-
end
3833
end
3934

4035
@testset "PDF File with ObjectStreams" begin
@@ -81,7 +76,6 @@ include("debugIO.jl")
8176
try
8277
npage= pdDocGetPageCount(doc)
8378
for i=1:npage
84-
println(i)
8579
page = pdDocGetPage(doc, i)
8680
if pdPageIsEmpty(page) == false
8781
pdPageGetContentObjects(page)

0 commit comments

Comments
 (0)