Skip to content

Commit 93320b4

Browse files
committed
Extraction of text from tagged PDF.
1 parent 25699ce commit 93320b4

File tree

9 files changed

+275
-136
lines changed

9 files changed

+275
-136
lines changed

src/CosDoc.jl

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,25 @@ mutable struct CosDocImpl <: CosDoc
4646
end
4747
end
4848

49+
"""
50+
```
51+
show(io::IO, doc::CosDoc)
52+
```
53+
Prints the CosDoc. The intent is to print a condensed summary rather than the full contents of the structure.
54+
"""
55+
function show(io::IO, doc::CosDoc)
    # Fixed part of the summary: banner, file path, size and the xref-stream flag.
    header = ("\nCosDoc ==>\n",
              "\tfilepath:\t\t$(doc.filepath)\n",
              "\tsize:\t\t\t$(doc.size)\n",
              "\thasNativeXRefStm:\t $(doc.hasNativeXRefStm)\n",
              "\tTrailer dictionaries: \n")
    print(io, header...)
    # Every entry of `doc.trailer` goes on its own tab-indented line.
    for trailer in doc.trailer
        print(io, '\t', trailer, '\n')
    end
    return nothing
end
67+
4968
"""
5069
```
5170
cosDocClose(doc::CosDoc)
@@ -432,7 +451,7 @@ cosDocGetPageNumbers(doc::CosDoc, catalog::CosObject, label::AbstractString) ->
432451
```
433452
PDF utilizes two pagination schemes: an internal global page number that is maintained
434453
serially as an integer and `PageLabel` that is shown by the viewers. Given a `label` this
435-
method returns a `range` of valid page numbers for the given label.
454+
method returns a `range` of valid page numbers for the given label.
436455
"""
437456
function cosDocGetPageNumbers(doc::CosDoc, catalog::CosObject, label::AbstractString)
438457
ref = get(catalog, cn"PageLabels")

src/CosObject.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ abstract type CosObject end
4343
```
4444
"""
4545
abstract type CosString <: CosObject end
46+
47+
# Return a copy of the `val` field of any `CosString` subtype.
# Uses `where` because the `f{T}(x::T)` method-parameter form is deprecated in Julia 0.6
# and is no longer parsed as a method definition from Julia 1.0 onwards.
@inline get(o::T) where {T<:CosString} = copy(o.val)
4648
"""
4749
```
4850
CosNumeric
@@ -382,7 +384,7 @@ function show(io::IO, o::CosDict)
382384
end
383385

384386
show(io::IO, stm::CosStream) =
385-
(show(io, stm.extent); print(io, "stream\n...\nendstream\n"))
387+
(show(io, stm.extent); print(io, "\nstream\n...\nendstream"))
386388

387389
show(io::IO, os::CosObjectStream) = show(io, os.stm)
388390

src/CosObjectHelpers.jl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,21 @@ function convert(::Type{CDTextString}, xstr::CosXString)
88
const feff = [LATIN_F, LATIN_E, LATIN_F, LATIN_F]
99
const FEFF = [LATIN_UPPER_F, LATIN_UPPER_E, LATIN_UPPER_F, LATIN_UPPER_F]
1010
prefix = xstr.val[1:4]
11+
hasPrefix = (prefix == feff || prefix == FEFF)
12+
isUTF16 = hasPrefix || prefix[1:2] == UInt8[0x00, 0x00]
1113
data = xstr.val
1214
buffer = data |> String |> hex2bytes
13-
if prefix == feff || prefix == FEFF
15+
if isUTF16
1416
len2 = div(length(buffer),2)
15-
utf_16_arr = Vector{UInt16}(len2-1)
17+
utf_16_arr = Vector{UInt16}(hasPrefix ? len2-1 : len2)
1618
utf_16_data = reinterpret(UInt8, utf_16_arr)
1719
if (0x04030201 == ENDIAN_BOM)
1820
for i=1:len2
1921
(buffer[2i-1], buffer[2i]) = (buffer[2i], buffer[2i-1])
2022
end
2123
end
22-
copy!(utf_16_data, 1, buffer, 3, 2len2-2)
24+
hasPrefix ? copy!(utf_16_data, 1, buffer, 3, 2len2-2) :
25+
copy!(utf_16_data, 1, buffer, 1, 2len2)
2326
str = transcode(String, utf_16_arr)
2427
else
2528
# Assume PDFDocEncoding (ISO-8859-1)

src/PDDocImpl.jl

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,30 @@ mutable struct PDDocImpl <: PDDoc
22
cosDoc::CosDoc
33
catalog::CosObject
44
pages::CosObject
5+
structTreeRoot::CosObject
56
isTagged::Symbol #Valid values :tagged, :none and :suspect
67
function PDDocImpl(fp::String)
78
cosDoc = cosDocOpen(fp)
89
catalog = cosDocGetRoot(cosDoc)
9-
new(cosDoc,catalog,CosNull,:none)
10+
new(cosDoc,catalog,CosNull,CosNull,:none)
1011
end
1112
end
1213

14+
"""
15+
```
16+
show(io::IO, doc::PDDoc)
17+
```
18+
Prints the PDDoc. The intent is to print a condensed summary rather than the full contents of the structure.
19+
"""
20+
function Base.show(io::IO, doc::PDDoc)
    # Emit, in order: banner, the wrapped CosDoc, a newline, the catalog object and
    # finally the tagging state.
    print(io,
          "\nPDDoc ==>\n",
          doc.cosDoc,
          '\n',
          "Catalog:",
          doc.catalog,
          "isTagged: $(doc.isTagged)\n")
    return nothing
end
28+
1329
"""
1430
Recursively reads the page object and populates the indirect objects
1531
Ensures indirect objects are read and updated in the xref Dictionary.
@@ -66,6 +82,34 @@ function find_page_from_treenode(node::CosObject, pageno::Int)
6682
throw(ErrorException(E_INVALID_PAGE_NUMBER))
6783
end
6884

85+
"""
```
    StructTreeRoot
```
Holds the entries of a document's structure tree root.

NOTE(review): the field names presumably mirror the `K`, `IDTree`, `ParentTree`,
`ParentTreeNextKey`, `RoleMap` and `ClassMap` keys of the PDF structure tree root
dictionary - confirm against the PDF specification.
"""
mutable struct StructTreeRoot
    # Dict, Array or null
    k::CosObject
    # Name tree
    idTree::Nullable{CosTreeNode{CDTextString}}
    # Number tree
    parentTree::Nullable{CosTreeNode{Int}}
    parentTreeNext::Int
    # Dict or null
    roleMap::CosObject
    # Dict or null
    classMap::CosObject
end
93+
94+
"""
```
    StructElem
```
Holds the entries of a single structure element of the structure tree.

NOTE(review): the field names presumably mirror the `S`, `P`, `ID`, `Pg`, `K`, `A`, `R`,
`T`, `Lang`, `Alt`, `E` and `ActualText` keys of a PDF structure element dictionary -
confirm against the PDF specification.
"""
mutable struct StructElem
    s::CosName
    # Indirect Dict
    p::CosObject
    id::Vector{UInt8}
    # Dict
    pg::CosObject
    # Either a nested structure element or a raw COS object.
    k::Union{StructElem, CosObject}
    a::CosObject
    r::Int
    t::CDTextString
    lang::CDTextString
    alt::CDTextString
    e::CDTextString
    actualText::CDTextString
end
108+
109+
110+
# The structure tree is not fully loaded but the object linkages are established for future
111+
# correlations during text extraction.
112+
69113
function update_structure_tree(doc::PDDocImpl)
70114
catalog = pdDocGetCatalog(doc)
71115
marking = get(catalog, cn"MarkInfo")
@@ -76,5 +120,8 @@ function update_structure_tree(doc::PDDocImpl)
76120
doc.isTagged = (suspect === CosTrue) ? (:suspect) :
77121
(tagged === CosTrue) ? (:tagged) : (:none)
78122
end
123+
124+
structTreeRef = get(catalog, cn"StructTreeRoot")
125+
doc.structTreeRoot = cosDocGetObject(doc.cosDoc, structTreeRef)
79126
return nothing
80127
end

src/PDFIO.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ export PDDoc,
2929
pdPageGetContents,
3030
pdPageIsEmpty,
3131
pdPageGetCosObject,
32-
pdPageGetContentObjects
32+
pdPageGetContentObjects,
33+
pdPageExtractText
3334

3435
using .Cos
3536
export CosDoc,

src/PDPage.jl

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ export PDPage,
22
pdPageGetContents,
33
pdPageIsEmpty,
44
pdPageGetCosObject,
5-
pdPageGetContentObjects
5+
pdPageGetContentObjects,
6+
pdPageExtractText
67

78
abstract type PDPage end
89

@@ -56,6 +57,18 @@ function pdPageGetContentObjects(page::PDPage)
5657
return get(page.content_objects)
5758
end
5859

60+
"""
61+
```
62+
pdPageExtractText(io::IO, page::PDPage) -> IO
63+
```
64+
Extracts the text from the `page`. This extraction works for tagged PDF files only.
65+
"""
66+
function pdPageExtractText(io::IO, page::PDPage)
    # Only documents marked `:tagged` are accepted; `:none` and `:suspect` are rejected.
    if page.doc.isTagged != :tagged
        throw(ErrorException(E_NOT_TAGGED_PDF))
    end
    contents = pdPageGetContentObjects(page)
    showtext(io, contents)
    # Hand the stream back to the caller.
    return io
end
71+
5972
mutable struct PDPageImpl <: PDPage
6073
doc::PDDocImpl
6174
cospage::CosObject

0 commit comments

Comments
 (0)