Skip to content

Commit 4e0a8b6

Browse files
committed
Using text layout in text extraction.
Now will work for non-tagged files as well.
1 parent 0ce7fa2 commit 4e0a8b6

File tree

7 files changed

+6267
-491
lines changed

7 files changed

+6267
-491
lines changed

src/PDPage.jl

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,17 +64,14 @@ end
6464
```
6565
pdPageExtractText(io::IO, page::PDPage) -> IO
6666
```
67-
Extracts the text from the `page`. This extraction works best for tagged PDF files only.
67+
Extracts the text from the `page`. This extraction works best for tagged PDF files.
6868
For PDFs not tagged, some line and word breaks will not be extracted properly.
6969
"""
7070
function pdPageExtractText(io::IO, page::PDPage)
    # Fresh graphics-state stack; the page is stored on it because the `Tf`
    # handler reads `:page` to resolve font resources.
    gstack = init_graphics_state()
    gstack[end][:page] = page

    # Pass 1: interpret the page content and collect positioned text runs.
    contents = pdPageGetContentObjects(page)
    evalContent!(contents, gstack)

    # Pass 2: write the collected runs to `io` in layout order.
    show_text_layout!(io, gstack)
    return io
end
8077

src/PDPageElement.jl

Lines changed: 216 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ export PDPageObject,
99
PDPage_EndGroup
1010

1111
using BufferedStreams
12-
import Base: show
12+
import Base: show, isless
1313

1414
"""
1515
```
@@ -51,15 +51,15 @@ abstract type PDPageObject end
5151
A representation of a content object with operator and operand. See [`PDPageObject`](@ref)
5252
for more details.
5353
"""
54-
mutable struct PDPageElement <: PDPageObject
54+
mutable struct PDPageElement{S} <: PDPageObject
    # Operator symbol; the outer constructor below sets it to the same value as `S`.
    t::Symbol
    # NOTE(review): presumably the PDF version that introduced the operator - confirm.
    version::Tuple{Int,Int}
    # Operand count passed to the constructor (defaults to 0 there).
    noperand::Int
    # Operand objects attached to this operator; starts out empty.
    operands::Vector{CosObject}
end

# Build an element from the operator text. The operator symbol is used both as
# the type parameter (so that methods can dispatch on it) and as the `t` field.
function PDPageElement(ts::AbstractString, ver::Tuple{Int,Int}, nop::Int=0)
    op = Symbol(ts)
    return PDPageElement{op}(op, ver, nop, Vector{CosObject}())
end
6363

6464
function show(io::IO, e::PDPageElement)
6565
for op in e.operands
@@ -219,8 +219,6 @@ function collect_object(grp::PDPageObjectGroup, tr::PDPageTextRun,
219219
return tr
220220
end
221221

222-
223-
224222
function collect_inline_image(img::PDPageInlineImage, name::CosName,
225223
bis::BufferedInputStream)
226224
value = parse_value(bis, get_pdfcontentops)
@@ -231,13 +229,13 @@ function collect_inline_image(img::PDPageInlineImage, elem::PDPageElement,
231229
bis::BufferedInputStream)
232230
if (elem.t == Symbol("ID"))
233231
while(!img.isRead && !eof(bis))
234-
b1 = peek(bis)
232+
b1 = BufferedStreams.peek(bis)
235233
if (b1 == LATIN_UPPER_E)
236234
mark(bis)
237235
skip(bis,1);
238-
b2 = peek(bis)
236+
b2 = BufferedStreams.peek(bis)
239237
if (b2 == LATIN_UPPER_I)
240-
skip(bis,1);b3 = peek(bis)
238+
skip(bis,1);b3 = BufferedStreams.peek(bis)
241239
if (ispdfspace(b3))
242240
skip(bis,1)
243241
img.isRead=true
@@ -458,70 +456,236 @@ const PD_CONTENT_OPERATORS = Dict(
458456
# Look up the operator token `b` in `PD_CONTENT_OPERATORS` and build the
# corresponding object. Returns `CosNull` when the token is not a known operator.
function get_pdfcontentops(b::Vector{UInt8})
    spec = get(PD_CONTENT_OPERATORS, String(b), CosNull)
    if spec == CosNull
        return CosNull
    end
    # NOTE(review): the table entry is turned into a call expression and run
    # through `eval` on every lookup. The expression comes from the constant
    # table, not from the token itself, but a direct constructor call would
    # avoid runtime `eval` altogether.
    return eval(Expr(:call, spec...))
end
461+
462+
"""
    TextLayout

A piece of text together with the six matrix entries that position it.
`a`, `b`, `c`, `d` are the scale/rotation entries and `x`, `y` the translation;
`text` is the decoded string to be written at that position.
"""
struct TextLayout
    a::Float32
    b::Float32
    c::Float32
    d::Float32
    x::Float32
    y::Float32
    text::String
end

"""
    isless(tl1::TextLayout, tl2::TextLayout) -> Bool

Ordering used to pop text runs from a max-heap in reading order.

A run that sits lower on the page (smaller `y`) is "less". Runs whose `y`
values differ by no more than half a line height are treated as being on the
same line, and there the run further to the right (larger `x`) is "less".

The line-height tolerance is taken from the magnitude of the larger `d` of the
two runs. Using the magnitude keeps the tolerance non-negative for mirrored
text (negative `d`), and using both operands makes the same-line decision
independent of the argument order.
"""
function Base.isless(tl1::TextLayout, tl2::TextLayout)
    dy = tl1.y - tl2.y
    ytol = max(abs(tl1.d), abs(tl2.d)) / 2

    dy < -ytol && return true
    dy > ytol && return false
    # Same line: the run that starts further right sorts lower.
    return tl1.x > tl2.x
end
481+
482+
using DataStructures
483+
484+
# Create the graphics-state stack used while evaluating page content.
# The stack starts with a single dictionary holding the text-layout heap,
# an identity CTM and the initial text-state parameters.
function init_graphics_state()
    gs = Dict()

    # Text runs collected during evaluation are pushed onto this heap.
    gs[:text_layout] = mutable_binary_maxheap(TextLayout)

    # Graphics state
    gs[:CTM] = eye(3)

    # Text state
    text_defaults = ((:Tc, 0.0), (:Tw, 0.0), (:Tz, 100.0),
                     (:TL, 0.0), (:Tr, 0), (:Ts, 0.0))
    for (key, value) in text_defaults
        gs[key] = value
    end

    state = Vector{Dict}()
    push!(state, gs)
    return state
end
502+
503+
# Write the text runs collected in `state[end][:text_layout]` to `io`.
#
# Runs are popped from the heap one at a time (the heap is emptied). For each
# run a character cell size `(w, h)` is derived from its matrix entries; line
# breaks are emitted while the previous `y` is more than one cell height above
# the run, and spaces are emitted until the cursor reaches the run's `x`.
#
# Runs that cannot be placed on a character grid (cell size not above 0.1, or
# a non-finite size or position) are skipped instead of aborting the page:
# such values would otherwise raise or make the padding loops below run
# without end.
function show_text_layout!(io::IO, state::Vector{Dict})
    heap = state[end][:text_layout]
    min_cell = 0.1
    # Cursor position of the text emitted so far; kept as Float64 throughout.
    x = 0.0
    y = -1.0
    while !isempty(heap)
        tlayout = pop!(heap)

        if abs(tlayout.b) < 0.001 && abs(tlayout.c) < 0.001
            # Horizontal text
            w = Float64(abs(tlayout.a))
            h = Float64(abs(tlayout.d))
        else
            # Vertical or any other angle text. The determinant is negative
            # for mirrored matrices, so take its magnitude before the root.
            det = Float64(tlayout.a)*Float64(tlayout.d) -
                  Float64(tlayout.b)*Float64(tlayout.c)
            w = h = sqrt(abs(det))
            y = -1.0 # Reset the old positions.
        end

        tx = Float64(tlayout.x)
        ty = Float64(tlayout.y)
        placeable = w > min_cell && h > min_cell && isfinite(tx) && isfinite(ty)
        placeable || continue

        # Line breaks for every full cell height between the previous run and this one.
        while y > ty + h
            print(io, '\n')
            y -= h
            x = 0.0
        end
        y = ty

        # Never move the cursor backwards by printing; just resynchronise it.
        x = min(x, tx)
        while x < tx
            print(io, ' ')
            x += w
        end

        print(io, tlayout.text)
        x += w*length(tlayout.text)
    end
    return nothing
end
463540

464-
function showtext(io::IO, grp::PDPageObjectGroup, state::Vector{Dict}=Vector{Dict}())
541+
# Evaluate every object of the group, in order, against the same state stack.
function evalContent!(grp::PDPageObjectGroup, state::Vector{Dict}=Vector{Dict}())
    foreach(child -> evalContent!(child, state), grp.objs)
    return state
end
470547

471-
function showtext(io::IO, tr::PDPageTextRun, state::Vector{Dict}=Vector{Dict}())
548+
# Evaluate a text-showing run: apply the operator itself (which may move the
# text position), compute the text rendering matrix, decode the strings of the
# run and push the result onto the layout heap.
#
# Text inside an Artifact (`:in_artifact` set on the current state) is decoded
# but not recorded.
#
# NOTE(review): the text matrix is not advanced by the width of the shown
# text here, so consecutive runs without a positioning operator in between
# are recorded at the same position - confirm this is acceptable.
function evalContent!(tr::PDPageTextRun, state::Vector{Dict}=Vector{Dict}())
    evalContent!(tr.elem, state)
    gs = state[end]

    tfs = get(gs, :fontsize, 0)
    th = gs[:Tz]/100.0
    ts = gs[:Ts]

    # Font-size / horizontal-scaling / rise matrix; identity until a font size is known.
    tsm = tfs == 0 ? eye(3) : [tfs*th 0.0 0.0; 0.0 tfs 0.0; 0.0 ts 1.0]
    trm = tsm*gs[:Tm]*gs[:CTM]

    # Only the font name is needed for decoding; the font object itself is unused here.
    fontname, _ = get(gs, :font, (CosNull, CosNull))
    page = get(gs, :page, CosNull)

    # Accumulate in a buffer instead of re-allocating the string for every piece.
    buf = IOBuffer()
    for s in tr.ss
        print(buf, String(get_encoded_string(s, fontname, page)))
    end
    text = String(take!(buf))

    heap = gs[:text_layout]
    if !get(gs, :in_artifact, false)
        push!(heap, TextLayout(trm[1, 1], trm[1, 2], trm[2, 1], trm[2, 2],
                               trm[3, 1], trm[3, 2], text))
    end
    return state
end
481581

482-
showtext(io::IO, pdo::PDPageTextObject, state::Vector{Dict}=Vector{Dict}()) =
483-
showtext(io, pdo.group, state)
582+
# Evaluate a text object: the text matrices exist only while the object's
# group is being evaluated. They start as identity and are removed afterwards.
function evalContent!(pdo::PDPageTextObject, state::Vector{Dict}=Vector{Dict}())
    text_matrices = (:Tm, :Tlm, :Trm)
    for key in text_matrices
        state[end][key] = eye(3)
    end
    evalContent!(pdo.group, state)
    for key in text_matrices
        delete!(state[end], key)
    end
    return state
end
484592

485-
function showtext(io::IO, pdo::PDPageMarkedContent, state::Vector{Dict})
593+
# Evaluate a marked-content sequence. Content tagged `Artifact` is still
# evaluated (so that state changes inside it take effect) but is flagged with
# `:in_artifact` so that its text is not recorded.
#
# The flag is only cleared by the sequence that set it: for an Artifact nested
# inside another Artifact the flag stays set until the outer one finishes.
function evalContent!(pdo::PDPageMarkedContent, state::Vector{Dict})
    tag = pdo.group.objs[1].operands[1] # can be used for XML tagging.
    tag == cn"Artifact" || return evalContent!(pdo.group, state)

    was_in_artifact = get(state[end], :in_artifact, false)
    state[end][:in_artifact] = true
    evalContent!(pdo.group, state)
    if !was_in_artifact
        delete!(state[end], :in_artifact)
    end
    return state
end
603+
604+
# Fallback for operators without a dedicated method: the state is left untouched.
function evalContent!(pdo::PDPageElement{S}, state::Vector{Dict}) where {S}
    return state
end
605+
606+
# `q`: save the current state by pushing a (shallow) copy of it onto the stack.
function evalContent!(pdo::PDPageElement{:q}, state::Vector{Dict})
    snapshot = copy(state[end])
    push!(state, snapshot)
    return state
end
610+
611+
# `Q`: restore the previously saved state by dropping the top of the stack.
#
# The bottom entry is never removed: it holds the page and the text-layout
# heap, and every other handler reads `state[end]`. An unbalanced `Q` in the
# content is therefore ignored instead of emptying the stack.
function evalContent!(pdo::PDPageElement{:Q}, state::Vector{Dict})
    length(state) > 1 && pop!(state)
    return state
end
615+
616+
# `Tm`: set both the text matrix and the text line matrix from the six operands.
function evalContent!(pdo::PDPageElement{:Tm}, state::Vector{Dict})
    a, b, c, d, e, f = ntuple(i -> get(pdo.operands[i]), 6)
    tm = [a b 0.0; c d 0.0; e f 1.0]
    gs = state[end]
    gs[:Tm] = tm
    # The line matrix gets its own array with the same values.
    gs[:Tlm] = copy(tm)
    return state
end
490629

491-
function showtext(io::IO, pdo::PDPageElement, state::Vector{Dict}=Vector{Dict}())
630+
# `Tf`: select font and font size.
#
# The size is recorded first and independently of the font lookup: the layout
# code only needs the size, so it must not keep a stale value from an earlier
# `Tf` just because the page is unknown or the font resource cannot be found.
# `:font` is only updated when the lookup succeeds.
function evalContent!(pdo::PDPageElement{:Tf}, state::Vector{Dict})
    if length(pdo.operands) >= 2
        state[end][:fontsize] = get(pdo.operands[2])
    end

    page = get(state[end], :page, CosNull)
    page === CosNull && return state

    fontname = pdo.operands[1]
    font = page_find_font(page, fontname)
    font === CosNull && return state

    state[end][:font] = (fontname, font)
    return state
end
641+
642+
# Text-state operators that take a single operand: generate one method per
# operator which stores the operand under the operator's own symbol.
for op in (:Tc, :Tw, :Tz, :TL, :Tr, :Ts)
    key = QuoteNode(op)
    @eval function evalContent!(pdo::PDPageElement{$key}, state::Vector{Dict})
        state[end][$key] = get(pdo.operands[1])
        return state
    end
end
646+
647+
# Move to the start of a new line, offset by `(tx, ty)` from the start of the
# current line: the translation is applied to the text line matrix and the
# result becomes both the text matrix and the text line matrix.
function set_text_pos!(tx, ty, state::Vector{Dict})
    gs = state[end]
    moved = [1.0 0.0 0.0; 0.0 1.0 0.0; tx ty 1.0]*gs[:Tlm]
    gs[:Tm] = moved
    gs[:Tlm] = moved
    return state
end
656+
657+
# Move to the next line using the current leading `:TL` as the downward offset.
function offset_text_leading!(state::Vector{Dict})
    leading = state[end][:TL]
    return set_text_pos!(0, -leading, state)
end
661+
662+
# `TD`: like `Td`, but additionally sets the leading to `-ty`.
function evalContent!(pdo::PDPageElement{:TD}, state::Vector{Dict})
    tx, ty = get(pdo.operands[1]), get(pdo.operands[2])
    state[end][:TL] = -ty
    return set_text_pos!(tx, ty, state)
end
669+
670+
# `Td`: move to the next line, offset by the two operands.
function evalContent!(pdo::PDPageElement{:Td}, state::Vector{Dict})
    tx, ty = get(pdo.operands[1]), get(pdo.operands[2])
    return set_text_pos!(tx, ty, state)
end
676+
677+
# `T*`: move to the start of the next line using the current leading.
function evalContent!(pdo::PDPageElement{Symbol("T*")}, state::Vector{Dict})
    return offset_text_leading!(state)
end
679+
680+
# `'`: move to the next line; the text itself is handled by the enclosing text run.
function evalContent!(pdo::PDPageElement{Symbol("\'")}, state::Vector{Dict})
    return offset_text_leading!(state)
end
682+
683+
# `"`: set word spacing (first operand) and character spacing (second operand),
# then move to the next line.
function evalContent!(pdo::PDPageElement{Symbol("\"")}, state::Vector{Dict})
    gs = state[end]
    gs[:Tw] = get(pdo.operands[1])
    gs[:Tc] = get(pdo.operands[2])
    return offset_text_leading!(state)
end
524688

525-
showtext(io::IO, pdo::PDPageInlineImage, state::Vector{Dict}=Vector{Dict}()) = io
689+
# Inline images do not affect the state.
function evalContent!(pdo::PDPageInlineImage, state::Vector{Dict}=Vector{Dict}())
    return state
end
526690

527-
showtext(io::IO, pdo::CosObject, state::Vector{Dict}=Vector{Dict}()) = io
691+
# Bare COS objects in the content (e.g. operands) do not affect the state.
function evalContent!(pdo::CosObject, state::Vector{Dict}=Vector{Dict}())
    return state
end

0 commit comments

Comments
 (0)