Skip to content

Commit 6adcef0

Browse files
committed
Modified test cases
1. Fixed CMap-related file handling issues. 2. Added graphics state support for font selection. 3. Inferred LF from the text matrix on Tm. 4. Removed LF on P-type BDC or BMC in tagged PDFs. 5. Removed /Artifact-typed tags from text extraction.
1 parent 13e448d commit 6adcef0

File tree

14 files changed

+475
-41
lines changed

14 files changed

+475
-41
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
*.jl.cov
22
*.jl.*.cov
33
*.jl.mem
4+
*.res
45
docs/build/
56
docs/site/
67
test/downloads
78
test/*.pdf
9+
test/*.res
810
test/pvt
911
file.txt

A1947-15.txt

-4.55 KB
Binary file not shown.

src/BufferParser.jl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@ export skipv,
1818
error("Found '$(UInt8(ch))' Expected '$(Char(c))' here")
1919
end
2020

21-
@inline skipv(ps::BufferedInputStream, cs::UInt8...) = for c in cs skipv(ps, c) end
22-
2321
@inline skipv(ps::BufferedInputStream, cs::Vector{UInt8}) = for c in cs skipv(ps, c) end
2422

2523
@inline advance!(ps::BufferedInputStream) = read(ps,UInt8)

src/PDFonts.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,22 +113,25 @@ function get_encoded_string(s::CosString, fum::FontUnicodeMapping)
113113
carr = NativeEncodingToUnicode(Vector{UInt8}(s), fum.encoding)
114114
return String(carr)
115115
end
116+
# {UInt8, CosObject}
116117

117-
function get_unicode_chars(b::UInt8, itv::IntervalValue{UInt8, CosObject})
118+
function get_unicode_chars(b::UInt8, itv::IntervalValue)
118119
f = first(itv)
119120
l = last(itv)
120121
v = value(itv)
121122
if v isa CosXString
122123
bytes = Vector{UInt8}(v)
123124
carr = get_unicode_chars(bytes)
124125
carr[1] += (b - f) # Only one char should be generated here
125-
else
126+
elseif v isa CosArray
126127
@assert v isa CosArray
127128
arr = get(v)
128129
xstr = arr[b - f + 1]
129130
@assert xstr isa CosXString
130131
bytes = Vector{UInt8}(xstr)
131132
carr = get_unicode_chars(bytes)
133+
else
134+
@assert 1 == 0
132135
end
133136
return carr
134137
end

src/PDPage.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,11 @@ For PDFs not tagged, some line and word breaks will not be extracted properly.
6969
"""
7070
function pdPageExtractText(io::IO, page::PDPage)
7171
# page.doc.isTagged != :tagged && throw(ErrorException(E_NOT_TAGGED_PDF))
72-
state = Dict()
73-
state[:page] = page
72+
state = Vector{Dict}()
73+
yloc = Vector{Float32}()
74+
push!(state, Dict())
75+
state[end][:page] = page
76+
state[end][:yloc] = yloc
7477
showtext(io, pdPageGetContentObjects(page), state)
7578
return io
7679
end

src/PDPageElement.jl

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -457,48 +457,69 @@ const PD_CONTENT_OPERATORS = Dict(
457457

458458
function get_pdfcontentops(b::Vector{UInt8})
459459
arr = get(PD_CONTENT_OPERATORS, String(b), CosNull)
460-
if (arr == CosNull)
461-
return CosNull
462-
else
463-
return eval(Expr(:call,arr...))
464-
end
460+
(arr == CosNull) && return CosNull
461+
return eval(Expr(:call,arr...))
465462
end
466463

467-
function showtext(io::IO, grp::PDPageObjectGroup, state::Dict=Dict())
464+
function showtext(io::IO, grp::PDPageObjectGroup, state::Vector{Dict}=Vector{Dict}())
468465
for obj in grp.objs
469466
showtext(io, obj, state)
470467
end
471468
return io
472469
end
473470

474-
function showtext(io::IO, tr::PDPageTextRun, state::Dict=Dict())
475-
fontname, font = get(state, :font, (CosNull, CosNull))
476-
page = get(state, :page, CosNull)
471+
function showtext(io::IO, tr::PDPageTextRun, state::Vector{Dict}=Vector{Dict}())
472+
fontname, font = get(state[end], :font, (CosNull, CosNull))
473+
page = get(state[end], :page, CosNull)
474+
(tr.elem.t == Symbol("\'") || tr.elem.t == Symbol("\"")) && print(io, " LF\n")
477475
for s in tr.ss
478-
text = get_encoded_string(s, fontname, page)
476+
text = String(get_encoded_string(s, fontname, page))
479477
write(io, text)
480478
end
481479
return io
482480
end
483481

484-
showtext(io::IO, pdo::PDPageTextObject, state::Dict=Dict()) = showtext(io, pdo.group, state)
482+
showtext(io::IO, pdo::PDPageTextObject, state::Vector{Dict}=Vector{Dict}()) =
483+
showtext(io, pdo.group, state)
485484

486-
function showtext(io::IO, pdo::PDPageMarkedContent, state::Dict)
485+
function showtext(io::IO, pdo::PDPageMarkedContent, state::Vector{Dict})
487486
tag = pdo.group.objs[1].operands[1] # can be used for XML tagging.
488-
showtext(io, pdo.group, state)
489-
print(io, '\n')
490-
return io
487+
tag == cn"Artifact" && return io # Do not print Artifact types
488+
return showtext(io, pdo.group, state)
491489
end
492490

493-
function showtext(io::IO, pdo::PDPageElement, state::Dict=Dict())
494-
page = get(state, :page, CosName)
491+
function showtext(io::IO, pdo::PDPageElement, state::Vector{Dict}=Vector{Dict}())
492+
page = get(state[end], :page, CosNull)
495493
page === CosNull && return io
494+
if pdo.t == :q
495+
push!(state, copy(state[end]))
496+
return io
497+
elseif pdo.t == :Q
498+
pop!(state)
499+
return io
500+
end
501+
pdo.t == Symbol("T*") && return print(io, "\n")
502+
(pdo.t == :Td || pdo.t == :TD) && get(pdo.operands[2]) > 0 &&
503+
return print(io, "\n")
504+
# If the previous text matrix was at a value higher than the current in y-axis
505+
# by 1-unit enter a line-break.
506+
if pdo.t == :Tm
507+
nyloc = get(pdo.operands[6])
508+
oyloc = -10000
509+
yloc = get(state[end], :yloc, Vector{Float32}())
510+
if length(yloc) != 0
511+
oyloc = yloc[end]
512+
end
513+
if (nyloc < oyloc - 1); print(io, '\n'); end
514+
push!(yloc, nyloc)
515+
return io
516+
end
496517
pdo.t != :Tf && return io
497518
fontname = pdo.operands[1]
498519
font = page_find_font(page, fontname)
499520
font === CosNull && return io
500-
state[:font] = (fontname, font)
521+
state[end][:font] = (fontname, font)
501522
return io
502523
end
503524

504-
showtext(io::IO, pdo::CosObject, state::Dict=Dict()) = (show(io, pdo); io)
525+
showtext(io::IO, pdo::CosObject, state::Vector{Dict}=Vector{Dict}()) = (show(io, pdo); io)

0 commit comments

Comments
 (0)