Skip to content

Commit 7585203

Browse files
author
Ole Liabø
committed
Support for journals and PDF with OCR data creation.
1 parent 01e5b63 commit 7585203

36 files changed

+1234
-189
lines changed

formats/epjark.conf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,16 @@ INFOVIEW_IMPORT_TYPE_REGEXP_NN=importFilnavn@filReferanse
4242
INFOVIEW_FOLDER_TYPE_REGEXP_EN=reportsFolder
4343
INFOVIEW_FOLDER_TYPE_REGEXP_NB=rapportMappe
4444
INFOVIEW_FOLDER_TYPE_REGEXP_NN=rapportMappe
45+
46+
; Journal nodes in info view. All journal nodes can have a set of attachments associated with them. Attachments can be
47+
; pages and OCR-data for the pages. Three parameters separated by 'at sign' (@):
48+
; 1. Node regexp
49+
; 2. Pages wildcard
50+
; 3. OCR wildcard
51+
INFO_VIEW_JOURNAL_TYPE_REGEXP=pasientjournal@%rootdir%/../objekter/????????_%fanearkidentifikator%_?_?.jpg@%rootdir%/../derivater/????????_%fanearkidentifikator%_?_?.xml
52+
53+
; Tool responsible for converting journal pages and optional OCR files into a PDF document
54+
; %FILESFILE% - File containing pages to be included, one page per line: <page-image-file>;<page-ocr-file>
55+
; %OUT% - Filename of the PDF to create
56+
; %TEMP% - Output folder
57+
JOURNAL_PDF_CREATOR_TOOL="./pdf/create-pdf.cmd %OUT% %FILESFILE% %TEMP%"

formats/nha-sip.conf

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,14 @@ INFOVIEW_DOCUMENT_TYPE_REGEXP=filename[^.*\.txt$]@filename[^.*\.xml$]@filename[^
3535

3636
; Load nodes in info view. All nodes matching the list of 'at sign' (@) separated regexps will get a 'Load'
3737
; button behind them.
38-
INFOVIEW_IMPORT_TYPE_REGEXP_EN=filename[^.*\.xml$]
39-
INFOVIEW_IMPORT_TYPE_REGEXP_NB=filename[^.*\.xml$]
40-
INFOVIEW_IMPORT_TYPE_REGEXP_NN=filename[^.*\.xml$]
38+
INFOVIEW_IMPORT_TYPE_REGEXP_EN=filename[^.*\.xml$]@filename[^.*\.hocr$]
39+
INFOVIEW_IMPORT_TYPE_REGEXP_NB=filename[^.*\.xml$]@filename[^.*\.hocr$]
40+
INFOVIEW_IMPORT_TYPE_REGEXP_NN=filename[^.*\.xml$]@filename[^.*\.hocr$]
41+
42+
; Auto load nodes will be loaded into tree view when importing XML
43+
INFOVIEW_AUTO_IMPORT_REGEXP_EN=filename[^.*avlxml\\.xml$]
44+
INFOVIEW_AUTO_IMPORT_REGEXP_NB=filename[^.*avlxml\\.xml$]
45+
INFOVIEW_AUTO_IMPORT_REGEXP_NN=filename[^.*avlxml\\.xml$]
4146

4247
; Folder nodes in info view. All nodes matching the list of 'at sign' (@) separated regexps will get an 'Open Folder'
4348
; button behind them.

insight.conf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ SEARCH_RESULT_MAX_COUNT=100
3333
;REPORTS_DIR="e:\\archivator\\innsyn\\rapporter"
3434
REPORTS_DIR=".\\rapporter"
3535

36+
; Export report folder - if a fixed folder is defined, the UI will only allow exporting to this
37+
; folder and below. If this folder is not defined, the last used folder will be default.
38+
; FIXED_REPORT_EXPORT_FOLDER=".\\eksport"
39+
3640
; Dataset to load at startup, useful for testing
3741
;STARTUP_LOAD_FILE=../arkivstruktur2.xml
3842
;STARTUP_LOAD_FILE=arkivstruktur2.xml

insight.pro

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ CONFIG += qt debug_and_release
1313
CONFIG += lrelease
1414
macx:CONFIG += app_bundle
1515
TARGET = insight
16-
DESTDIR = ./innsyn-v1.1.0
16+
DESTDIR = ./innsyn-v1.2.0
1717

1818
## SUPPORT DEBUG AND RELEASE BUILDS ##
1919
!debug_and_release|build_pass {
@@ -120,22 +120,26 @@ TOOLS_SOURCES = src/thirdparty/tools/src/derror.cpp \
120120

121121
GUI_SOURCES = src/gui/dinsightmainwindow.cpp \
122122
src/gui/dinsightreportwindow.cpp \
123+
src/gui/dinsightjournalwindow.cpp \
123124
src/gui/dtreeview.cpp \
124125
src/gui/dtreeitem.cpp \
125126
src/gui/dtreemodel.cpp \
126127
src/gui/dimport.cpp \
127128
src/gui/dwaitcursor.cpp \
128129
src/gui/qpersistantfiledialog.cpp \
129-
src/gui/qaboutdialog.cpp
130+
src/gui/qaboutdialog.cpp \
131+
src/gui/dfixedfolderdialog.cpp
130132
GUI_HEADERS = src/gui/dinsightmainwindow.h \
131133
src/gui/dinsightreportwindow.h \
134+
src/gui/dinsightjournalwindow.h \
132135
src/gui/dtreeview.h \
133136
src/gui/dtreeitem.h \
134137
src/gui/dtreemodel.h \
135138
src/gui/dimport.h \
136139
src/gui/dwaitcursor.h \
137140
src/gui/qpersistantfiledialog.h \
138-
src/gui/qaboutdialog.h
141+
src/gui/qaboutdialog.h \
142+
src/gui/dfixedfolderdialog.h
139143

140144
ZIP_HEADERS = src/thirdparty/quazip-0.7.3/quazip/crypt.h \
141145
src/thirdparty/quazip-0.7.3/quazip/ioapi.h \
@@ -169,7 +173,9 @@ ZIP_SOURCES = src/thirdparty/quazip-0.7.3/quazip/unzip.c \
169173
src/thirdparty/quazip-0.7.3/quazip/quazipnewinfo.cpp
170174

171175
FORMS = src/gui/dinsightmainwindow.ui \
176+
src/gui/dinsightjournalwindow.ui \
172177
src/gui/dinsightreportwindow.ui \
178+
src/gui/fixedfolderdialog.ui \
173179
src/gui/qaboutdialog.ui
174180

175181
RESOURCES = resources.qrc
@@ -189,6 +195,7 @@ SOURCES += src/main.cpp \
189195
src/dimportformat.cpp \
190196
src/dosxtools.cpp \
191197
src/dleafmatcher.cpp \
198+
src/djournalmatcher.cpp \
192199
$$GUI_SOURCES \
193200
$$ZIP_SOURCES \
194201
$$POSIXTAR_SOURCES
@@ -202,8 +209,10 @@ HEADERS += src/drunguard.h \
202209
src/dinsightconfig.h \
203210
src/dinsightreport.h \
204211
src/dimportformat.h \
212+
src/dpendingimport.h \
205213
src/dosxtools.h \
206214
src/dleafmatcher.h \
215+
src/djournalmatcher.h \
207216
$$GUI_HEADERS \
208217
$$ZIP_HEADERS \
209218
$$TOOLS_INCLUDES \

insight.sln

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,20 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "insight", "insight.vcxproj"
77
EndProject
88
Global
99
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|x64 = Debug|x64
1011
Debug|x86 = Debug|x86
12+
Release|x64 = Release|x64
1113
Release|x86 = Release|x86
1214
EndGlobalSection
1315
GlobalSection(ProjectConfigurationPlatforms) = postSolution
16+
{70A4233F-C4C2-3BB7-8203-2EA675BA8B91}.Debug|x64.ActiveCfg = Debug|x64
17+
{70A4233F-C4C2-3BB7-8203-2EA675BA8B91}.Debug|x64.Build.0 = Debug|x64
1418
{70A4233F-C4C2-3BB7-8203-2EA675BA8B91}.Debug|x86.ActiveCfg = Debug|x64
1519
{70A4233F-C4C2-3BB7-8203-2EA675BA8B91}.Debug|x86.Build.0 = Debug|x64
20+
{70A4233F-C4C2-3BB7-8203-2EA675BA8B91}.Release|x64.ActiveCfg = Release|x64
21+
{70A4233F-C4C2-3BB7-8203-2EA675BA8B91}.Release|x64.Build.0 = Release|x64
1622
{70A4233F-C4C2-3BB7-8203-2EA675BA8B91}.Release|x86.ActiveCfg = Release|x64
23+
{70A4233F-C4C2-3BB7-8203-2EA675BA8B91}.Release|x86.Build.0 = Release|x64
1724
EndGlobalSection
1825
GlobalSection(SolutionProperties) = preSolution
1926
HideSolutionNode = FALSE

lesmeg.org

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
#+TITLE:KDRS Innsyn v1.0.0
1+
#+TITLE:Piql Innsyn v1.2.0
22
#+AUTHOR:Piql AS
33
#+EMAIL:ole.liabo@piql.com
4-
#+DATE:2018.06.01
4+
#+DATE:2021.05.24
55
#+OPTIONS: ^:nil
6-
#+DESCRIPTION:KDRS Innsyn
6+
#+DESCRIPTION:Piql Innsyn
77
#+LANGUAGE: no
8-
#+CREATOR: Cooyright (c) 2020 <a href="http://www.piql.com">Piql AS</a>
8+
#+CREATOR: Copyright (c) 2021 <a href="http://www.piql.com">Piql AS</a>
99
#+latex_header: \hypersetup{colorlinks=true,linkcolor=blue}
1010
# #+TOC: headlines 5
1111

@@ -42,7 +42,7 @@ Copyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)
4242

4343
* Installasjon
4444

45-
1. Pakk ut /innsyn-v1.1.0.zip/.
45+
1. Pakk ut /innsyn-v1.2.0.zip/.
4646
2. Indeksering av vedlegg kan bruke en del plass, rediger derfor eventuelt
4747
/REPORTS_DIR/ i /insight.conf/ til å peke på en katalog med mye ledig
4848
diskplass.
@@ -53,7 +53,7 @@ Copyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)
5353

5454
[[./screenshot.png]]
5555

56-
1. Kjør /insight.exe/ (mac/linux: insight) fra katalogen /innsyn-v1.1.0/.
56+
1. Kjør /insight.exe/ (mac/linux: insight) fra katalogen /innsyn-v1.2.0/.
5757
2. Brukergrensesnittet har fire hovedelementer:
5858
1) *Nodetre*: Viser alle elementene i uttrekket i en trestruktur etter en import.
5959
Hver node i treet tilsvarer en XML knagg. Nodetreet viser også tidligere
@@ -115,10 +115,27 @@ Etter import genereres en PDF-rapport i rapport katalogen, bestemt av
115115
formatet: /REPORTS_DIR\åååå\MM\DD\TTMMSS\/. I rapport katalogen
116116
legges også loggfiler fra indekseringen og indekserings databasen.
117117

118+
* Journaler
119+
120+
[[./journal.png]]
121+
122+
For noen XML strukturer kan det være en 1 til mange relasjon mellom en node i XMLen og filer i arkivpakken. Et eksempel på dette er Norsk Helsearkiv arkivpakker der avlxml filen kan referere til flere digitaliserte sider og tilhørende OCR metadata.
123+
Denne sammenhengen kan defineres i format filen med nøkkelen /INFO_VIEW_JOURNAL_TYPE_REGEXP/. Noder som treffer denne nøkkelen vil få en *Journal* knapp nederst i nodeinformasjon visningen.
124+
Journal visningsmoduset lar brukerne velge ut enkeltsider i journalen som skal eksporteres.
125+
126+
** Generering av PDF journaler med OCR
127+
128+
Journaler støtter visning og eksport av journaler som en /søkbar-PDF/, der hver side består av det digitaliserte bildet (f.eks. en JPG) og den gjenskapte teksten (OCR) som et usynlig lag.
129+
Støttet OCR format er ALTO og HOCR. For mer informasjon se scriptet /pdf\create-pdf.cmd/. For å generere PDFer kreves det at flere verktøy er installert og tilgjengelig i søkestien:
130+
- python: For kjøring av [[https://github.com/piql/HocrConverter][HocrConverter.py]] som genererer søkbare PDFer.
131+
- [[http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/][pdftk]]: Slår sammen flere enkeltstående PDF sider til en PDF.
132+
- [[https://github.com/filak/hOCR-to-ALTO][alto__hocr.xsl]]: XSLT skjema for ALTO til hOCR konvertering
133+
134+
118135
* Loggfiler
119136

120-
- *innsyn-v1.1.0\insight.log*: Applikasjonslogg.
121-
- *innsyn-v1.1.0\insight.dmp*: Genereres hvis applikasjonen krasjer.
137+
- *innsyn-v1.2.0\insight.log*: Applikasjonslogg.
138+
- *innsyn-v1.2.0\insight.dmp*: Genereres hvis applikasjonen krasjer.
122139
- *REPORTS_DIR\YYYY\MM\DD\TTMMSS\attachments.log*: Logg over konvertering av vedlegg til søkbare tekstfiler.
123140
- *REPORTS_DIR\YYYY\MM\DD\TTMMSS\indexer.log*: Logg fra indekserings applikasjonen som genererer indekserings databasen.
124141
- *REPORTS_DIR\YYYY\MM\DD\TTMMSS\sphinx\test1\searchd.log*: Logg fra søkemotoren.
@@ -168,6 +185,11 @@ Ved feilrapportering legg ved loggfiler og ved programkrasj /insight.dmp/ hvis d
168185

169186
* Historikk
170187

188+
** innsyn-v1.2.0-beta1
189+
*** Nye funksjoner
190+
- Støtte for standard ut mappe ved eksport, settes med konfigurasjonsvariabelen /FIXED_REPORT_EXPORT_FOLDER/.
191+
- Støtte for journaler
192+
171193
** 01.07.2020 innsyn-v1.1.0
172194
*** Feilrettinger
173195
- Opptegning av info nodeinformasjon: Vise all tekst for noder, og likt mellomrom mellom alle noder.

pdf/alto2hocr.cmd

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
@echo off
2+
setlocal
3+
set XSLT_TOOL="C:\Program Files\Saxonica\SaxonHE10.5N\bin\transform"
4+
set ALTO2HOCR_XSLT=alto__hocr.xsl
5+
6+
if "%2" NEQ "" SET OUTHOCR="-o:%2"
7+
8+
%XSLT_TOOL% -s:%1 -xsl:%ALTO2HOCR_XSLT% %OUTHOCR%
9+
10+
11+

pdf/alto2pdf.cmd

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
@echo off
2+
setlocal
3+
set IN_JPG=%1
4+
set IN_ALTO=%2
5+
set OUT=%3
6+
7+
rem set DEBUG_PARAMS=-b -v -t
8+
9+
alto2hocr.cmd %IN_ALTO% | python .\HocrConverter\HocrConverter.py -i - -I -V -n -o %OUT% %DEBUG_PARAMS% %IN_JPG%

0 commit comments

Comments
 (0)