Skip to content

Commit a35476b

Browse files
authored
Merge pull request #18 from CopticScriptorium/dev
Dev
2 parents 96e9077 + 5705743 commit a35476b

File tree

6 files changed

+121
-22
lines changed

6 files changed

+121
-22
lines changed

api.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
#!/usr/bin/python
1+
#!/usr/local/bin/python2.7
22
# -*- coding: utf-8 -*-
33

4-
from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
4+
#from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
55

66
#Example call on localhost:
77
#http://localhost/coptic-nlp/api.py?data=%E2%B2%81%CF%A5%E2%B2%A5%E2%B2%B1%E2%B2%A7%E2%B2%99%20%E2%B2%9B%CF%AD%E2%B2%93%E2%B2%A1%E2%B2%A3%E2%B2%B1%E2%B2%99%E2%B2%89&lb=line
@@ -17,7 +17,10 @@
1717
if "lb" in storage:
1818
line = storage.getvalue("lb")
1919
else:
20-
line = "noline"
20+
if "<lb" in data:
21+
line = "noline"
22+
else:
23+
line = "line"
2124

2225
if "format" in storage:
2326
format = storage.getvalue("format")
@@ -29,20 +32,22 @@
2932

3033
if format == "pipes":
3134
print("Content-Type: text/plain; charset=UTF-8\n")
32-
processed = nlp_coptic(data,line,sgml_mode="pipes",do_tok=True)
35+
processed = nlp_coptic(data,lb=line=="line",sgml_mode="pipes",do_tok=True)
36+
if "</lb>" in processed:
37+
processed = processed.replace("</lb>","</lb>\n")
3338
print(processed.strip())
3439
elif format == "sgml_no_parse":
3540
print("Content-Type: text/sgml; charset=UTF-8\n")
3641
# secure call, note that htaccess prevents this running without authentication
3742
if "|" in data:
3843
processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True,
3944
do_norm=True, do_tag=True, do_lemma=True, do_lang=True,
40-
do_milestone=True, do_parse=True, sgml_mode="sgml",
45+
do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml",
4146
tok_mode="from_pipes", old_tokenizer=False)
4247
else:
4348
processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True,
4449
do_norm=True, do_tag=True, do_lemma=True, do_lang=True,
45-
do_milestone=True, do_parse=True, sgml_mode="sgml",
50+
do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml",
4651
tok_mode="auto", old_tokenizer=False)
4752
print(processed.strip() + "\n")
4853
elif format != "conll":

coptic_nlp.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def download_requirements(tt_ok=True, malt_ok=True):
390390

391391
def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True, do_mwe=True, do_tag=True, do_lemma=True, do_lang=True,
392392
do_milestone=True, do_parse=True, sgml_mode="sgml", tok_mode="auto", old_tokenizer=False, sent_tag=None,
393-
preloaded=None, pos_spans=False, merge_parse=False, detokenize=0):
393+
preloaded=None, pos_spans=False, merge_parse=False, detokenize=0, segment_merged=False, gold_parse=""):
394394

395395
data = input_data.replace("\t","")
396396
data = data.replace("\r","")
@@ -399,7 +399,7 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
399399
stk = preloaded
400400
else:
401401
stk = StackedTokenizer(pipes=sgml_mode != "sgml", lines=lb, tokenized=tok_mode=="from_pipes",
402-
detok=detokenize, segment_merged=opts.segment_merged)
402+
detok=detokenize, segment_merged=segment_merged)
403403

404404
if do_milestone:
405405
data = binarize(data)
@@ -453,7 +453,7 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
453453
do_tag = True
454454
elif resp.lower() == "a":
455455
sys.exit(0)
456-
if do_tag:
456+
if do_tag and not pos_spans:
457457
tag = [tt_path+'tree-tagger', tt_path+'coptic_fine.par', '-token','-lemma','-no-unknown', '-sgml' ,'tempfilename'] #no -token
458458
tagged = exec_via_temp(norms,tag)
459459
tagged = re.sub('\r','',tagged)
@@ -464,13 +464,20 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
464464
tagged = input_data
465465
if PY3:
466466
tagged = input_data.encode("utf8") # Handle non-UTF-8 when calling TT from subprocess in Python 3
467-
conllized = conllize(tagged,tag="PUNCT",element=sent_tag, no_zero=True) # NB if element is present it supersedes the POS tag
468-
deped = DepEdit(io.open(data_dir + "add_ud_and_flat_morph.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
469-
depedited = deped.run_depedit(conllized.split("\n"))
470-
parse_coptic = ['java','-mx512m','-jar',"maltparser-1.8.jar",'-c','coptic','-i','tempfilename','-m','parse']
471-
parsed = exec_via_temp(depedited,parse_coptic,parser_path)
472-
deped = DepEdit(io.open(data_dir + "parser_postprocess_nodom.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
473-
depedited = deped.run_depedit(parsed.split("\n"))
467+
if gold_parse == "":
468+
conllized = conllize(tagged,tag="PUNCT",element=sent_tag, no_zero=True) # NB if element is present it supersedes the POS tag
469+
deped = DepEdit(io.open(data_dir + "add_ud_and_flat_morph.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
470+
depedited = deped.run_depedit(conllized.split("\n"))
471+
parse_coptic = ['java','-mx512m','-jar',"maltparser-1.8.jar",'-c','coptic','-i','tempfilename','-m','parse']
472+
parsed = exec_via_temp(depedited,parse_coptic,parser_path)
473+
deped = DepEdit(io.open(data_dir + "parser_postprocess_nodom.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
474+
depedited = deped.run_depedit(parsed.split("\n"))
475+
else: # A cached gold parse has been specified
476+
depedited = gold_parse
477+
norm_count = len(re.findall(r'(\n|^)[0-9]+\t',depedited))
478+
input_norms = input_data.count(" norm=")
479+
if norm_count != input_norms:
480+
raise IOError("Mismatch in word count: " + str(norm_count) + " in gold parse but " + str(input_norms) + " in SGML file\n")
474481
if parse_only: # Output parse in conll format
475482
return depedited
476483
elif merge_parse: # Insert parse into input SGML as attributes of <norm>
@@ -718,7 +725,8 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
718725
do_norm=opts.norm, do_mwe=opts.multiword, do_tag=opts.tag, do_lemma=opts.lemma,
719726
do_lang=opts.etym, do_milestone=opts.unary, do_parse=opts.parse, sgml_mode=opts.outmode,
720727
tok_mode="auto", old_tokenizer=old_tokenizer, sent_tag=opts.sent, preloaded=stk,
721-
pos_spans=opts.pos_spans, merge_parse=opts.merge_parse, detokenize=opts.detokenize)
728+
pos_spans=opts.pos_spans, merge_parse=opts.merge_parse, detokenize=opts.detokenize,
729+
segment_merged=opts.segment_merged)
722730

723731
if opts.outmode == "sgml":
724732
processed = reorder(processed.strip().split("\n"),add_fixed_meta=add_fixed_meta)

index.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import cgitb
55
cgitb.enable()
66

7-
from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
87
from nlp_form import make_nlp_form
98

109
print("Content-Type: text/html\n\n\n")

lib/stacked_tokenizer.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232

3333
sys.path.append("lib")
3434

35-
class BoundGroup():
35+
class BoundGroup:
3636

3737
# Static list of characters that are removed from norm text (basic normalization)
3838
orig_chars = ["̈", "", "̄", "̀", "̣", "`", "̅", "̈", "̂", "︤", "︥", "︦", "⳿", "~", "\n", "[", "]", "̇", "᷍"]
@@ -324,6 +324,12 @@ def serialize(groups,pipes=False,group_sep="_",tok_sep="|",segment_merged=False)
324324
return out_text
325325

326326

327+
def adjust_theta(tokenization):
328+
"""Post-edit pre-tokenization in 'from pipes' mode to account for theta boundaries"""
329+
tokenization = tokenization.replace("ⲑ|","ⲧ|ϩ").replace("ⲑ-","ⲧ-ϩ")
330+
return tokenization
331+
332+
327333
class StackedTokenizer:
328334

329335
def __init__(self,lines=False,pipes=False,tokenized=False,no_morphs=False,detok=0,segment_merged=False,model="cop"):
@@ -426,6 +432,7 @@ def analyze(self,data):
426432
for g in grps:
427433
# plain_tokenization = g.norm.replace("□","|").replace("■","-")
428434
plain_tokenization = g.pretokenization
435+
plain_tokenization = adjust_theta(plain_tokenization)
429436
g.orig = g.orig.replace("□", "").replace("■", "")
430437
g.norm = g.norm.replace("□", "").replace("■", "")
431438
# g.dirty = g.dirty.replace("□","").replace("■","")

nlp_form.html

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,51 @@ <h3 class="nlp_title">Input:</h3>
4040
<br/>
4141
<h3 class="nlp_title">Output:</h3>
4242
<table>
43-
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" name="old_tok" value="old_tok"**old_checked**>Use old finite state tokenizer
43+
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" id="old_tok" name="old_tok" value="old_tok" onclick="toggle_laytonize(false);"**old_checked**>Use old finite state tokenizer
4444
<a href="#" class="tooltip2">
4545
<i class="fa fa-info-circle" style="display: inline-block"></i>
4646
<span>
4747
<img class="callout" src="img/callout.gif" />
48-
Less accurate, provided for reproducing older results.<br/>
48+
Less accurate, provided for reproducing older results. Not compatible with detokenization.<br/>
4949
</span>
5050
</a></input><br/></td></tr>
51+
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" id="detokenize" name="detokenize" value="detokenize" onclick="toggle_laytonize(true);"**detokenize_checked**>Re-merge bound groups
52+
<a href="#" class="tooltip2">
53+
<i class="fa fa-info-circle" style="display: inline-block"></i>
54+
<span>
55+
<img class="callout" src="img/callout.gif" />
56+
Regularizes bound group spaces if input does not follow Layton's guidelines<br/>
57+
(a.k.a. 'Laytonization'; increases accuracy on Till-segmented text and OCR)
58+
</span>
59+
</a></input><br/>
60+
<ul>
61+
<input type="radio" id="laytonize1" name="laytonize" value="conservative"**laytonize_conservative_checked**>Conservative merging<a href="#" class="tooltip2">
62+
<i class="fa fa-info-circle" style="display: inline-block"></i>
63+
<span>
64+
<img class="callout" src="img/callout.gif" />
65+
Only re-bind items known to appear unbound in other segmentations <br/>(e.g. well edited text following Till)<br/>
66+
<div style="font-family: Antinoou; text-align:right; margin-bottom: 0px; font-weight: bold"><br/>ϩⲙ ⲡⲏⲓ --&gt; ϩⲙ|ⲡ|ⲏⲓ</div>
67+
</span>
68+
</a></input><br/>
69+
<input type="radio" id="laytonize2" name="laytonize" value="aggressive"**laytonize_aggressive_checked**>Aggressive merging<a href="#" class="tooltip2">
70+
<i class="fa fa-info-circle" style="display: inline-block"></i>
71+
<span>
72+
<img class="callout" src="img/callout.gif" />
73+
Re-bind all items that are unlikely to appear unbound <br/>(better for messy data/OCR)<br/>
74+
<div style="font-family: Antinoou; text-align:right; margin-bottom: 0px; font-weight: bold"><br/>ⲁ ϥⲥⲱⲧⲙ --&gt; ⲁ|ϥ|ⲥⲱⲧⲙ</div>
75+
</span>
76+
</a></input><br/>
77+
<input type="checkbox" id="segment_merged" name="segment_merged" value="segment_merged"**segment_merged_checked**>Segment at merge point
78+
<a href="#" class="tooltip2">
79+
<i class="fa fa-info-circle" style="display: inline-block"></i>
80+
<span>
81+
<img class="callout" src="img/callout.gif" />
82+
If bound groups are merged, assume a morpheme boundary <br/>
83+
(recommended if base segmentation is reliable)
84+
</span>
85+
</a></input><br/>
86+
</ul>
87+
</td></tr>
5188
<tr><td>
5289
<input type="radio" name="sgml_mode" value="sgml" onclick="disable_checkboxes(false);"**sgml_checked**>SGML pipeline</input><br/>
5390
<ul>
@@ -160,6 +197,31 @@ <h3 class="nlp_title">Output:</h3>
160197
if (document.querySelector('input[name="sgml_mode"]:checked').value == "pipes"){
161198
disable_checkboxes(true);
162199
}
200+
201+
function toggle_laytonize(laytonize_on){
202+
if (laytonize_on){
203+
document.getElementById("old_tok").checked = false;
204+
}
205+
else{
206+
document.getElementById("detokenize").checked = false;
207+
}
208+
document.getElementById("norm").disabled = laytonize_on;
209+
if (document.getElementById("detokenize").checked){
210+
document.getElementById("laytonize1").disabled = false;
211+
document.getElementById("laytonize1").checked = true;
212+
document.getElementById("laytonize2").disabled = false;
213+
document.getElementById("segment_merged").disabled = false;
214+
document.getElementById("segment_merged").checked = true;
215+
}
216+
else{
217+
document.getElementById("laytonize1").disabled = true;
218+
document.getElementById("laytonize1").checked = false;
219+
document.getElementById("laytonize2").disabled = true;
220+
document.getElementById("laytonize2").checked = false;
221+
document.getElementById("segment_merged").disabled = true;
222+
document.getElementById("segment_merged").checked = false;
223+
}
224+
}
163225
</script>
164226
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
165227
<script src="https://cdn.jsdelivr.net/bxslider/4.2.12/jquery.bxslider.min.js"></script>

nlp_form.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ def make_nlp_form(access_level, mode):
6666
do_lemma = True
6767
do_tag = True
6868
do_parse = True
69+
detok = 0
70+
segment_merged = False
6971
do_tok = True
7072
do_norm = True
7173
do_mwe = True
@@ -85,6 +87,13 @@ def make_nlp_form(access_level, mode):
8587
do_parse = form.getvalue("parse") is not None
8688
do_norm = form.getvalue("norm") is not None
8789
do_mwe = form.getvalue("mwe") is not None
90+
if form.getvalue("laytonize") == "aggressive":
91+
detok = 2
92+
elif form.getvalue("laytonize") == "conservative":
93+
detok = 1
94+
else:
95+
detok = 0
96+
segment_merged = form.getvalue("segment_merged") is not None
8897
do_tok = form.getvalue("tok") is not None
8998
do_lang = form.getvalue("lang") is not None
9099
if sgml_mode == "pipes":
@@ -95,7 +104,8 @@ def make_nlp_form(access_level, mode):
95104
else:
96105
processed = nlp_coptic(data,lb=lb=="line",parse_only=False,do_tok=do_tok,do_norm=do_norm,do_mwe=do_mwe,
97106
do_tag=do_tag, do_lemma=do_lemma,do_lang=do_lang,do_milestone=do_milestone,
98-
do_parse=do_parse, sgml_mode=sgml_mode,tok_mode=tok_mode,old_tokenizer=old_tok)
107+
do_parse=do_parse, sgml_mode=sgml_mode,tok_mode=tok_mode,old_tokenizer=old_tok,
108+
detokenize=detok, segment_merged=segment_merged)
99109
processed = processed.strip()
100110

101111
###
@@ -115,6 +125,10 @@ def make_nlp_form(access_level, mode):
115125
noline_checked = ' checked="checked"' if lb else ""
116126
tok_checked = ' checked="checked"' if do_tok else ""
117127
old_checked = ' checked="checked"' if old_tok else ""
128+
segment_merged_checked = ' checked="checked"' if segment_merged else ""
129+
detokenize_checked = ' checked="checked"' if detok > 0 else ""
130+
laytonize_conservative_checked = ' checked="checked"' if detok == 1 else ""
131+
laytonize_aggressive_checked = ' checked="checked"' if detok == 2 else ""
118132
auto_checked = ' checked="checked"' if tok_mode == "auto" else ""
119133
pipes_checked = ' checked="checked"' if tok_mode == "from_pipes" else ""
120134
norm_checked = ' checked="checked"' if do_norm else ""
@@ -154,6 +168,10 @@ def make_nlp_form(access_level, mode):
154168
template = template.replace("**old_checked**", old_checked)
155169
template = template.replace("**milestone_checked**", milestone_checked)
156170
template = template.replace("**tok_checked**", tok_checked)
171+
template = template.replace("**detokenize_checked**", detokenize_checked)
172+
template = template.replace("**laytonize_conservative_checked**", laytonize_conservative_checked)
173+
template = template.replace("**laytonize_aggressive_checked**", laytonize_aggressive_checked)
174+
template = template.replace("**segment_merged_checked**", segment_merged_checked)
157175
template = template.replace("**auto_checked**", auto_checked)
158176
template = template.replace("**pipes_checked**", pipes_checked)
159177
template = template.replace("**norm_checked**", norm_checked)

0 commit comments

Comments
 (0)