Skip to content

Commit a35476b

Browse files
authored
Merge pull request #18 from CopticScriptorium/dev
Dev
2 parents 96e9077 + 5705743 commit a35476b

File tree

6 files changed

+121
-22
lines changed

6 files changed

+121
-22
lines changed

api.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
#!/usr/bin/python
1+
#!/usr/local/bin/python2.7
22
# -*- coding: utf-8 -*-
33

4-
from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
4+
#from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
55

66
#Example call on localhost:
77
#http://localhost/coptic-nlp/api.py?data=%E2%B2%81%CF%A5%E2%B2%A5%E2%B2%B1%E2%B2%A7%E2%B2%99%20%E2%B2%9B%CF%AD%E2%B2%93%E2%B2%A1%E2%B2%A3%E2%B2%B1%E2%B2%99%E2%B2%89&lb=line
@@ -17,7 +17,10 @@
1717
if "lb" in storage:
1818
line = storage.getvalue("lb")
1919
else:
20-
line = "noline"
20+
if "<lb" in data:
21+
line = "noline"
22+
else:
23+
line = "line"
2124

2225
if "format" in storage:
2326
format = storage.getvalue("format")
@@ -29,20 +32,22 @@
2932

3033
if format == "pipes":
3134
print("Content-Type: text/plain; charset=UTF-8\n")
32-
processed = nlp_coptic(data,line,sgml_mode="pipes",do_tok=True)
35+
processed = nlp_coptic(data,lb=line=="line",sgml_mode="pipes",do_tok=True)
36+
if "</lb>" in processed:
37+
processed = processed.replace("</lb>","</lb>\n")
3338
print(processed.strip())
3439
elif format == "sgml_no_parse":
3540
print("Content-Type: text/sgml; charset=UTF-8\n")
3641
# secure call, note that htaccess prevents this running without authentication
3742
if "|" in data:
3843
processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True,
3944
do_norm=True, do_tag=True, do_lemma=True, do_lang=True,
40-
do_milestone=True, do_parse=True, sgml_mode="sgml",
45+
do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml",
4146
tok_mode="from_pipes", old_tokenizer=False)
4247
else:
4348
processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True,
4449
do_norm=True, do_tag=True, do_lemma=True, do_lang=True,
45-
do_milestone=True, do_parse=True, sgml_mode="sgml",
50+
do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml",
4651
tok_mode="auto", old_tokenizer=False)
4752
print(processed.strip() + "\n")
4853
elif format != "conll":

coptic_nlp.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def download_requirements(tt_ok=True, malt_ok=True):
390390

391391
def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True, do_mwe=True, do_tag=True, do_lemma=True, do_lang=True,
392392
do_milestone=True, do_parse=True, sgml_mode="sgml", tok_mode="auto", old_tokenizer=False, sent_tag=None,
393-
preloaded=None, pos_spans=False, merge_parse=False, detokenize=0):
393+
preloaded=None, pos_spans=False, merge_parse=False, detokenize=0, segment_merged=False, gold_parse=""):
394394

395395
data = input_data.replace("\t","")
396396
data = data.replace("\r","")
@@ -399,7 +399,7 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
399399
stk = preloaded
400400
else:
401401
stk = StackedTokenizer(pipes=sgml_mode != "sgml", lines=lb, tokenized=tok_mode=="from_pipes",
402-
detok=detokenize, segment_merged=opts.segment_merged)
402+
detok=detokenize, segment_merged=segment_merged)
403403

404404
if do_milestone:
405405
data = binarize(data)
@@ -453,7 +453,7 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
453453
do_tag = True
454454
elif resp.lower() == "a":
455455
sys.exit(0)
456-
if do_tag:
456+
if do_tag and not pos_spans:
457457
tag = [tt_path+'tree-tagger', tt_path+'coptic_fine.par', '-token','-lemma','-no-unknown', '-sgml' ,'tempfilename'] #no -token
458458
tagged = exec_via_temp(norms,tag)
459459
tagged = re.sub('\r','',tagged)
@@ -464,13 +464,20 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
464464
tagged = input_data
465465
if PY3:
466466
tagged = input_data.encode("utf8") # Handle non-UTF-8 when calling TT from subprocess in Python 3
467-
conllized = conllize(tagged,tag="PUNCT",element=sent_tag, no_zero=True) # NB if element is present it supersedes the POS tag
468-
deped = DepEdit(io.open(data_dir + "add_ud_and_flat_morph.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
469-
depedited = deped.run_depedit(conllized.split("\n"))
470-
parse_coptic = ['java','-mx512m','-jar',"maltparser-1.8.jar",'-c','coptic','-i','tempfilename','-m','parse']
471-
parsed = exec_via_temp(depedited,parse_coptic,parser_path)
472-
deped = DepEdit(io.open(data_dir + "parser_postprocess_nodom.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
473-
depedited = deped.run_depedit(parsed.split("\n"))
467+
if gold_parse == "":
468+
conllized = conllize(tagged,tag="PUNCT",element=sent_tag, no_zero=True) # NB if element is present it supersedes the POS tag
469+
deped = DepEdit(io.open(data_dir + "add_ud_and_flat_morph.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
470+
depedited = deped.run_depedit(conllized.split("\n"))
471+
parse_coptic = ['java','-mx512m','-jar',"maltparser-1.8.jar",'-c','coptic','-i','tempfilename','-m','parse']
472+
parsed = exec_via_temp(depedited,parse_coptic,parser_path)
473+
deped = DepEdit(io.open(data_dir + "parser_postprocess_nodom.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
474+
depedited = deped.run_depedit(parsed.split("\n"))
475+
else: # A cached gold parse has been specified
476+
depedited = gold_parse
477+
norm_count = len(re.findall(r'(\n|^)[0-9]+\t',depedited))
478+
input_norms = input_data.count(" norm=")
479+
if norm_count != input_norms:
480+
raise IOError("Mismatch in word count: " + str(norm_count) + " in gold parse but " + str(input_norms) + " in SGML file\n")
474481
if parse_only: # Output parse in conll format
475482
return depedited
476483
elif merge_parse: # Insert parse into input SGML as attributes of <norm>
@@ -718,7 +725,8 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
718725
do_norm=opts.norm, do_mwe=opts.multiword, do_tag=opts.tag, do_lemma=opts.lemma,
719726
do_lang=opts.etym, do_milestone=opts.unary, do_parse=opts.parse, sgml_mode=opts.outmode,
720727
tok_mode="auto", old_tokenizer=old_tokenizer, sent_tag=opts.sent, preloaded=stk,
721-
pos_spans=opts.pos_spans, merge_parse=opts.merge_parse, detokenize=opts.detokenize)
728+
pos_spans=opts.pos_spans, merge_parse=opts.merge_parse, detokenize=opts.detokenize,
729+
segment_merged=opts.segment_merged)
722730

723731
if opts.outmode == "sgml":
724732
processed = reorder(processed.strip().split("\n"),add_fixed_meta=add_fixed_meta)

index.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import cgitb
55
cgitb.enable()
66

7-
from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
87
from nlp_form import make_nlp_form
98

109
print("Content-Type: text/html\n\n\n")

lib/stacked_tokenizer.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232

3333
sys.path.append("lib")
3434

35-
class BoundGroup():
35+
class BoundGroup:
3636

3737
# Static list of characters that are removed from norm text (basic normalization)
3838
orig_chars = ["̈", "", "̄", "̀", "̣", "`", "̅", "̈", "̂", "︤", "︥", "︦", "⳿", "~", "\n", "[", "]", "̇", "᷍"]
@@ -324,6 +324,12 @@ def serialize(groups,pipes=False,group_sep="_",tok_sep="|",segment_merged=False)
324324
return out_text
325325

326326

327+
def adjust_theta(tokenization):
328+
"""Post-edit pre-tokenization in 'from pipes' mode to account for theta boundaries"""
329+
tokenization = tokenization.replace("ⲑ|","ⲧ|ϩ").replace("ⲑ-","ⲧ-ϩ")
330+
return tokenization
331+
332+
327333
class StackedTokenizer:
328334

329335
def __init__(self,lines=False,pipes=False,tokenized=False,no_morphs=False,detok=0,segment_merged=False,model="cop"):
@@ -426,6 +432,7 @@ def analyze(self,data):
426432
for g in grps:
427433
# plain_tokenization = g.norm.replace("□","|").replace("■","-")
428434
plain_tokenization = g.pretokenization
435+
plain_tokenization = adjust_theta(plain_tokenization)
429436
g.orig = g.orig.replace("□", "").replace("■", "")
430437
g.norm = g.norm.replace("□", "").replace("■", "")
431438
# g.dirty = g.dirty.replace("□","").replace("■","")

nlp_form.html

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,51 @@ <h3 class="nlp_title">Input:</h3>
4040
<br/>
4141
<h3 class="nlp_title">Output:</h3>
4242
<table>
43-
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" name="old_tok" value="old_tok"**old_checked**>Use old finite state tokenizer
43+
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" id="old_tok" name="old_tok" value="old_tok" onclick="toggle_laytonize(false);"**old_checked**>Use old finite state tokenizer
4444
<a href="#" class="tooltip2">
4545
<i class="fa fa-info-circle" style="display: inline-block"></i>
4646
<span>
4747
<img class="callout" src="img/callout.gif" />
48-
Less accurate, provided for reproducing older results.<br/>
48+
Less accurate, provided for reproducing older results. Not compatible with detokenization.<br/>
4949
</span>
5050
</a></input><br/></td></tr>
51+
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" id="detokenize" name="detokenize" value="detokenize" onclick="toggle_laytonize(true);"**detokenize_checked**>Re-merge bound groups
52+
<a href="#" class="tooltip2">
53+
<i class="fa fa-info-circle" style="display: inline-block"></i>
54+
<span>
55+
<img class="callout" src="img/callout.gif" />
56+
Regularizes bound group spaces if input does not follow Layton's guidelines<br/>
57+
(a.k.a. 'Laytonization'; increases accuracy on Till-segmented text and OCR)
58+
</span>
59+
</a></input><br/>
60+
<ul>
61+
<input type="radio" id="laytonize1" name="laytonize" value="conservative"**laytonize_conservative_checked**>Conservative merging<a href="#" class="tooltip2">
62+
<i class="fa fa-info-circle" style="display: inline-block"></i>
63+
<span>
64+
<img class="callout" src="img/callout.gif" />
65+
Only re-bind items known to appear unbound in other segmentations <br/>(e.g. well edited text following Till)<br/>
66+
<div style="font-family: Antinoou; text-align:right; margin-bottom: 0px; font-weight: bold"><br/>ϩⲙ ⲡⲏⲓ --&gt; ϩⲙ|ⲡ|ⲏⲓ</div>
67+
</span>
68+
</a></input><br/>
69+
<input type="radio" id="laytonize2" name="laytonize" value="aggressive"**laytonize_aggressive_checked**>Aggressive merging<a href="#" class="tooltip2">
70+
<i class="fa fa-info-circle" style="display: inline-block"></i>
71+
<span>
72+
<img class="callout" src="img/callout.gif" />
73+
Re-bind all items that are unlikely to appear unbound <br/>(better for messy data/OCR)<br/>
74+
<div style="font-family: Antinoou; text-align:right; margin-bottom: 0px; font-weight: bold"><br/>ⲁ ϥⲥⲱⲧⲙ --&gt; ⲁ|ϥ|ⲥⲱⲧⲙ</div>
75+
</span>
76+
</a></input><br/>
77+
<input type="checkbox" id="segment_merged" name="segment_merged" value="segment_merged"**segment_merged_checked**>Segment at merge point
78+
<a href="#" class="tooltip2">
79+
<i class="fa fa-info-circle" style="display: inline-block"></i>
80+
<span>
81+
<img class="callout" src="img/callout.gif" />
82+
If bound groups are merged, assume a morpheme boundary <br/>
83+
(recommended if base segmentation is reliable)
84+
</span>
85+
</a></input><br/>
86+
</ul>
87+
</td></tr>
5188
<tr><td>
5289
<input type="radio" name="sgml_mode" value="sgml" onclick="disable_checkboxes(false);"**sgml_checked**>SGML pipeline</input><br/>
5390
<ul>
@@ -160,6 +197,31 @@ <h3 class="nlp_title">Output:</h3>
160197
if (document.querySelector('input[name="sgml_mode"]:checked').value == "pipes"){
161198
disable_checkboxes(true);
162199
}
200+
201+
function toggle_laytonize(laytonize_on){
202+
if (laytonize_on){
203+
document.getElementById("old_tok").checked = false;
204+
}
205+
else{
206+
document.getElementById("detokenize").checked = false;
207+
}
208+
document.getElementById("norm").disabled = laytonize_on;
209+
if (document.getElementById("detokenize").checked){
210+
document.getElementById("laytonize1").disabled = false;
211+
document.getElementById("laytonize1").checked = true;
212+
document.getElementById("laytonize2").disabled = false;
213+
document.getElementById("segment_merged").disabled = false;
214+
document.getElementById("segment_merged").checked = true;
215+
}
216+
else{
217+
document.getElementById("laytonize1").disabled = true;
218+
document.getElementById("laytonize1").checked = false;
219+
document.getElementById("laytonize2").disabled = true;
220+
document.getElementById("laytonize2").checked = false;
221+
document.getElementById("segment_merged").disabled = true;
222+
document.getElementById("segment_merged").checked = false;
223+
}
224+
}
163225
</script>
164226
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
165227
<script src="https://cdn.jsdelivr.net/bxslider/4.2.12/jquery.bxslider.min.js"></script>

nlp_form.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ def make_nlp_form(access_level, mode):
6666
do_lemma = True
6767
do_tag = True
6868
do_parse = True
69+
detok = 0
70+
segment_merged = False
6971
do_tok = True
7072
do_norm = True
7173
do_mwe = True
@@ -85,6 +87,13 @@ def make_nlp_form(access_level, mode):
8587
do_parse = form.getvalue("parse") is not None
8688
do_norm = form.getvalue("norm") is not None
8789
do_mwe = form.getvalue("mwe") is not None
90+
if form.getvalue("laytonize") == "aggressive":
91+
detok = 2
92+
elif form.getvalue("laytonize") == "conservative":
93+
detok = 1
94+
else:
95+
detok = 0
96+
segment_merged = form.getvalue("segment_merged") is not None
8897
do_tok = form.getvalue("tok") is not None
8998
do_lang = form.getvalue("lang") is not None
9099
if sgml_mode == "pipes":
@@ -95,7 +104,8 @@ def make_nlp_form(access_level, mode):
95104
else:
96105
processed = nlp_coptic(data,lb=lb=="line",parse_only=False,do_tok=do_tok,do_norm=do_norm,do_mwe=do_mwe,
97106
do_tag=do_tag, do_lemma=do_lemma,do_lang=do_lang,do_milestone=do_milestone,
98-
do_parse=do_parse, sgml_mode=sgml_mode,tok_mode=tok_mode,old_tokenizer=old_tok)
107+
do_parse=do_parse, sgml_mode=sgml_mode,tok_mode=tok_mode,old_tokenizer=old_tok,
108+
detokenize=detok, segment_merged=segment_merged)
99109
processed = processed.strip()
100110

101111
###
@@ -115,6 +125,10 @@ def make_nlp_form(access_level, mode):
115125
noline_checked = ' checked="checked"' if lb else ""
116126
tok_checked = ' checked="checked"' if do_tok else ""
117127
old_checked = ' checked="checked"' if old_tok else ""
128+
segment_merged_checked = ' checked="checked"' if segment_merged else ""
129+
detokenize_checked = ' checked="checked"' if detok > 0 else ""
130+
laytonize_conservative_checked = ' checked="checked"' if detok == 1 else ""
131+
laytonize_aggressive_checked = ' checked="checked"' if detok == 2 else ""
118132
auto_checked = ' checked="checked"' if tok_mode == "auto" else ""
119133
pipes_checked = ' checked="checked"' if tok_mode == "from_pipes" else ""
120134
norm_checked = ' checked="checked"' if do_norm else ""
@@ -154,6 +168,10 @@ def make_nlp_form(access_level, mode):
154168
template = template.replace("**old_checked**", old_checked)
155169
template = template.replace("**milestone_checked**", milestone_checked)
156170
template = template.replace("**tok_checked**", tok_checked)
171+
template = template.replace("**detokenize_checked**", detokenize_checked)
172+
template = template.replace("**laytonize_conservative_checked**", laytonize_conservative_checked)
173+
template = template.replace("**laytonize_aggressive_checked**", laytonize_aggressive_checked)
174+
template = template.replace("**segment_merged_checked**", segment_merged_checked)
157175
template = template.replace("**auto_checked**", auto_checked)
158176
template = template.replace("**pipes_checked**", pipes_checked)
159177
template = template.replace("**norm_checked**", norm_checked)

0 commit comments

Comments
 (0)