CopticScriptorium
diff --git a/‎.gitignore
Lines changed: 9 additions & 0 deletions b/‎.gitignore
Lines changed: 9 additions & 0 deletions
diff --git a/‎.idea/dictionaries/luke.xml
Lines changed: 3 additions & 0 deletions b/‎.idea/dictionaries/luke.xml
Lines changed: 3 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 12 additions & 4 deletions b/‎README.md
Lines changed: 12 additions & 4 deletions
diff --git a/‎_version.py
Lines changed: 2 additions & 2 deletions b/‎_version.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎api.py
Lines changed: 11 additions & 6 deletions b/‎api.py
Lines changed: 11 additions & 6 deletions
diff --git a/‎bin/coptic.mco
441 KB b/‎bin/coptic.mco
441 KB
diff --git a/‎bin/coptic_foma.bin
893 KB b/‎bin/coptic_foma.bin
893 KB
diff --git a/‎bin/foma/README.md
Lines changed: 21 additions & 0 deletions b/‎bin/foma/README.md
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,9 @@
+*.pyc
+__pycache__/
+*.swp
+.*.tmp
+.idea
+/_scrap/
+/.idea/
+_tmp*.tab
+errors/
@@ -63,9 +63,9 @@ standard module options:
 
 less common options:
   -f, --finitestate     Use old finite-state tokenizer (less accurate)
-  -d {0,1,2}, --detokenize {0,1,2}
+  -d {0,1,2,3}, --detokenize {0,1,2,3}
                         Re-group non-standard bound groups (a.k.a.
-                        'laytonize') - 1=normal 2=aggressive
+                        'laytonize') - 1=normal 2=aggressive 3=smart
   --segment_merged      When re-grouping bound groups, assume merged groups
                         have segmentation boundary between them
   -q, --quiet           Suppress verbose messages
@@ -116,7 +116,7 @@ The pipeline accepts the following kinds of input:
   * Plain text, with bound groups separated by underscores or spaces. 
     * Note that if punctuation has not been separated from bound groups, you can use the `--space` option to attempt to automatically separate punctuation
     * If your Coptic text represents line breaks as new line characters, you can automatically add line break tags using `-b` / `--breaklines`
-    * Gold tokenization information may be present in the input at pipes between part-of-speech bearing units and hyphens between morphemes
+    * Gold tokenization information may be present in the input as pipes between part-of-speech bearing units and hyphens between morphemes
   * XML/SGML input, with bound groups separated by underscores or spaces. The script will retain XML tags as-is around Coptic text. 
   * Coptic Scriptorium style TreeTagger SGML, with normalized units in tags such as <norm norm="...">. 
     * This input format is used when adding a parse to an existing .tt file using the `--merge_parse` option
@@ -193,4 +193,12 @@ The pipeline accepts the following kinds of input:
 </norm>
 </norm_group>
 </lb>
-```
+```
+
+## Testing installation
+
+If all requirements are installed correctly, you can verify that the modules are working by running the built-in unit tests:
+
+```
+python run_tests.py
+```
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 
-__version__ = "1.1.0"
+__version__ = "3.0.0"
 __author__ = "Amir Zeldes"
-__copyright__ = "Copyright 2015-2016, Amir Zeldes"
+__copyright__ = "Copyright 2015-2019, Amir Zeldes"
 __license__ = "Apache 2.0 License"
@@ -1,15 +1,20 @@
-#!/usr/local/bin/python2.7
+#!/usr/bin/python3.5
 # -*- coding: utf-8 -*-
 
-#from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
-
 #Example call on localhost:
 #http://localhost/coptic-nlp/api.py?data=%E2%B2%81%CF%A5%E2%B2%A5%E2%B2%B1%E2%B2%A7%E2%B2%99%20%E2%B2%9B%CF%AD%E2%B2%93%E2%B2%A1%E2%B2%A3%E2%B2%B1%E2%B2%99%E2%B2%89&lb=line
 
 from nlp_form import nlp_coptic
-import cgi
+import cgi, sys
+
+PY3 = sys.version_info[0] == 3
+if PY3:
+	sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)
+
 storage = cgi.FieldStorage()
+#storage = {"data":"ⲁϥⲥⲱⲧⲙ ⲛϭⲓⲡⲣⲱⲙⲉ"}
 if "data" in storage:
+	#data = storage["data"]
 	data = storage.getvalue("data")
 else:
 	data = ""
@@ -40,12 +45,12 @@
 	print("Content-Type: text/sgml; charset=UTF-8\n")
 	# secure call, note that htaccess prevents this running without authentication
 	if "|" in data:
-		processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True,
+		processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True, do_mwe=False,
 							   do_norm=True, do_tag=True, do_lemma=True, do_lang=True,
 							   do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml",
 							   tok_mode="from_pipes", old_tokenizer=False)
 	else:
-		processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True,
+		processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True, do_mwe=False,
 							   do_norm=True, do_tag=True, do_lemma=True, do_lang=True,
 							   do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml",
 							   tok_mode="auto", old_tokenizer=False)
 
@@ -0,0 +1,21 @@
+This directory contains the files needed to compile a fresh transducer binary for the Foma-based normalization module.
+
+To compile a new transducer based on the latest normalization info, make sure that foma binaries for your system are in bin/foma/ and that data/norm_table.tab contains the latest normalization data. Foma binaries are available for **Windows** and **Mac OSX** and will be automatically unzipped when you run coptic_nlp.py. 
+
+If you are running coptic_nlp.py on **Linux**, you will need to compile Foma yourself; the following steps should work:
+
+```
+wget https://bitbucket.org/mhulden/foma/downloads/foma-0.9.18.tar.gz
+tar -xvzf foma-0.9.18.tar.gz
+cd foma-0.9.18/
+make
+sudo make install
+```
+
+When you are ready, run:
+
+```
+> python compile_grammar.py
+```
+
+A new coptic_foma.bin will be generated in this folder; use it to replace the existing coptic_foma.bin in bin/.