已进行过测试，在conll09中文的UAS结果是83.50

cxzhu · cxzhu · commit 72d422350561 · 2014-10-09T14:56:07.000+08:00
diff --git a/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/JointParsingState.java b/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/JointParsingState.java
@@ -125,47 +125,20 @@ public ArrayList<String> getFeatures() {
 
 		int rightFocus = leftFocus + 1;
 
-//		ISparseVector vec = new HashSparseVector();
-		
-		StringBuilder posFeature1 = new StringBuilder();
-		//左右词性
-		posFeature1.append("+-2").append(POS).append(trees.get(leftFocus).pos)
-		.append("/").append(trees.get(rightFocus).pos);
-		featurelist.add(posFeature1.toString());
-		//左右词性
-		StringBuilder posFeature2 = new StringBuilder();
-		posFeature2.append("+-4").append(POS).append(trees.get(leftFocus).pos)
-			.append("/").append(trees.get(rightFocus).pos);
-		posFeature2.append("/");
-		if(leftFocus>0)
-			posFeature2.append(trees.get(leftFocus-1).pos);
-		posFeature2.append("/");
-		if(rightFocus<trees.size()-1)
-			posFeature2.append(trees.get(rightFocus+1).pos);
-		
-		featurelist.add(posFeature2.toString());
-		//左右词
-		StringBuilder lexFeature1 = new StringBuilder();
-		lexFeature1.append("+-2").append(LEX).append(trees.get(leftFocus).word)
-		.append("/").append(trees.get(rightFocus).word);
-		featurelist.add(lexFeature1.toString());
-		
-		
-		StringBuilder lexFeature2 = new StringBuilder();
-		lexFeature2.append("+-4").append(LEX).append(trees.get(leftFocus).word)
-			.append("/").append(trees.get(rightFocus).word);
-		lexFeature2.append("/");
-		if(leftFocus>0)
-			lexFeature2.append(trees.get(leftFocus-1).word);
-		lexFeature2.append("/");
-		if(rightFocus<trees.size()-1)
-			lexFeature2.append(trees.get(rightFocus+1).word);
-		
-		featurelist.add(lexFeature2.toString());
+//		ISparseVector vec = new HashSparseVector();
+		//所有的联合feature
+		featurelist.add(combinedFeature("+0+1", POS, new int[]{0, 1}));
+		featurelist.add(combinedFeature("-1+0+1", POS, new int[]{-1, 0, 1}));
+		featurelist.add(combinedFeature("+0+1+2", POS, new int[]{0, 1, 2}));
+		featurelist.add(combinedFeature("+1+2+3", POS, new int[]{1, 2, 3}));
+		featurelist.add(combinedFeature("-2+3+4", POS, new int[]{2, 3, 4}));
+		featurelist.add(combinedFeature("+0+1", LEX, new int[]{0, 1}));
+		featurelist.add(combinedFeature("-1+0+1", LEX, new int[]{-1, 0, 1}));
+		featurelist.add(combinedFeature("+0+1+2", LEX, new int[]{0, 1, 2}));
 
 		// 设定上下文窗口大小
 		int l = 2;
-		int r = 2;
+		int r = 4;
 		for (int i = 0; i <= l; i++) {
 			// 特征前缀
 			String posFeature = "-" + String.valueOf(i) + POS;
@@ -178,7 +151,11 @@ public ArrayList<String> getFeatures() {
 			String rcLexFeature = "-" + String.valueOf(i)
 					+ CH_R_LEX;
 			String rcPosFeature = "-" + String.valueOf(i)
-					+ CH_R_POS;
+					+ CH_R_POS;
+			String lcDepFeature = "-" + String.valueOf(i)
+					+ CH_L_DEP;			
+			String rcDepFeature = "-" + String.valueOf(i)
+					+ CH_R_DEP;
 
 			if (leftFocus - i < 0) {
 				featurelist.add(lexFeature + START + String.valueOf(i - leftFocus));
@@ -195,7 +172,9 @@ public ArrayList<String> getFeatures() {
 						featurelist.add(lcLexFeature
 								+ sent.words[leftChildIndex]);
 						featurelist.add(lcPosFeature
-								+ sent.tags[leftChildIndex]);
+								+ sent.tags[leftChildIndex]);
+						featurelist.add(lcDepFeature
+								+ sent.getDepClass(leftChildIndex));
 					}
 				}else{
 					featurelist.add(lcLexFeature + NULL);
@@ -210,7 +189,9 @@ public ArrayList<String> getFeatures() {
 						featurelist.add(rcLexFeature
 								+ sent.words[rightChildIndex]);
 						featurelist.add(rcPosFeature
-								+ sent.tags[rightChildIndex]);
+								+ sent.tags[rightChildIndex]);
+						featurelist.add(rcDepFeature
+								+ sent.getDepClass(rightChildIndex));
 					}
 				}else{
 					featurelist.add(rcLexFeature + NULL);
@@ -275,6 +256,51 @@ public ArrayList<String> getFeatures() {
 		
 		
 		return featurelist;
+	}
+	
+	/**
+	 * Builds one combined feature string: sign, then posOrLex, then one "/"-terminated slot per offset.
+	 * @param sign 
+	 * 		marker that identifies this kind of feature inside the string, e.g. "-0+0"
+	 * @param posOrLex
+	 * 		whether the feature is built from pos or from lex values
+	 * @param locations
+	 * 		positions of the elements that make up the combined feature, as offsets relative to leftFocus
+	 * @return 
+	 * 		the combined feature in string form
+	 */
+	private String combinedFeature(String sign, String posOrLex, int[] locations){
+		StringBuilder cf = new StringBuilder();
+		cf.append(sign);
+		cf.append(posOrLex);
+		for(int loc:locations){
+			int focus = leftFocus + loc;
+			if(isCrossBorder(focus)){
+				cf.append(NULL); // position lies outside trees
+			}
+			else{
+				cf.append(getPosOrLex(posOrLex, focus));
+			}
+			cf.append("/"); // every slot, including the last one, is followed by "/"
+		}
+		return cf.toString();
+	}
+	
+	private String getPosOrLex(String posOrLex, int focus){
+		// Returns the word (LEX) or the pos (POS) of the tree at index focus.
+		final boolean wantsWord = posOrLex.equals(LEX);
+		if(!wantsWord && !posOrLex.equals(POS)){
+			return null; // neither LEX nor POS was requested
+		}
+		return wantsWord ? trees.get(focus).word
+				: trees.get(focus).pos;
+	}
+	
+	private boolean isCrossBorder(int focus){
+		// An index crosses the border when it falls outside the valid
+		// range [0, trees.size()) of the tree list; the size lookup is
+		// skipped for negative indices.
+		return focus < 0 || focus >= trees.size();
 	}
 
 	public boolean isFinalState() {
diff --git a/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/analysis/AnalysisSentence.java b/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/analysis/AnalysisSentence.java
@@ -0,0 +1,43 @@
+package org.fnlp.nlp.parser.dep.analysis;
+
+/**
+ * Plain holder for one sentence of a parser result file: parallel per-token
+ * arrays with the gold and the predicted annotation.
+ */
+public class AnalysisSentence {
+	/** Token strings; the array length is the sentence length. */
+	public String[] forms;
+	/** Tag of each token. */
+	public String[] tags;
+	/** Gold head of each token. */
+	public int[] goldhead;
+	/** Gold relation label of each token. */
+	public String[] goldrel;
+	/** Predicted head of each token. */
+	public int[] predhead;
+	/** Predicted relation label of each token. */
+	public String[] predrel;
+
+	/**
+	 * Stores the given arrays by reference; no copies are made and no length
+	 * consistency between them is checked.
+	 */
+	public AnalysisSentence(String[] forms, String[] tags, int[] goldhead,
+			String[] goldrel, int[] predhead, String[] predrel) {
+		// Object's constructor is invoked implicitly.
+		this.forms = forms;
+		this.tags = tags;
+		this.goldhead = goldhead;
+		this.goldrel = goldrel;
+		this.predhead = predhead;
+		this.predrel = predrel;
+	}
+
+	/**
+	 * @return the number of entries in forms
+	 */
+	public int length(){
+		final String[] tokens = forms;
+		return tokens.length;
+	}
+}
diff --git a/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/analysis/AnalysisTest.java b/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/analysis/AnalysisTest.java
@@ -0,0 +1,125 @@
+package org.fnlp.nlp.parser.dep.analysis;
+
+import java.io.IOException;
+
+/**
+ * Accumulates head/label attachment statistics over a parser result file
+ * and prints UAS, LAS, root accuracy and UEM to standard output.
+ * Counters are instance fields and are never reset, so they keep growing
+ * if test is called more than once on the same instance.
+ */
+public class AnalysisTest {
+	// Wrong heads / counted tokens (UAS).
+	private int errhead = 0;
+	private int headsum = 0;
+	// Wrong head-or-label / counted tokens (LAS).
+	private int errhead_dep = 0;
+	private int head_depsum = 0;
+	// Sentences with at least one wrong head / all sentences (UEM).
+	private int errsent = 0;
+	private int sent_sum = 0;
+	// Wrong roots / gold roots.
+	private int errroot = 0;
+	private int root_sum = 0;
+
+	/**
+	 * Scores every sentence of the result file and prints the totals.
+	 *
+	 * @param resultFile path of the file handed to ResultReader
+	 * @throws IOException if ResultReader fails to open or read the file
+	 */
+	public void test(String resultFile) throws IOException{
+		ResultReader sentences = new ResultReader(resultFile);
+		while(sentences.hasNext()){
+			AnalysisSentence current = sentences.next();
+			// Update the UAS/UEM, LAS and root counters for this sentence.
+			judgeUAS(current);
+			judgeLAS(current);
+			judgeRoot(current);
+		}
+		print();
+	}
+
+	/** Counts tokens whose gold head is -1 and how many of them were mispredicted. */
+	private void judgeRoot(AnalysisSentence sent){
+		for(int idx=0; idx<sent.length(); idx++){
+			if(sent.goldhead[idx] != -1){
+				continue;
+			}
+			root_sum++;
+			// Gold head is -1 here, so any other prediction is a root error.
+			if(sent.predhead[idx] != -1){
+				errroot++;
+			}
+		}
+	}
+
+	/**
+	 * Counts labeled attachment errors over non-punctuation tokens. A token
+	 * with gold head -1 is judged on its head alone; every other token also
+	 * needs the matching relation label.
+	 */
+	private void judgeLAS(AnalysisSentence sent){
+		for(int idx=0; idx<sent.length(); idx++){
+			if(isBiaodian(idx, sent)){
+				continue;
+			}
+			head_depsum++;
+			final boolean headWrong = sent.goldhead[idx] != sent.predhead[idx];
+			final boolean goldIsRoot = sent.goldhead[idx] == -1;
+			// The label comparison only runs when the head is right and the
+			// token is not a gold root.
+			if(headWrong || (!goldIsRoot && !sent.goldrel[idx].equals(sent.predrel[idx]))){
+				errhead_dep++;
+			}
+		}
+	}
+
+	/**
+	 * Counts unlabeled attachment errors over non-punctuation tokens and
+	 * marks the sentence as wrong when at least one counted head is wrong.
+	 */
+	private void judgeUAS(AnalysisSentence sent){
+		int wrongHeads = 0;
+		for(int idx=0; idx<sent.length(); idx++){
+			if(isBiaodian(idx, sent)){
+				continue;
+			}
+			headsum++;
+			if(sent.goldhead[idx] != sent.predhead[idx]){
+				errhead++;
+				wrongHeads++;
+			}
+		}
+		if(wrongHeads > 0){
+			errsent++;
+		}
+		sent_sum++;
+	}
+
+	/** Tells whether token i carries one of the tags that are excluded from scoring. */
+	private boolean isBiaodian(int i, AnalysisSentence sent){
+		final String[] punctuationTags = {",",".","``",":","''","$","-RRB-","-LRB-","#","SYM", "PU"};
+		final String tag = sent.tags[i];
+		for(int k=0; k<punctuationTags.length; k++){
+			if(tag.equals(punctuationTags[k])){
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/** Accuracy as one minus the error ratio; the result is not finite when total is 0. */
+	private static double rate(int errors, int total){
+		return 1 - 1.0 * errors / total;
+	}
+
+	/** Prints the four accuracy figures together with their denominators. */
+	private void print(){
+		System.out.println("***************************************");
+		System.out.printf("rate(UAS):\t%.8f\ttotal(words):\t%d\n", rate(errhead, headsum), headsum);
+		System.out.printf("rate(LAS):\t%.8f\ttotal(words):\t%d\n", rate(errhead_dep, head_depsum), head_depsum);
+		System.out.printf("rate(root):\t%.8f\ttotal(roots):\t%d\n", rate(errroot, root_sum), root_sum);
+		System.out.printf("rate(UEM):\t%.8f\ttotal(sent):\t%d\n", rate(errsent, sent_sum), sent_sum);
+	}
+}
diff --git a/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/analysis/ResultReader.java b/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/analysis/ResultReader.java
@@ -0,0 +1,147 @@
+package org.fnlp.nlp.parser.dep.analysis;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Reads a parser result file one sentence at a time.
+ * <p>
+ * Sentences are separated by blank lines. Every other line is one token with
+ * six whitespace-separated columns: form, tag, gold head, gold relation,
+ * predicted head, predicted relation. The file is decoded as UTF-8 and is
+ * closed as soon as its end is reached or reading fails.
+ */
+public class ResultReader {
+
+	/** Open stream; null once the end of the file was reached or reading failed. */
+	BufferedReader reader = null;
+	/** Look-ahead sentence handed out by the following next() call; null when exhausted. */
+	AnalysisSentence next = null;
+	/** Scratch buffer with the split token lines of the sentence being read. */
+	List<String[]> carrier = new ArrayList<String[]>();
+
+	/**
+	 * Opens the file and pre-loads its first sentence.
+	 *
+	 * @param filepath path of the result file
+	 * @throws IOException if the file cannot be opened or read
+	 */
+	public ResultReader(String filepath) throws IOException {
+		reader = new BufferedReader(new InputStreamReader(new FileInputStream(
+				filepath), "UTF-8"));
+		advance();
+	}
+
+	/**
+	 * Loads the following sentence into the look-ahead slot and closes the
+	 * stream if anything goes wrong while doing so.
+	 *
+	 * @throws IOException if reading from the file fails
+	 */
+	private void advance() throws IOException {
+		// Reset first: a failed read must never leave the previous sentence in
+		// the look-ahead slot, otherwise hasNext() would stay true forever.
+		next = null;
+		carrier.clear();
+		if (reader == null) {
+			return; // already exhausted and closed
+		}
+		boolean completed = false;
+		try {
+			readSentence();
+			completed = true;
+		} finally {
+			if (!completed) {
+				closeQuietly(); // do not leak the file handle on I/O or parse errors
+			}
+		}
+	}
+
+	/**
+	 * Reads token lines up to the next separator and builds the sentence.
+	 * Blank lines in front of a sentence are skipped, so leading or repeated
+	 * blank lines do not end the iteration early.
+	 */
+	private void readSentence() throws IOException {
+		String line = null;
+		while ((line = reader.readLine()) != null) {
+			line = line.trim();
+			if (line.isEmpty()) {
+				if (carrier.isEmpty()) {
+					continue;
+				}
+				break;
+			}
+			// Any run of whitespace is one separator, so mixed tabs and spaces
+			// never produce empty columns.
+			carrier.add(line.split("\\s+"));
+		}
+		if (line == null) {
+			// End of file: nothing more to read, release the file handle.
+			BufferedReader finished = reader;
+			reader = null;
+			finished.close();
+		}
+		if (carrier.isEmpty()) {
+			return;
+		}
+
+		int size = carrier.size();
+		String[] forms = new String[size];
+		String[] tags = new String[size];
+		int[] goldhead = new int[size];
+		String[] goldrel = new String[size];
+		int[] predhead = new int[size];
+		String[] predrel = new String[size];
+		for (int i = 0; i < size; i++) {
+			String[] tokens = carrier.get(i);
+			forms[i] = tokens[0];
+			tags[i] = tokens[1];
+			goldhead[i] = Integer.parseInt(tokens[2]);
+			goldrel[i] = tokens[3];
+			predhead[i] = Integer.parseInt(tokens[4]);
+			predrel[i] = tokens[5];
+		}
+		next = new AnalysisSentence(forms, tags, goldhead, goldrel, predhead, predrel);
+	}
+
+	/** Closes the stream after a failure, ignoring errors raised by close itself. */
+	private void closeQuietly() {
+		BufferedReader failed = reader;
+		reader = null;
+		if (failed != null) {
+			try {
+				failed.close();
+			} catch (IOException ignored) {
+				// The failure that triggered the close is the one worth reporting.
+			}
+		}
+	}
+
+	/**
+	 * @return true while next() still has a sentence to hand out
+	 */
+	public boolean hasNext() {
+		return (next != null);
+	}
+
+	/**
+	 * Returns the current sentence and pre-loads the following one.
+	 *
+	 * @return the current sentence, or null when the file is exhausted
+	 * @throws IllegalStateException if reading the following sentence fails
+	 */
+	public AnalysisSentence next() {
+		AnalysisSentence cur = next;
+		try {
+			advance();
+		} catch (IOException e) {
+			throw new IllegalStateException("failed to read the next sentence", e);
+		}
+		return cur;
+	}
+}
diff --git a/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/train/JointParerTester.java b/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/train/JointParerTester.java
@@ -23,14 +23,15 @@
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
-
+
 import org.apache.commons.cli.BasicParser;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.HelpFormatter;
 import org.apache.commons.cli.Options;
 import org.fnlp.nlp.parser.Sentence;
 import org.fnlp.nlp.parser.Target;
 import org.fnlp.nlp.parser.dep.JointParser;
+import org.fnlp.nlp.parser.dep.analysis.AnalysisTest;
 import org.fnlp.nlp.parser.dep.reader.CoNLLReader;
 import org.fnlp.nlp.parser.dep.reader.FNLPReader;
 import org.fnlp.nlp.parser.dep.reader.Malt2Reader;
@@ -128,17 +129,20 @@ public void test(String testFile, String resultFile, String charset)
 
 		float time = (endTime - beginTime) / 1000.0f;
 		System.out.println("finish! =]");
-		System.out.printf("total time:\t%.2f(s)\n", time);
-		System.out.printf("accuracy(depClass):\t%.8f\ttotal(words):\t%d\n",  1.0-1.0
+		System.out.printf("total time:\t%.2f(s)\n", time);
+		System.out.printf("average speed:\t%.4f(s/word)\t%.4f(s/sent)",  total
+				/ time, totsent / time);
+		System.out.println();
+		AnalysisTest at = new AnalysisTest();
+		at.test(resultFile);
+		/*System.out.printf("accuracy(depClass):\t%.8f\ttotal(words):\t%d\n",  1.0-1.0
 				* dError / total, total);
 		System.out.printf("accuracy(heads):\t%.8f\ttotal(words):\t%d\n",  1.0-1.0
 				* error / total, total);
 		System.out.printf("accuracy(sents):\t%.8f\ttotal(sents):\t%d\n", 1.0-1.0
 				* errsent / totsent, totsent);
 		System.out.printf("accuracy(root):\t%.8f\ttotal(root):\t%d\n", 1.0- 1.0
-				* errroot / totsent, totsent);
-		System.out.printf("average speed:\t%.4f(s/word)\t%.4f(s/sent)",  total
-				/ time, totsent / time);
+				* errroot / totsent, totsent);*/		
 	}
 
 	private void writeTo(BufferedWriter writer, Sentence instance, Target t)
diff --git a/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/train/JointParerTrainer.java b/fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/train/JointParerTrainer.java