Skip to content

Commit 72d4223

Browse files
author
cxzhu
committed
已进行过测试,在conll09中文的UAS结果是83.50
1 parent 8cf116d commit 72d4223

File tree

6 files changed

+295
-62
lines changed

6 files changed

+295
-62
lines changed

fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/JointParsingState.java

Lines changed: 67 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -125,47 +125,20 @@ public ArrayList<String> getFeatures() {
125125

126126
int rightFocus = leftFocus + 1;
127127

128-
// ISparseVector vec = new HashSparseVector();
129-
130-
StringBuilder posFeature1 = new StringBuilder();
131-
//左右词性
132-
posFeature1.append("+-2").append(POS).append(trees.get(leftFocus).pos)
133-
.append("/").append(trees.get(rightFocus).pos);
134-
featurelist.add(posFeature1.toString());
135-
//左右词性
136-
StringBuilder posFeature2 = new StringBuilder();
137-
posFeature2.append("+-4").append(POS).append(trees.get(leftFocus).pos)
138-
.append("/").append(trees.get(rightFocus).pos);
139-
posFeature2.append("/");
140-
if(leftFocus>0)
141-
posFeature2.append(trees.get(leftFocus-1).pos);
142-
posFeature2.append("/");
143-
if(rightFocus<trees.size()-1)
144-
posFeature2.append(trees.get(rightFocus+1).pos);
145-
146-
featurelist.add(posFeature2.toString());
147-
//左右词
148-
StringBuilder lexFeature1 = new StringBuilder();
149-
lexFeature1.append("+-2").append(LEX).append(trees.get(leftFocus).word)
150-
.append("/").append(trees.get(rightFocus).word);
151-
featurelist.add(lexFeature1.toString());
152-
153-
154-
StringBuilder lexFeature2 = new StringBuilder();
155-
lexFeature2.append("+-4").append(LEX).append(trees.get(leftFocus).word)
156-
.append("/").append(trees.get(rightFocus).word);
157-
lexFeature2.append("/");
158-
if(leftFocus>0)
159-
lexFeature2.append(trees.get(leftFocus-1).word);
160-
lexFeature2.append("/");
161-
if(rightFocus<trees.size()-1)
162-
lexFeature2.append(trees.get(rightFocus+1).word);
163-
164-
featurelist.add(lexFeature2.toString());
128+
// ISparseVector vec = new HashSparseVector();
129+
//所有的联合feature
130+
featurelist.add(combinedFeature("+0+1", POS, new int[]{0, 1}));
131+
featurelist.add(combinedFeature("-1+0+1", POS, new int[]{-1, 0, 1}));
132+
featurelist.add(combinedFeature("+0+1+2", POS, new int[]{0, 1, 2}));
133+
featurelist.add(combinedFeature("+1+2+3", POS, new int[]{1, 2, 3}));
134+
featurelist.add(combinedFeature("-2+3+4", POS, new int[]{2, 3, 4}));
135+
featurelist.add(combinedFeature("+0+1", LEX, new int[]{0, 1}));
136+
featurelist.add(combinedFeature("-1+0+1", LEX, new int[]{-1, 0, 1}));
137+
featurelist.add(combinedFeature("+0+1+2", LEX, new int[]{0, 1, 2}));
165138

166139
// 设定上下文窗口大小
167140
int l = 2;
168-
int r = 2;
141+
int r = 4;
169142
for (int i = 0; i <= l; i++) {
170143
// 特征前缀
171144
String posFeature = "-" + String.valueOf(i) + POS;
@@ -178,7 +151,11 @@ public ArrayList<String> getFeatures() {
178151
String rcLexFeature = "-" + String.valueOf(i)
179152
+ CH_R_LEX;
180153
String rcPosFeature = "-" + String.valueOf(i)
181-
+ CH_R_POS;
154+
+ CH_R_POS;
155+
String lcDepFeature = "-" + String.valueOf(i)
156+
+ CH_L_DEP;
157+
String rcDepFeature = "-" + String.valueOf(i)
158+
+ CH_R_DEP;
182159

183160
if (leftFocus - i < 0) {
184161
featurelist.add(lexFeature + START + String.valueOf(i - leftFocus));
@@ -195,7 +172,9 @@ public ArrayList<String> getFeatures() {
195172
featurelist.add(lcLexFeature
196173
+ sent.words[leftChildIndex]);
197174
featurelist.add(lcPosFeature
198-
+ sent.tags[leftChildIndex]);
175+
+ sent.tags[leftChildIndex]);
176+
featurelist.add(lcDepFeature
177+
+ sent.getDepClass(leftChildIndex));
199178
}
200179
}else{
201180
featurelist.add(lcLexFeature + NULL);
@@ -210,7 +189,9 @@ public ArrayList<String> getFeatures() {
210189
featurelist.add(rcLexFeature
211190
+ sent.words[rightChildIndex]);
212191
featurelist.add(rcPosFeature
213-
+ sent.tags[rightChildIndex]);
192+
+ sent.tags[rightChildIndex]);
193+
featurelist.add(rcDepFeature
194+
+ sent.getDepClass(rightChildIndex));
214195
}
215196
}else{
216197
featurelist.add(rcLexFeature + NULL);
@@ -275,6 +256,51 @@ public ArrayList<String> getFeatures() {
275256

276257

277258
return featurelist;
259+
}
260+
261+
/**
262+
*
263+
* @param sign
264+
* 该类feature在字符串中的标志,如"-0+0"
265+
* @param posOrLex
266+
* 该feature是取pos还是lex
267+
* @param locations
268+
* 选取的这些联合feature的位置,以leftFocus为准的偏移量
269+
* @return
270+
* 联合feature的字符串形式
271+
*/
272+
private String combinedFeature(String sign, String posOrLex, int[] locations){
273+
StringBuilder cf = new StringBuilder();
274+
cf.append(sign);
275+
cf.append(posOrLex);
276+
for(int loc:locations){
277+
int focus = leftFocus + loc;
278+
if(isCrossBorder(focus)){
279+
cf.append(NULL);
280+
}
281+
else{
282+
cf.append(getPosOrLex(posOrLex, focus));
283+
}
284+
cf.append("/");
285+
}
286+
return cf.toString();
287+
}
288+
289+
private String getPosOrLex(String posOrLex, int focus){
290+
if(posOrLex.equals(LEX)){
291+
return trees.get(focus).word;
292+
}
293+
else if(posOrLex.equals(POS)){
294+
return trees.get(focus).pos;
295+
}
296+
return null;
297+
}
298+
299+
private boolean isCrossBorder(int focus){
300+
if(focus >= 0 && focus < trees.size()){
301+
return false;
302+
}
303+
return true;
278304
}
279305

280306
public boolean isFinalState() {
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package org.fnlp.nlp.parser.dep.analysis;
2+
3+
/**
 * Plain holder for one sentence of a parser result file: six parallel
 * arrays with one entry per token. The arrays are stored as given, without
 * copying.
 */
public class AnalysisSentence {
    /** Token forms. */
    public String[] forms;
    /** Tag of each token. */
    public String[] tags;
    /** Gold head of each token. */
    public int[] goldhead;
    /** Gold relation label of each token. */
    public String[] goldrel;
    /** Predicted head of each token. */
    public int[] predhead;
    /** Predicted relation label of each token. */
    public String[] predrel;

    /**
     * Stores the given arrays by reference.
     *
     * @param forms    token forms
     * @param tags     token tags
     * @param goldhead gold heads
     * @param goldrel  gold relation labels
     * @param predhead predicted heads
     * @param predrel  predicted relation labels
     */
    public AnalysisSentence(String[] forms, String[] tags, int[] goldhead,
            String[] goldrel, int[] predhead, String[] predrel) {
        this.forms = forms;
        this.tags = tags;

        this.goldhead = goldhead;
        this.predhead = predhead;

        this.goldrel = goldrel;
        this.predrel = predrel;
    }

    /**
     * @return the number of tokens, taken from the length of {@code forms}
     */
    public int length() {
        final String[] tokens = this.forms;
        return tokens.length;
    }
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
package org.fnlp.nlp.parser.dep.analysis;
2+
3+
import java.io.IOException;
4+
5+
public class AnalysisTest {
6+
private int errhead = 0;
7+
private int headsum = 0;
8+
private int errhead_dep = 0;
9+
private int head_depsum = 0;
10+
private int errsent = 0;
11+
private int sent_sum = 0;
12+
private int errroot = 0;
13+
private int root_sum = 0;
14+
15+
public void test(String resultFile) throws IOException{
16+
ResultReader reader = new ResultReader(resultFile);
17+
while(reader.hasNext()){
18+
AnalysisSentence sent = reader.next();
19+
//分析一下UAS,LAS,CM,ROOT
20+
judgeUAS(sent);
21+
judgeLAS(sent);
22+
judgeRoot(sent);
23+
}
24+
print();
25+
}
26+
27+
private void judgeRoot(AnalysisSentence sent){
28+
for(int i=0; i<sent.length(); i++){
29+
if(sent.goldhead[i] == -1){
30+
root_sum++;
31+
if(sent.goldhead[i] != sent.predhead[i]){
32+
errroot++;
33+
}
34+
continue;
35+
}
36+
}
37+
}
38+
39+
private void judgeLAS(AnalysisSentence sent){
40+
for(int i=0; i<sent.length(); i++){
41+
if(isBiaodian(i, sent))
42+
continue;
43+
head_depsum++;
44+
if(sent.goldhead[i] == -1){
45+
if(sent.goldhead[i] != sent.predhead[i]){
46+
errhead_dep++;
47+
}
48+
continue;
49+
}
50+
if(sent.goldhead[i] != sent.predhead[i]
51+
|| !sent.goldrel[i].equals(sent.predrel[i])){
52+
errhead_dep++;
53+
}
54+
}
55+
}
56+
57+
private void judgeUAS(AnalysisSentence sent){
58+
boolean isUEM = true;
59+
for(int i=0; i<sent.length(); i++){
60+
if(isBiaodian(i, sent))
61+
continue;
62+
headsum++;
63+
if(sent.goldhead[i] != sent.predhead[i]){
64+
errhead++;
65+
isUEM = false;
66+
}
67+
}
68+
if(!isUEM){
69+
errsent++;
70+
}
71+
sent_sum++;
72+
}
73+
74+
private boolean isBiaodian(int i, AnalysisSentence sent){
75+
String[] posBiaodian = new String[]{",",".","``",":","''","$","-RRB-","-LRB-","#","SYM", "PU"};
76+
for(String s:posBiaodian){
77+
if(sent.tags[i].equals(s)){
78+
return true;
79+
}
80+
}
81+
return false;
82+
}
83+
84+
private void print(){
85+
System.out.println("***************************************");
86+
System.out.printf("rate(UAS):\t%.8f\ttotal(words):\t%d\n", 1 - 1.0
87+
* errhead / headsum, headsum);
88+
System.out.printf("rate(LAS):\t%.8f\ttotal(words):\t%d\n", 1 - 1.0
89+
* errhead_dep / head_depsum, head_depsum);
90+
System.out.printf("rate(root):\t%.8f\ttotal(roots):\t%d\n", 1 - 1.0
91+
* errroot / root_sum, root_sum);
92+
System.out.printf("rate(UEM):\t%.8f\ttotal(sent):\t%d\n", 1 - 1.0
93+
* errsent / sent_sum, sent_sum);
94+
}
95+
96+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package org.fnlp.nlp.parser.dep.analysis;
2+
3+
import java.io.BufferedReader;
4+
import java.io.FileInputStream;
5+
import java.io.IOException;
6+
import java.io.InputStreamReader;
7+
import java.util.ArrayList;
8+
import java.util.List;
9+
10+
public class ResultReader {
11+
12+
BufferedReader reader = null;
13+
AnalysisSentence next = null;
14+
List<String[]> carrier = new ArrayList<String[]>();
15+
16+
public ResultReader(String filepath) throws IOException {
17+
reader = new BufferedReader(new InputStreamReader(new FileInputStream(
18+
filepath), "UTF-8"));
19+
advance();
20+
}
21+
22+
private void advance() throws IOException {
23+
String line = null;
24+
carrier.clear();
25+
while ((line = reader.readLine()) != null) {
26+
line = line.trim();
27+
if (line.matches("^$"))
28+
break;
29+
carrier.add(line.split("\\t+|\\s+"));
30+
}
31+
32+
next = null;
33+
if (!carrier.isEmpty()) {
34+
String[] forms = new String[carrier.size()];
35+
String[] tags = new String[carrier.size()];
36+
int[] goldhead = new int[carrier.size()];
37+
String[] goldrel = new String[carrier.size()];
38+
int[] predhead = new int[carrier.size()];
39+
String[] predrel = new String[carrier.size()];
40+
for (int i = 0; i < carrier.size(); i++) {
41+
String[] tokens = carrier.get(i);
42+
forms[i] = tokens[0];
43+
tags[i] = tokens[1];
44+
goldhead[i] = Integer.parseInt(tokens[2]);
45+
goldrel[i] = tokens[3];
46+
predhead[i] = Integer.parseInt(tokens[4]);
47+
predrel[i] = tokens[5];
48+
}
49+
50+
next = new AnalysisSentence(forms, tags, goldhead, goldrel, predhead, predrel);
51+
}
52+
}
53+
54+
public boolean hasNext() {
55+
return (next != null);
56+
}
57+
58+
public AnalysisSentence next() {
59+
AnalysisSentence cur = next;
60+
try {
61+
advance();
62+
} catch (IOException e) {
63+
e.printStackTrace();
64+
}
65+
return cur;
66+
}
67+
}

fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/train/JointParerTester.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,15 @@
2323
import java.io.FileOutputStream;
2424
import java.io.IOException;
2525
import java.io.OutputStreamWriter;
26-
26+
2727
import org.apache.commons.cli.BasicParser;
2828
import org.apache.commons.cli.CommandLine;
2929
import org.apache.commons.cli.HelpFormatter;
3030
import org.apache.commons.cli.Options;
3131
import org.fnlp.nlp.parser.Sentence;
3232
import org.fnlp.nlp.parser.Target;
3333
import org.fnlp.nlp.parser.dep.JointParser;
34+
import org.fnlp.nlp.parser.dep.analysis.AnalysisTest;
3435
import org.fnlp.nlp.parser.dep.reader.CoNLLReader;
3536
import org.fnlp.nlp.parser.dep.reader.FNLPReader;
3637
import org.fnlp.nlp.parser.dep.reader.Malt2Reader;
@@ -128,17 +129,20 @@ public void test(String testFile, String resultFile, String charset)
128129

129130
float time = (endTime - beginTime) / 1000.0f;
130131
System.out.println("finish! =]");
131-
System.out.printf("total time:\t%.2f(s)\n", time);
132-
System.out.printf("accuracy(depClass):\t%.8f\ttotal(words):\t%d\n", 1.0-1.0
132+
System.out.printf("total time:\t%.2f(s)\n", time);
133+
System.out.printf("average speed:\t%.4f(s/word)\t%.4f(s/sent)", total
134+
/ time, totsent / time);
135+
System.out.println();
136+
AnalysisTest at = new AnalysisTest();
137+
at.test(resultFile);
138+
/*System.out.printf("accuracy(depClass):\t%.8f\ttotal(words):\t%d\n", 1.0-1.0
133139
* dError / total, total);
134140
System.out.printf("accuracy(heads):\t%.8f\ttotal(words):\t%d\n", 1.0-1.0
135141
* error / total, total);
136142
System.out.printf("accuracy(sents):\t%.8f\ttotal(sents):\t%d\n", 1.0-1.0
137143
* errsent / totsent, totsent);
138144
System.out.printf("accuracy(root):\t%.8f\ttotal(root):\t%d\n", 1.0- 1.0
139-
* errroot / totsent, totsent);
140-
System.out.printf("average speed:\t%.4f(s/word)\t%.4f(s/sent)", total
141-
/ time, totsent / time);
145+
* errroot / totsent, totsent);*/
142146
}
143147

144148
private void writeTo(BufferedWriter writer, Sentence instance, Target t)

0 commit comments

Comments
 (0)