Skip to content

Commit 7b0f528

Browse files
committed
crf模型读取,大幅度提高了oov的识别
1 parent 03f1bba commit 7b0f528

36 files changed

+1144
-1531
lines changed

README.md~

Lines changed: 0 additions & 90 deletions
This file was deleted.

plug/ansj_lucene4_plug/src/test/java/org/ansj/ansj_lucene4_plug/Test.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,26 @@
11
package org.ansj.ansj_lucene4_plug;
22

33
import org.apache.lucene.index.Term;
4+
import org.apache.lucene.search.FieldValueFilter;
5+
import org.apache.lucene.search.FilteredQuery;
46
import org.apache.lucene.search.spans.SpanQuery;
57
import org.apache.lucene.search.spans.SpanTermQuery;
8+
import org.apache.lucene.search.vectorhighlight.FieldQuery;
69

710
public class Test {
811
public static void main(String[] args) {
912
Term term=new Term("category","/technology/computers/programming");
1013

1114
SpanQuery query=new SpanTermQuery(term);
1215

16+
17+
FieldValueFilter fvf = new FieldValueFilter("_id") ;
18+
19+
FilteredQuery filteredQuery = new FilteredQuery(query, fvf) ;
20+
21+
System.out.println(filteredQuery);
22+
23+
System.out.println(fvf);
1324
System.out.println(query);
1425

1526
}

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<artifactId>ansj_seg</artifactId>
66
<packaging>jar</packaging>
77
<name>ansj_seg</name>
8-
<version>1.0.alpha</version>
8+
<version>1.1.alpha</version>
99
<description>best java chinese word seg ! </description>
1010
<url>https://github.com/ansjsun/ansj_seg</url>
1111
<licenses>
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
package org.ansj.app.crf;
2+
3+
import java.io.BufferedInputStream;
4+
import java.io.BufferedOutputStream;
5+
import java.io.FileInputStream;
6+
import java.io.FileNotFoundException;
7+
import java.io.FileOutputStream;
8+
import java.io.IOException;
9+
import java.io.InputStream;
10+
import java.io.ObjectInputStream;
11+
import java.io.ObjectOutputStream;
12+
import java.util.ArrayList;
13+
import java.util.List;
14+
import java.util.Map;
15+
import java.util.Map.Entry;
16+
import java.util.zip.GZIPInputStream;
17+
import java.util.zip.GZIPOutputStream;
18+
19+
import love.cq.domain.SmartForest;
20+
21+
import org.ansj.app.crf.pojo.Element;
22+
import org.ansj.app.crf.pojo.Feature;
23+
import org.ansj.app.crf.pojo.Template;
24+
25+
public abstract class Model {
26+
27+
public static enum MODEL_TYPE {
28+
CRF, EMM
29+
};
30+
31+
protected Template template = null;
32+
33+
protected double[][] status = null;
34+
35+
protected Map<String, Feature> myGrad;
36+
37+
protected SmartForest<double[][]> smartForest = null;
38+
39+
public int allFeatureCount = 0;
40+
41+
private List<Element> leftList = null;
42+
43+
private List<Element> rightList = null;
44+
45+
public int end1;
46+
47+
public int end2;
48+
49+
/**
50+
* 根据模板文件解析特征
51+
*
52+
* @param template
53+
* @throws IOException
54+
*/
55+
private void makeSide(int left, int right) throws IOException {
56+
// TODO Auto-generated method stub
57+
58+
leftList = new ArrayList<Element>(Math.abs(left));
59+
for (int i = left; i < 0; i++) {
60+
leftList.add(new Element((char) ('B' + i)));
61+
}
62+
63+
rightList = new ArrayList<Element>(right);
64+
for (int i = 1; i < right + 1; i++) {
65+
rightList.add(new Element((char) ('B' + i)));
66+
}
67+
}
68+
69+
/**
70+
* 讲模型写入
71+
*
72+
* @param path
73+
* @throws FileNotFoundException
74+
* @throws IOException
75+
*/
76+
public void writeModel(String path) throws FileNotFoundException, IOException {
77+
// TODO Auto-generated method stub
78+
79+
System.out.println("compute ok now to save model!");
80+
// 写模型
81+
ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(path))));
82+
83+
// 配置模板
84+
oos.writeObject(template);
85+
// 特征转移率
86+
oos.writeObject(status);
87+
// 总共的特征数
88+
oos.writeInt(myGrad.size());
89+
double[] ds = null;
90+
for (Entry<String, Feature> entry : myGrad.entrySet()) {
91+
oos.writeUTF(entry.getKey());
92+
for (int i = 0; i < template.ft.length; i++) {
93+
ds = entry.getValue().w[i];
94+
for (int j = 0; j < ds.length; j++) {
95+
oos.writeByte(j);
96+
oos.writeFloat((float) ds[j]);
97+
}
98+
oos.writeByte(-1);
99+
}
100+
}
101+
102+
oos.flush();
103+
oos.close();
104+
105+
}
106+
107+
/**
108+
* 模型读取
109+
*
110+
* @param path
111+
* @return
112+
* @return
113+
* @throws FileNotFoundException
114+
* @throws IOException
115+
* @throws ClassNotFoundException
116+
*/
117+
public static Model loadModel(String modelPath) throws Exception {
118+
return loadModel(new FileInputStream(modelPath));
119+
120+
}
121+
122+
public static Model loadModel(InputStream modelStream) throws Exception {
123+
ObjectInputStream ois = null;
124+
try {
125+
ois = new ObjectInputStream(new BufferedInputStream(new GZIPInputStream(modelStream)));
126+
127+
Model model = new Model() {
128+
129+
@Override
130+
public void writeModel(String path) throws FileNotFoundException, IOException {
131+
// TODO Auto-generated method stub
132+
throw new RuntimeException("you can not to calculate ,this model only use by cut ");
133+
}
134+
135+
};
136+
137+
model.template = (Template) ois.readObject();
138+
139+
model.makeSide(model.template.left, model.template.right);
140+
141+
int tagNum = model.template.tagNum;
142+
143+
int featureNum = model.template.ft.length;
144+
145+
model.smartForest = new SmartForest<double[][]>(0.8);
146+
147+
model.status = (double[][]) ois.readObject();
148+
149+
// 总共的特征数
150+
double[][] w = null;
151+
String key = null;
152+
int b = 0;
153+
int featureCount = ois.readInt();
154+
for (int i = 0; i < featureCount; i++) {
155+
key = ois.readUTF();
156+
w = new double[featureNum][0];
157+
for (int j = 0; j < featureNum; j++) {
158+
while ((b = ois.readByte()) != -1) {
159+
if (w[j].length == 0) {
160+
w[j] = new double[tagNum];
161+
}
162+
w[j][b] = ois.readFloat();
163+
}
164+
}
165+
model.smartForest.add(key, w);
166+
}
167+
168+
return model;
169+
} finally {
170+
if (ois != null) {
171+
ois.close();
172+
}
173+
}
174+
}
175+
176+
public double[] getFeature(int featureIndex, char... chars) {
177+
// TODO Auto-generated method stub
178+
SmartForest<double[][]> sf = smartForest;
179+
sf = sf.getBranch(chars);
180+
if (sf == null || sf.getParam() == null) {
181+
return null;
182+
}
183+
return sf.getParam()[featureIndex];
184+
}
185+
186+
/**
187+
* tag转移率
188+
*
189+
* @param s1
190+
* @param s2
191+
* @return
192+
*/
193+
public double tagRate(int s1, int s2) {
194+
// TODO Auto-generated method stub
195+
return status[s1][s2];
196+
}
197+
198+
}

0 commit comments

Comments
 (0)