Skip to content

Commit ab5a16f

Browse files
committed
做了一些nlp分词方式的改动
1 parent d6748ee commit ab5a16f

26 files changed

+3990
-45604
lines changed

pom.xml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<artifactId>ansj_seg</artifactId>
66
<packaging>jar</packaging>
77
<name>ansj_seg</name>
8-
<version>1.1.alpha</version>
8+
<version>1.1</version>
99
<description>best java chinese word seg ! </description>
1010
<url>https://github.com/ansjsun/ansj_seg</url>
1111
<licenses>
@@ -64,7 +64,6 @@
6464
</configuration>
6565
</plugin>
6666

67-
6867
<plugin>
6968
<artifactId>maven-deploy-plugin</artifactId>
7069
<version>2.7</version>

src/main/java/org/ansj/dic/LearnTool.java

Lines changed: 130 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import org.ansj.domain.NewWord;
1111
import org.ansj.domain.TermNatures;
1212
import org.ansj.recognition.AsianPersonRecognition;
13-
import org.ansj.recognition.CompanyRecogntion;
1413
import org.ansj.recognition.ForeignPersonRecognition;
1514
import org.ansj.util.Graph;
1615

@@ -22,149 +21,134 @@
2221
*/
2322
public class LearnTool {
2423

25-
/**
26-
* 是否开启学习机
27-
*/
28-
public boolean isCompany = true;
29-
30-
public boolean isAsianName = true;
31-
32-
public boolean isForeignName = true;
33-
34-
/**
35-
* 告诉大家你学习了多少个词了
36-
*/
37-
public int count;
38-
39-
/**
40-
* 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做.
41-
*/
42-
private final SmartForest<NewWord> sf = new SmartForest<NewWord>();
43-
44-
/**
45-
* 公司名称学习.
46-
*
47-
* @param graph
48-
*/
49-
public void learn(Graph graph) {
50-
51-
// 亚洲人名识别
52-
if (isAsianName) {
53-
findAsianPerson(graph);
54-
}
55-
56-
// 外国人名识别
57-
if (isForeignName) {
58-
findForeignPerson(graph);
59-
}
60-
61-
}
62-
63-
private void findAsianPerson(Graph graph) {
64-
List<NewWord> newWords = new AsianPersonRecognition(graph.terms).getNewWords();
65-
addListToTerm(newWords);
66-
}
67-
68-
private void findForeignPerson(Graph graph) {
69-
List<NewWord> newWords = new ForeignPersonRecognition(graph.terms).getNewWords();
70-
addListToTerm(newWords);
71-
}
72-
73-
/**
74-
* 公司名称查找
75-
*
76-
* @param graph
77-
*/
78-
private void findCompany(Graph graph) {
79-
List<NewWord> newWords = new CompanyRecogntion(graph.terms).getNewWords();
80-
addListToTerm(newWords);
81-
}
82-
83-
// 批量将新词加入到词典中
84-
private void addListToTerm(List<NewWord> newWords) {
85-
if (newWords.size() == 0)
86-
return;
87-
88-
for (NewWord newWord : newWords) {
89-
newWord.setScore(-1);
90-
addTerm(newWord);
91-
}
92-
}
93-
94-
95-
/**
96-
* 增加一个新词到树中
97-
*
98-
* @param newWord
99-
*/
100-
public void addTerm(NewWord newWord) {
101-
NewWord temp = null;
102-
SmartForest<NewWord> smartForest = null;
103-
if ((smartForest = sf.getBranch(newWord.getName())) != null
104-
&& smartForest.getParam() != null) {
105-
temp = smartForest.getParam();
106-
temp.update(newWord.getScore(), newWord.getNature(), newWord.getAllFreq());
107-
} else {
108-
count++;
109-
// 设置名字为空,节省内存空间
110-
synchronized (sf) {
111-
sf.add(newWord.getName(), newWord);
112-
}
113-
}
114-
}
115-
116-
public SmartForest<NewWord> getForest() {
117-
return this.sf;
118-
}
119-
120-
/**
121-
* 返回学习到的新词.
122-
*
123-
* @param num
124-
* 返回数目.0为全部返回
125-
* @return
126-
*/
127-
public List<Entry<String, Double>> getTopTree(int num) {
128-
return getTopTree(num, null);
129-
}
130-
131-
public List<Entry<String, Double>> getTopTree(int num, TermNatures nature) {
132-
if (sf.branches == null) {
133-
return null;
134-
}
135-
HashMap<String, Double> hm = new HashMap<String, Double>();
136-
for (int i = 0; i < sf.branches.length; i++) {
137-
valueResult(sf.branches[i], hm, nature);
138-
}
139-
List<Entry<String, Double>> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1);
140-
if (num == 0) {
141-
return sortMapByValue;
142-
} else {
143-
num = Math.min(num, sortMapByValue.size());
144-
return sortMapByValue.subList(0, num);
145-
}
146-
}
147-
148-
private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm,
149-
TermNatures nature) {
150-
// TODO Auto-generated method stub
151-
if (smartForest == null || smartForest.branches==null) {
152-
return ;
153-
}
154-
for (int i = 0; i < smartForest.branches.length; i++) {
155-
NewWord param = smartForest.branches[i].getParam();
156-
if (smartForest.branches[i].getStatus() == 3) {
157-
if (nature == null || param.getNature().equals(nature)) {
158-
hm.put(param.getName(), param.getScore());
159-
}
160-
} else if (smartForest.branches[i].getStatus() == 2) {
161-
if (nature == null || param.getNature().equals(nature)) {
162-
hm.put(param.getName(), param.getScore());
163-
}
164-
valueResult(smartForest.branches[i], hm, nature);
165-
} else {
166-
valueResult(smartForest.branches[i], hm, nature);
167-
}
168-
}
169-
}
24+
/**
25+
* 是否开启学习机
26+
*/
27+
public boolean isAsianName = true;
28+
29+
public boolean isForeignName = true;
30+
31+
/**
32+
* 告诉大家你学习了多少个词了
33+
*/
34+
public int count;
35+
36+
/**
37+
* 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做.
38+
*/
39+
private final SmartForest<NewWord> sf = new SmartForest<NewWord>();
40+
41+
/**
42+
* Learns new words from the graph: Asian person names and foreign person names.
43+
*
44+
* @param graph
45+
*/
46+
public void learn(Graph graph) {
47+
48+
// 亚洲人名识别
49+
if (isAsianName) {
50+
findAsianPerson(graph);
51+
}
52+
53+
// 外国人名识别
54+
if (isForeignName) {
55+
findForeignPerson(graph);
56+
}
57+
58+
}
59+
60+
private void findAsianPerson(Graph graph) {
61+
List<NewWord> newWords = new AsianPersonRecognition(graph.terms).getNewWords();
62+
addListToTerm(newWords);
63+
}
64+
65+
private void findForeignPerson(Graph graph) {
66+
List<NewWord> newWords = new ForeignPersonRecognition(graph.terms).getNewWords();
67+
addListToTerm(newWords);
68+
}
69+
70+
// 批量将新词加入到词典中
71+
private void addListToTerm(List<NewWord> newWords) {
72+
if (newWords.size() == 0)
73+
return;
74+
75+
for (NewWord newWord : newWords) {
76+
newWord.setScore(-1);
77+
addTerm(newWord);
78+
}
79+
}
80+
81+
/**
82+
* 增加一个新词到树中
83+
*
84+
* @param newWord
85+
*/
86+
public void addTerm(NewWord newWord) {
87+
NewWord temp = null;
88+
SmartForest<NewWord> smartForest = null;
89+
if ((smartForest = sf.getBranch(newWord.getName())) != null && smartForest.getParam() != null) {
90+
temp = smartForest.getParam();
91+
temp.update(newWord.getScore(), newWord.getNature(), newWord.getAllFreq());
92+
} else {
93+
count++;
94+
// Word not seen before: insert it into the forest while holding the lock on sf
95+
synchronized (sf) {
96+
sf.add(newWord.getName(), newWord);
97+
}
98+
}
99+
}
100+
101+
public SmartForest<NewWord> getForest() {
102+
return this.sf;
103+
}
104+
105+
/**
106+
* 返回学习到的新词.
107+
*
108+
* @param num
109+
* 返回数目.0为全部返回
110+
* @return
111+
*/
112+
public List<Entry<String, Double>> getTopTree(int num) {
113+
return getTopTree(num, null);
114+
}
115+
116+
public List<Entry<String, Double>> getTopTree(int num, TermNatures nature) {
117+
if (sf.branches == null) {
118+
return null;
119+
}
120+
HashMap<String, Double> hm = new HashMap<String, Double>();
121+
for (int i = 0; i < sf.branches.length; i++) {
122+
valueResult(sf.branches[i], hm, nature);
123+
}
124+
List<Entry<String, Double>> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1);
125+
if (num == 0) {
126+
return sortMapByValue;
127+
} else {
128+
num = Math.min(num, sortMapByValue.size());
129+
return sortMapByValue.subList(0, num);
130+
}
131+
}
132+
133+
private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, TermNatures nature) {
134+
// TODO Auto-generated method stub
135+
if (smartForest == null || smartForest.branches == null) {
136+
return;
137+
}
138+
for (int i = 0; i < smartForest.branches.length; i++) {
139+
NewWord param = smartForest.branches[i].getParam();
140+
if (smartForest.branches[i].getStatus() == 3) {
141+
if (nature == null || param.getNature().equals(nature)) {
142+
hm.put(param.getName(), param.getScore());
143+
}
144+
} else if (smartForest.branches[i].getStatus() == 2) {
145+
if (nature == null || param.getNature().equals(nature)) {
146+
hm.put(param.getName(), param.getScore());
147+
}
148+
valueResult(smartForest.branches[i], hm, nature);
149+
} else {
150+
valueResult(smartForest.branches[i], hm, nature);
151+
}
152+
}
153+
}
170154
}

src/main/java/org/ansj/domain/CompanyNatureAttr.java

Lines changed: 0 additions & 68 deletions
This file was deleted.

0 commit comments

Comments
 (0)