|
10 | 10 | import org.ansj.domain.NewWord;
|
11 | 11 | import org.ansj.domain.TermNatures;
|
12 | 12 | import org.ansj.recognition.AsianPersonRecognition;
|
13 |
| -import org.ansj.recognition.CompanyRecogntion; |
14 | 13 | import org.ansj.recognition.ForeignPersonRecognition;
|
15 | 14 | import org.ansj.util.Graph;
|
16 | 15 |
|
|
22 | 21 | */
|
23 | 22 | public class LearnTool {
|
24 | 23 |
|
25 |
| - /** |
26 |
| - * 是否开启学习机 |
27 |
| - */ |
28 |
| - public boolean isCompany = true; |
29 |
| - |
30 |
| - public boolean isAsianName = true; |
31 |
| - |
32 |
| - public boolean isForeignName = true; |
33 |
| - |
34 |
| - /** |
35 |
| - * 告诉大家你学习了多少个词了 |
36 |
| - */ |
37 |
| - public int count; |
38 |
| - |
39 |
| - /** |
40 |
| - * 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做. |
41 |
| - */ |
42 |
| - private final SmartForest<NewWord> sf = new SmartForest<NewWord>(); |
43 |
| - |
44 |
| - /** |
45 |
| - * 公司名称学习. |
46 |
| - * |
47 |
| - * @param graph |
48 |
| - */ |
49 |
| - public void learn(Graph graph) { |
50 |
| - |
51 |
| - // 亚洲人名识别 |
52 |
| - if (isAsianName) { |
53 |
| - findAsianPerson(graph); |
54 |
| - } |
55 |
| - |
56 |
| - // 外国人名识别 |
57 |
| - if (isForeignName) { |
58 |
| - findForeignPerson(graph); |
59 |
| - } |
60 |
| - |
61 |
| - } |
62 |
| - |
63 |
| - private void findAsianPerson(Graph graph) { |
64 |
| - List<NewWord> newWords = new AsianPersonRecognition(graph.terms).getNewWords(); |
65 |
| - addListToTerm(newWords); |
66 |
| - } |
67 |
| - |
68 |
| - private void findForeignPerson(Graph graph) { |
69 |
| - List<NewWord> newWords = new ForeignPersonRecognition(graph.terms).getNewWords(); |
70 |
| - addListToTerm(newWords); |
71 |
| - } |
72 |
| - |
73 |
| - /** |
74 |
| - * 公司名称查找 |
75 |
| - * |
76 |
| - * @param graph |
77 |
| - */ |
78 |
| - private void findCompany(Graph graph) { |
79 |
| - List<NewWord> newWords = new CompanyRecogntion(graph.terms).getNewWords(); |
80 |
| - addListToTerm(newWords); |
81 |
| - } |
82 |
| - |
83 |
| - // 批量将新词加入到词典中 |
84 |
| - private void addListToTerm(List<NewWord> newWords) { |
85 |
| - if (newWords.size() == 0) |
86 |
| - return; |
87 |
| - |
88 |
| - for (NewWord newWord : newWords) { |
89 |
| - newWord.setScore(-1); |
90 |
| - addTerm(newWord); |
91 |
| - } |
92 |
| - } |
93 |
| - |
94 |
| - |
95 |
| - /** |
96 |
| - * 增加一个新词到树中 |
97 |
| - * |
98 |
| - * @param newWord |
99 |
| - */ |
100 |
| - public void addTerm(NewWord newWord) { |
101 |
| - NewWord temp = null; |
102 |
| - SmartForest<NewWord> smartForest = null; |
103 |
| - if ((smartForest = sf.getBranch(newWord.getName())) != null |
104 |
| - && smartForest.getParam() != null) { |
105 |
| - temp = smartForest.getParam(); |
106 |
| - temp.update(newWord.getScore(), newWord.getNature(), newWord.getAllFreq()); |
107 |
| - } else { |
108 |
| - count++; |
109 |
| - // 设置名字为空,节省内存空间 |
110 |
| - synchronized (sf) { |
111 |
| - sf.add(newWord.getName(), newWord); |
112 |
| - } |
113 |
| - } |
114 |
| - } |
115 |
| - |
116 |
| - public SmartForest<NewWord> getForest() { |
117 |
| - return this.sf; |
118 |
| - } |
119 |
| - |
120 |
| - /** |
121 |
| - * 返回学习到的新词. |
122 |
| - * |
123 |
| - * @param num |
124 |
| - * 返回数目.0为全部返回 |
125 |
| - * @return |
126 |
| - */ |
127 |
| - public List<Entry<String, Double>> getTopTree(int num) { |
128 |
| - return getTopTree(num, null); |
129 |
| - } |
130 |
| - |
131 |
| - public List<Entry<String, Double>> getTopTree(int num, TermNatures nature) { |
132 |
| - if (sf.branches == null) { |
133 |
| - return null; |
134 |
| - } |
135 |
| - HashMap<String, Double> hm = new HashMap<String, Double>(); |
136 |
| - for (int i = 0; i < sf.branches.length; i++) { |
137 |
| - valueResult(sf.branches[i], hm, nature); |
138 |
| - } |
139 |
| - List<Entry<String, Double>> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1); |
140 |
| - if (num == 0) { |
141 |
| - return sortMapByValue; |
142 |
| - } else { |
143 |
| - num = Math.min(num, sortMapByValue.size()); |
144 |
| - return sortMapByValue.subList(0, num); |
145 |
| - } |
146 |
| - } |
147 |
| - |
148 |
| - private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, |
149 |
| - TermNatures nature) { |
150 |
| - // TODO Auto-generated method stub |
151 |
| - if (smartForest == null || smartForest.branches==null) { |
152 |
| - return ; |
153 |
| - } |
154 |
| - for (int i = 0; i < smartForest.branches.length; i++) { |
155 |
| - NewWord param = smartForest.branches[i].getParam(); |
156 |
| - if (smartForest.branches[i].getStatus() == 3) { |
157 |
| - if (nature == null || param.getNature().equals(nature)) { |
158 |
| - hm.put(param.getName(), param.getScore()); |
159 |
| - } |
160 |
| - } else if (smartForest.branches[i].getStatus() == 2) { |
161 |
| - if (nature == null || param.getNature().equals(nature)) { |
162 |
| - hm.put(param.getName(), param.getScore()); |
163 |
| - } |
164 |
| - valueResult(smartForest.branches[i], hm, nature); |
165 |
| - } else { |
166 |
| - valueResult(smartForest.branches[i], hm, nature); |
167 |
| - } |
168 |
| - } |
169 |
| - } |
| 24 | + /** |
| 25 | + * 是否开启学习机 |
| 26 | + */ |
| 27 | + public boolean isAsianName = true; |
| 28 | + |
| 29 | + public boolean isForeignName = true; |
| 30 | + |
| 31 | + /** |
| 32 | + * 告诉大家你学习了多少个词了 |
| 33 | + */ |
| 34 | + public int count; |
| 35 | + |
| 36 | + /** |
| 37 | + * 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做. |
| 38 | + */ |
| 39 | + private final SmartForest<NewWord> sf = new SmartForest<NewWord>(); |
| 40 | + |
| 41 | + /** |
| 42 | + * 公司名称学习. |
| 43 | + * |
| 44 | + * @param graph |
| 45 | + */ |
| 46 | + public void learn(Graph graph) { |
| 47 | + |
| 48 | + // 亚洲人名识别 |
| 49 | + if (isAsianName) { |
| 50 | + findAsianPerson(graph); |
| 51 | + } |
| 52 | + |
| 53 | + // 外国人名识别 |
| 54 | + if (isForeignName) { |
| 55 | + findForeignPerson(graph); |
| 56 | + } |
| 57 | + |
| 58 | + } |
| 59 | + |
| 60 | + private void findAsianPerson(Graph graph) { |
| 61 | + List<NewWord> newWords = new AsianPersonRecognition(graph.terms).getNewWords(); |
| 62 | + addListToTerm(newWords); |
| 63 | + } |
| 64 | + |
| 65 | + private void findForeignPerson(Graph graph) { |
| 66 | + List<NewWord> newWords = new ForeignPersonRecognition(graph.terms).getNewWords(); |
| 67 | + addListToTerm(newWords); |
| 68 | + } |
| 69 | + |
| 70 | + // 批量将新词加入到词典中 |
| 71 | + private void addListToTerm(List<NewWord> newWords) { |
| 72 | + if (newWords.size() == 0) |
| 73 | + return; |
| 74 | + |
| 75 | + for (NewWord newWord : newWords) { |
| 76 | + newWord.setScore(-1); |
| 77 | + addTerm(newWord); |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + /** |
| 82 | + * 增加一个新词到树中 |
| 83 | + * |
| 84 | + * @param newWord |
| 85 | + */ |
| 86 | + public void addTerm(NewWord newWord) { |
| 87 | + NewWord temp = null; |
| 88 | + SmartForest<NewWord> smartForest = null; |
| 89 | + if ((smartForest = sf.getBranch(newWord.getName())) != null && smartForest.getParam() != null) { |
| 90 | + temp = smartForest.getParam(); |
| 91 | + temp.update(newWord.getScore(), newWord.getNature(), newWord.getAllFreq()); |
| 92 | + } else { |
| 93 | + count++; |
| 94 | + // 设置名字为空,节省内存空间 |
| 95 | + synchronized (sf) { |
| 96 | + sf.add(newWord.getName(), newWord); |
| 97 | + } |
| 98 | + } |
| 99 | + } |
| 100 | + |
| 101 | + public SmartForest<NewWord> getForest() { |
| 102 | + return this.sf; |
| 103 | + } |
| 104 | + |
| 105 | + /** |
| 106 | + * 返回学习到的新词. |
| 107 | + * |
| 108 | + * @param num |
| 109 | + * 返回数目.0为全部返回 |
| 110 | + * @return |
| 111 | + */ |
| 112 | + public List<Entry<String, Double>> getTopTree(int num) { |
| 113 | + return getTopTree(num, null); |
| 114 | + } |
| 115 | + |
| 116 | + public List<Entry<String, Double>> getTopTree(int num, TermNatures nature) { |
| 117 | + if (sf.branches == null) { |
| 118 | + return null; |
| 119 | + } |
| 120 | + HashMap<String, Double> hm = new HashMap<String, Double>(); |
| 121 | + for (int i = 0; i < sf.branches.length; i++) { |
| 122 | + valueResult(sf.branches[i], hm, nature); |
| 123 | + } |
| 124 | + List<Entry<String, Double>> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1); |
| 125 | + if (num == 0) { |
| 126 | + return sortMapByValue; |
| 127 | + } else { |
| 128 | + num = Math.min(num, sortMapByValue.size()); |
| 129 | + return sortMapByValue.subList(0, num); |
| 130 | + } |
| 131 | + } |
| 132 | + |
| 133 | + private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, TermNatures nature) { |
| 134 | + // TODO Auto-generated method stub |
| 135 | + if (smartForest == null || smartForest.branches == null) { |
| 136 | + return; |
| 137 | + } |
| 138 | + for (int i = 0; i < smartForest.branches.length; i++) { |
| 139 | + NewWord param = smartForest.branches[i].getParam(); |
| 140 | + if (smartForest.branches[i].getStatus() == 3) { |
| 141 | + if (nature == null || param.getNature().equals(nature)) { |
| 142 | + hm.put(param.getName(), param.getScore()); |
| 143 | + } |
| 144 | + } else if (smartForest.branches[i].getStatus() == 2) { |
| 145 | + if (nature == null || param.getNature().equals(nature)) { |
| 146 | + hm.put(param.getName(), param.getScore()); |
| 147 | + } |
| 148 | + valueResult(smartForest.branches[i], hm, nature); |
| 149 | + } else { |
| 150 | + valueResult(smartForest.branches[i], hm, nature); |
| 151 | + } |
| 152 | + } |
| 153 | + } |
170 | 154 | }
|
0 commit comments