Skip to content

Commit 450c21c

Browse files
committed
Merge pull request #17 from tesseract2048/master
Optimize MurmurHash (benchmarked 33% faster). Add a serialVersionUID to fix a serialization incompatibility problem.
2 parents 6549226 + 770f0cf commit 450c21c

File tree

5 files changed

+146
-93
lines changed

5 files changed

+146
-93
lines changed

fnlp-core/src/main/java/org/fnlp/nlp/cn/tag/CWSTagger.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,8 @@ public static void main(String[] args) throws Exception {
229229
HelpFormatter f = new HelpFormatter();
230230
f.printHelp(
231231
"SEG:\n"
232-
+ "java edu.fudan.nlp.tag.CWSTagger -f model_file input_file output_file;\n"
233-
+ "java edu.fudan.nlp.tag.CWSTagger -s model_file string_to_segement",
232+
+ "java org.fnlp.tag.CWSTagger -f model_file input_file output_file;\n"
233+
+ "java org.fnlp.tag.CWSTagger -s model_file string_to_segement",
234234
opt);
235235
return;
236236
}

fnlp-core/src/main/java/org/fnlp/util/MyStrings.java

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
/**
2-
* This file is part of FNLP (formerly FudanNLP).
3-
*
4-
* FNLP is free software: you can redistribute it and/or modify
5-
* it under the terms of the GNU Lesser General Public License as published by
6-
* the Free Software Foundation, either version 3 of the License, or
7-
* (at your option) any later version.
8-
*
9-
* FNLP is distributed in the hope that it will be useful,
10-
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11-
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12-
* GNU Lesser General Public License for more details.
13-
*
14-
* You should have received a copy of the GNU General Public License
15-
* along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
16-
*
17-
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
18-
*/
2+
* This file is part of FNLP (formerly FudanNLP).
3+
*
4+
* FNLP is free software: you can redistribute it and/or modify
5+
* it under the terms of the GNU Lesser General Public License as published by
6+
* the Free Software Foundation, either version 3 of the License, or
7+
* (at your option) any later version.
8+
*
9+
* FNLP is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
* GNU Lesser General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU General Public License
15+
* along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
16+
*
17+
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
18+
*/
1919

2020
package org.fnlp.util;
2121

@@ -116,9 +116,9 @@ public static String toString(float[] s, String delim) {
116116
sb.append(delim);
117117
}
118118
return sb.toString();
119-
119+
120120
}
121-
121+
122122
/**
123123
* 字符在字符串中出现的次数
124124
*
@@ -155,8 +155,28 @@ public static String toString(float[][] s) {
155155
sb.append("\n");
156156
}
157157
return sb.toString();
158+
}
159+
/**
160+
* 统计文本中某个字符串出现的次数
161+
* @param str
162+
* @param target
163+
* @return
164+
* 下午5:33:24
165+
*/
166+
public static int count(String str,String target){
167+
int count=0;
168+
int index=0;
169+
while(true){
170+
index=str.indexOf(target,index+1);
171+
if(index>0){
172+
count++;
173+
}else{
174+
break;
175+
}
176+
}
177+
return count;
158178
}
159179

160-
180+
161181

162182
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/**
2+
*
3+
*/
4+
package org.fnlp.util;
5+
6+
import java.util.ArrayList;
7+
import java.util.HashMap;
8+
9+
/**
 * Minimal holder for parsed command-line arguments.
 * <p>
 * An argument that starts with '-' is treated as an option key and consumes the
 * next argument as its value; every other argument is collected as a positional
 * argument.
 *
 * @author Xipeng Qiu E-mail: xpqiu@fudan.edu.cn
 * @version created 2015-01-06 16:39:23
 * @since fnlp 2.1
 */
public class Options {
    /** Option key (including its leading '-') mapped to the value that followed it. */
    HashMap<String,String> options = new HashMap<String,String>();
    /** Positional (non-option) arguments, in the order they were given. */
    ArrayList<String> rootArgs = new ArrayList<String>();

    /**
     * Parses the given arguments into {@link #options} and {@link #rootArgs}.
     * Results accumulate across calls; a repeated option key keeps the last value.
     *
     * @param args the raw command-line arguments
     * @throws IllegalArgumentException if the last argument is an option key
     *         with no value after it
     */
    public void parsing(String[] args) {
        for (int i = 0; i < args.length; ++i) {
            String arg = args[i];
            // startsWith is safe for an empty argument, unlike charAt(0).
            if (!arg.startsWith("-")) {
                rootArgs.add(arg);
            } else if (i + 1 < args.length) {
                // Consume the next argument as this option's value.
                options.put(arg, args[++i]);
            } else {
                throw new IllegalArgumentException(
                        "Option '" + arg + "' requires a value");
            }
        }
    }
}

fnlp-core/src/main/java/org/fnlp/util/hash/MurmurHash.java

Lines changed: 18 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,7 @@
1919

2020
package org.fnlp.util.hash;
2121

22-
import java.io.UnsupportedEncodingException;
23-
24-
25-
/**
22+
/**
2623
*
2724
* murmur hash 2.0.
2825
*
@@ -40,12 +37,27 @@
4037
*
4138
*/
4239
public final class MurmurHash extends AbstractHashCode {
40+
41+
private static final long serialVersionUID = 4342869264396184799L;
4342

4443
// Public no-argument constructor; the hashing methods are instance methods.
4544
public MurmurHash() {}
4645

46+
protected byte[] toBytesWithoutEncoding(String str) {
47+
int len = str.length();
48+
int pos = 0;
49+
byte[] buf = new byte[len << 1];
50+
for (int i = 0; i < len; i++) {
51+
char c = str.charAt(i);
52+
buf[pos++] = (byte) (c & 0xFF);
53+
buf[pos++] = (byte) (c >> 8);
54+
}
55+
return buf;
56+
}
57+
4758
public int hashcode(String str) {
48-
return hash32(str);
59+
byte[] bytes = toBytesWithoutEncoding(str);
60+
return hash32(bytes, bytes.length);
4961
}
5062

5163
/**
@@ -106,31 +118,6 @@ public int hash32( final byte[] data, int length) {
106118
}
107119

108120

109-
/**
110-
* Generates 32 bit hash from a string.
111-
*
112-
* @param text string to hash
113-
* @return 32 bit hash of the given string
114-
*/
115-
public int hash32( final String text) {
116-
final byte[] bytes = text.getBytes();
117-
return hash32( bytes, bytes.length);
118-
}
119-
120-
121-
/**
122-
* Generates 32 bit hash from a substring.
123-
*
124-
* @param text string to hash
125-
* @param from starting index
126-
* @param length length of the substring to hash
127-
* @return 32 bit hash of the given string
128-
*/
129-
public int hash32( final String text, int from, int length) {
130-
return hash32( text.substring( from, from+length));
131-
}
132-
133-
134121
/**
135122
* Generates 64 bit hash from byte array of the given length and seed.
136123
*
@@ -192,44 +179,4 @@ public long hash64( final byte[] data, int length) {
192179
return hash64( data, length, 0xe17a1465);
193180
}
194181

195-
196-
/**
197-
* Generates 64 bit hash from a string.
198-
*
199-
* @param text string to hash
200-
* @return 64 bit hash of the given string
201-
*/
202-
public long hash64( final String text) {
203-
204-
final byte[] bytes;
205-
try {
206-
bytes = text.getBytes("utf-8");
207-
208-
return hash32( bytes, bytes.length);
209-
} catch (UnsupportedEncodingException e) {
210-
throw new RuntimeException("utf-8 encoding should be available on system.", e);
211-
}
212-
}
213-
214-
215-
216-
/**
217-
* Generates 64 bit hash from a substring.
218-
*
219-
* @param text string to hash
220-
* @param from starting index
221-
* @param length length of the substring to hash
222-
* @return 64 bit hash of the given array
223-
*/
224-
public long hash64( final String text, int from, int length) {
225-
226-
final byte[] bytes;
227-
try {
228-
bytes = text.getBytes("utf-8");
229-
230-
return hash64( bytes, bytes.length);
231-
} catch (UnsupportedEncodingException e) {
232-
throw new RuntimeException("utf-8 encoding should be available on system.", e);
233-
}
234-
}
235-
}
182+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/**
2+
*
3+
*/
4+
package org.fnlp.util.hash;
5+
6+
import org.fnlp.nlp.cn.tag.CWSTagger;
7+
import org.fnlp.util.exception.LoadModelException;
8+
9+
/**
10+
* @author Xipeng Qiu E-mail: xpqiu@fudan.edu.cn
11+
* @version 创建时间:2015年1月8日 下午3:14:28
12+
*/
13+
public class MurmurHashTest {
14+
15+
private CWSTagger cws;
16+
17+
public MurmurHashTest() {
18+
try {
19+
long t = System.currentTimeMillis();
20+
cws = new CWSTagger("../models/seg.m");
21+
long elapsed = System.currentTimeMillis() - t;
22+
System.out.println("Fnlp loaded in " + elapsed + " ms.");
23+
} catch (LoadModelException e) {
24+
throw new RuntimeException("Failed to load fnlp", e);
25+
}
26+
}
27+
28+
29+
protected String[] getCws(String text) {
30+
return cws.tag2Array(text);
31+
}
32+
33+
protected void benchmark() {
34+
long t = System.currentTimeMillis();
35+
String input = "12月1日,长江经济带海关区域通关一体化改革实现流域全覆盖,南昌、武汉、长沙、成都、重庆、贵阳、昆明等7个海关加入改革。当天,流域12个关区的海关特殊监管区域也纳入区域通关一体化,长江全流域真正实现了“12关如1关”。这标志着京津冀、长江经济带、广东地区三大区域通关一体化改革全面实施";
36+
for (int i = 0; i < 5000; i++) {
37+
getCws(input);
38+
}
39+
long elapsed = System.currentTimeMillis() - t;
40+
System.out.println("Benchmarked " + elapsed + " ms.");
41+
}
42+
43+
public void run() {
44+
// warm up the code, and perform benchmark
45+
for (int k = 0; k < 10; k ++) {
46+
benchmark();
47+
}
48+
}
49+
50+
public static void main(String[] args) throws InterruptedException {
51+
new MurmurHashTest().run();
52+
}
53+
}
54+

0 commit comments

Comments
 (0)