Skip to content

Commit 6623f35

Browse files
committed
维护
1 parent 44714d2 commit 6623f35

File tree

11 files changed

+106
-18
lines changed

11 files changed

+106
-18
lines changed

fnlp-core/src/main/java/org/fnlp/ml/types/sv/HashSparseVector.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ public float dotProduct(HashSparseVector sv) {
207207
}
208208

209209
/* (non-Javadoc)
210-
* @see edu.fudan.ml.types.ISparseVector#dotProduct(float[])
210+
* @see org.fnlp.ml.types.sv.ISparseVector#dotProduct(float[])
211211
*/
212212
@Override
213213
public float dotProduct(float[] vector) {
@@ -221,7 +221,7 @@ public float dotProduct(float[] vector) {
221221
}
222222

223223
/* (non-Javadoc)
224-
* @see edu.fudan.ml.types.ISparseVector#l2Norm2()
224+
* @see org.fnlp.ml.types.sv.ISparseVector#l2Norm2()
225225
*/
226226
public float l2Norm2() {
227227
TIntFloatIterator it = data.iterator();

fnlp-core/src/main/java/org/fnlp/ml/types/sv/ISparseVector.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,13 @@
1919

2020
package org.fnlp.ml.types.sv;
2121

22-
import java.io.Serializable;
22+
import java.io.Serializable;
23+
2324

25+
/**
26+
* 稀疏向量,并实现各种向量运算
27+
*
28+
*/
2429
public interface ISparseVector extends Serializable {
2530

2631
/**

fnlp-core/src/main/java/org/fnlp/nlp/corpus/Tags.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,9 @@ public static String genSequence4Tags(String[] wordArray){
6767
String word = wordArray[i];
6868
for(int j=0; j<word.length(); j++) {
6969
char c = word.charAt(j);
70-
if(Chars.getType(c)==Chars.CharType.B)
71-
System.err.println("包含空格");
70+
if(Chars.getType(c)==Chars.CharType.B){
71+
System.err.println(word + " :包含空格(将序列标签转为BMES)");
72+
}
7273
sb.append(c);
7374
sb.append('\t');
7475
if(j == 0) {

fnlp-core/src/main/java/org/fnlp/nlp/corpus/fnlp/FNLPCorpus.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -367,9 +367,20 @@ public void readOurCorpus(String path, String suffix, String charset) throws IOE
367367
}
368368
}
369369

370-
public int getDocumenSize() {
370+
public int getDocumenNum() {
371371

372372
return docs.size();
373+
}
374+
375+
376+
public int getSentenceNum() {
377+
Iterator<FNLPDoc> it1 = docs.iterator();
378+
int n=0;
379+
while(it1.hasNext()){
380+
FNLPDoc doc = it1.next();
381+
n += doc.sentences.size();
382+
}
383+
return n;
373384
}
374385

375386
public FNLPDoc getDoc(int idx) {
@@ -401,6 +412,8 @@ public TreeSet<String> getAllPOS() {
401412
}
402413
return set;
403414

404-
}
415+
}
416+
417+
405418

406419
}

fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/reader/FNLPReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ public class FNLPReader extends Reader {
4848
public FNLPReader(String filepath) throws IOException {
4949
corpus = new FNLPCorpus();
5050
corpus.read(filepath, null);
51-
size = corpus.getDocumenSize();
51+
size = corpus.getDocumenNum();
5252
curDocNo = 0;
5353
curSentNo = 0;
5454

fnlp-core/src/main/java/org/fnlp/nlp/pipe/NGram.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,13 @@ private ArrayList<String> ngram(String[] strs, int[] grams) {
8484
}
8585
return list;
8686
}
87-
87+
88+
/**
89+
* 提取ngram
90+
* @param tokens
91+
* @param gramSizes2
92+
* @return
93+
*/
8894
private ArrayList<String> ngram(List tokens, int[] gramSizes2) {
8995
ArrayList<String> list = new ArrayList<String>();
9096
StringBuffer buf = new StringBuffer();
@@ -131,7 +137,13 @@ public static Set<String> ngramSet(String data,int[] gramSizes) {
131137
ngram(data, gramSizes, list);
132138
return list;
133139
}
134-
140+
141+
/**
142+
* 提取ngram
143+
* @param data
144+
* @param gramSizes
145+
* @param list
146+
*/
135147
private static void ngram(String data, int[] gramSizes, Collection<String> list) {
136148
for (int j = 0; j < gramSizes.length; j++) {
137149
int len = gramSizes[j];

fnlp-demo/src/main/java/org/fnlp/demo/nlp/TextClassification.java renamed to fnlp-demo/src/main/java/org/fnlp/demo/nlp/tc/TextClassificationCustom.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
1818
*/
1919

20-
package org.fnlp.demo.nlp;
20+
package org.fnlp.demo.nlp.tc;
2121

2222
import java.io.File;
2323

@@ -36,12 +36,13 @@
3636
import org.fnlp.nlp.pipe.Target2Label;
3737

3838
/**
39-
* 文本分类示例
39+
* 自定义流程的文本分类示例
40+
* 不使用封装好的org.fnlp.app.tc.TextClassifier类
4041
* @author xpqiu
4142
*
4243
*/
4344

44-
public class TextClassification {
45+
public class TextClassificationCustom {
4546

4647
/**
4748
* 训练数据路径
@@ -122,6 +123,8 @@ public static void main(String[] args) throws Exception {
122123
else
123124
System.err.println(gold_label+"->"+pred_label+" : "+testset.getInstance(i).getSource());
124125
}
126+
127+
125128
/**
126129
* 分类器使用
127130
*/

fnlp-demo/src/main/java/org/fnlp/demo/nlp/TextClassificationSimple.java renamed to fnlp-demo/src/main/java/org/fnlp/demo/nlp/tc/TextClassificationSimple.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
1818
*/
1919

20-
package org.fnlp.demo.nlp;
20+
package org.fnlp.demo.nlp.tc;
2121

2222
import java.io.File;
2323

fnlp-train/src/main/java/org/fnlp/train/prepare/Corpus.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package org.fnlp.train.prepare;
2+
3+
import java.util.LinkedList;
4+
import java.util.List;
5+
6+
import org.fnlp.nlp.corpus.fnlp.FNLPCorpus;
7+
import org.fnlp.nlp.corpus.fnlp.FNLPDoc;
8+
import org.fnlp.nlp.corpus.fnlp.FNLPSent;
9+
10+
public class Corpus {
11+
12+
public static void main(String[] args) throws Exception {
13+
14+
String datapath = "../data";
15+
FNLPCorpus corpus = new FNLPCorpus();
16+
corpus.read(datapath + "/FNLPDATA/WeiboFTB(v1.0).dat", null);
17+
18+
System.out.println(corpus.getDocumenNum());
19+
System.out.println(corpus.getSentenceNum());
20+
System.out.println(corpus.getAllPOS());
21+
22+
FNLPDoc doc = corpus.docs.get(0);
23+
List<FNLPSent> train = doc.sentences.subList(0, 3000);
24+
List<FNLPSent> test = doc.sentences.subList(3000,doc.sentences.size());
25+
26+
doc.sentences = new LinkedList<FNLPSent>();
27+
doc.sentences.addAll(train);
28+
corpus.writeOne(datapath + "/FNLPDATA/WeiboFTB(v1.0)-train.dat");
29+
System.out.println(corpus.getSentenceNum());
30+
System.out.println(corpus.getAllPOS().size());
31+
32+
33+
doc.sentences = new LinkedList<FNLPSent>();
34+
doc.sentences.addAll(test);
35+
corpus.writeOne(datapath + "/FNLPDATA/WeiboFTB(v1.0)-test.dat");
36+
System.out.println(corpus.getSentenceNum());
37+
System.out.println(corpus.getAllPOS().size());
38+
39+
40+
41+
}
42+
43+
}

fnlp-train/src/main/java/org/fnlp/train/prepare/PrepareSeg.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,13 @@ public static void main(String[] args) throws Exception {
6161
//读分词+词性文件
6262
corpus.readPOS(datapath + "/FNLPDATA/pos",".txt","UTF8");
6363
//读FNLP数据
64-
corpus.read(datapath + "/FNLPDATA/ctb7.dat", null);
64+
corpus.read(datapath + "/FNLPDATA/ctb7.dat", null);
65+
corpus.read(datapath + "/FNLPDATA/WeiboFTB(v1.0)-train.dat", null);
6566

6667

6768

6869
FNLP2BMES.w2BMES(corpus,segfile);
69-
FNLP2BMES.w2BMES(corpus,segfile_w);
70+
//FNLP2BMES.w2BMES(corpus,segfile_w); //?
7071

7172

7273
//词典转BMES
@@ -107,13 +108,23 @@ public static void main(String[] args) throws Exception {
107108
dictfile = datapath + "/FNLPDATA/all.seg";
108109
String dicfile = datapath + "/FNLPDATA/all.dict";
109110
DICT.BMES2DICT(dictfile,dicfile);
111+
112+
//处理测试数据
113+
FNLPCorpus corpust = new FNLPCorpus();
114+
//读自有数据
115+
corpust.read(datapath + "/FNLPDATA/WeiboFTB(v1.0)-test.dat", null);
116+
String testfile = datapath + "/FNLPDATA/test.seg";
117+
FNLP2BMES.w2BMES(corpust,testfile);
110118

111119

112120
System.out.println(new Date().toString());
113121
System.out.println("Done!");
114122

115123
String param = "-iter 100 -c 0.01 ../data/template-seg ../data/FNLPDATA/all.seg ../models/seg.m";
116124
CWSTrain.main(param.split(" +"));
125+
126+
param = "../models/seg.m ../data/FNLPDATA/test.seg";
127+
CWSTrain.main(param.split(" +"));
117128

118129

119130
}

0 commit comments

Comments
 (0)