Skip to content

Commit 8f9c633

Browse files
committed
fix bug
1 parent ef50991 commit 8f9c633

File tree

7 files changed

+64
-73
lines changed

7 files changed

+64
-73
lines changed

fnlp-core/src/main/java/org/fnlp/data/reader/DocumentReader.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
* 不同类别的文件放在不同的子文件夹下
4141
* 类别:子文件夹名
4242
* 数据:文件内所有字符
43-
* package edu.fudan.ml.data
4443
*/
4544
public class DocumentReader extends Reader {
4645

@@ -87,7 +86,8 @@ private void nextDocument() {
8786
} catch (IOException e) {
8887
e.printStackTrace();
8988
}
90-
cur = new Instance(buff.toString(), f.getPath());
89+
String label = f.getParentFile().getName();
90+
cur = new Instance(buff.toString(), label);
9191
buff = null;
9292
}
9393
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/**
2+
* This file is part of FNLP (formerly FudanNLP).
3+
*
4+
* FNLP is free software: you can redistribute it and/or modify
5+
* it under the terms of the GNU Lesser General Public License as published by
6+
* the Free Software Foundation, either version 3 of the License, or
7+
* (at your option) any later version.
8+
*
9+
* FNLP is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
* GNU Lesser General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU Lesser General Public License
15+
* along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
16+
*
17+
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
18+
*/
19+
20+
/**
21+
* 数据读取包:处理不同类型格式的数据。
22+
*/
23+
package org.fnlp.data.reader;

fnlp-core/src/main/java/org/fnlp/data/reader/package.html

Lines changed: 0 additions & 55 deletions
This file was deleted.

fnlp-core/src/main/java/org/fnlp/ml/types/sv/BinarySparseVector.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@
2121

2222
import gnu.trove.iterator.TIntIterator;
2323
import gnu.trove.list.array.TIntArrayList;
24-
24+
/**
25+
* 0/1取值的稀疏向量
26+
* @author xpqiu
27+
*/
2528
public class BinarySparseVector implements ISparseVector {
2629

2730

@@ -42,7 +45,7 @@ public float dotProduct(float[] vector) {
4245
float sum = 0f;
4346
while(it.hasNext()){
4447
int i = it.next();
45-
if(i<0)
48+
if(i<0||i>=vector.length)
4649
continue;
4750
sum += vector[i];
4851
}

fnlp-core/src/main/java/org/fnlp/nlp/duplicate/FingerPrint.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ public static TreeSet<String> print(String s) {
5656
}
5757

5858
public static String ngram(String ss,int n) {
59-
List l = NGram.ngram(ss, new int[]{n});
59+
List l = NGram.ngramOnCharacter2List(ss, new int[]{n});
6060
StringBuilder sb = new StringBuilder();
6161
for(int i=0;i<l.size();i++){
6262
sb.append(l.get(i));
@@ -66,7 +66,7 @@ public static String ngram(String ss,int n) {
6666
}
6767

6868
public static Set<String> ngramSet(String ss,int n) {
69-
return NGram.ngramSet(ss, new int[]{n});
69+
return NGram.ngramOnCharacter2Set(ss, new int[]{n});
7070
}
7171

7272
public static String feature(String s, Type t) {

fnlp-core/src/main/java/org/fnlp/nlp/pipe/NGram.java

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import java.util.HashSet;
2727
import java.util.List;
2828
import java.util.Set;
29-
29+
3030
import org.fnlp.ml.types.Instance;
3131
import org.fnlp.util.exception.UnsupportedDataTypeException;
3232

@@ -40,7 +40,16 @@
4040
public class NGram extends Pipe{
4141

4242
private static final long serialVersionUID = -2329969202592736092L;
43-
int[] gramSizes = null;
43+
int[] gramSizes = null;
44+
/**
45+
* 用基于字而不是词的Ngram
46+
*/
47+
private boolean useCharacter = false;
48+
49+
public NGram(int[] sizes,boolean useCharacter) {
50+
this.gramSizes = sizes;
51+
this.useCharacter = useCharacter;
52+
}
4453

4554
public NGram(int[] sizes) {
4655
this.gramSizes = sizes;
@@ -50,8 +59,13 @@ public NGram(int[] sizes) {
5059
public void addThruPipe(Instance inst) throws UnsupportedDataTypeException {
5160
Object data = inst.getData();
5261
ArrayList<String> list = null;
53-
if (data instanceof String) {
54-
list = ngram((String) data,gramSizes);
62+
if (data instanceof String) {
63+
if(useCharacter)
64+
list = ngramOnCharacter2List((String) data,gramSizes);
65+
else{
66+
list = ngram((String) data,gramSizes);
67+
}
68+
5569
}else if (data instanceof List) {
5670
list = ngram((List) data,gramSizes);
5771
}else if(data instanceof String[]){
@@ -63,7 +77,12 @@ public void addThruPipe(Instance inst) throws UnsupportedDataTypeException {
6377
}
6478

6579

66-
/**
80+
private ArrayList<String> ngram(String str, int[] gramSizes) {
81+
String[] toks = str.split("\\s+");
82+
return ngram(toks,gramSizes);
83+
}
84+
85+
/**
6786
* 抽取ngram
6887
* @param strs
6988
* @param grams
@@ -123,10 +142,10 @@ private ArrayList<String> ngram(List tokens, int[] gramSizes2) {
123142
* @param gramSizes
124143
* @return ngram字符串数组
125144
*/
126-
public static ArrayList<String> ngram(String data,int[] gramSizes) {
145+
public static ArrayList<String> ngramOnCharacter2List(String data,int[] gramSizes) {
127146
// 提取ngram
128147
ArrayList<String> list = new ArrayList<String>();
129-
ngram(data, gramSizes, list);
148+
ngramOnCharacter(data, gramSizes, list);
130149
return list;
131150
}
132151

@@ -136,10 +155,10 @@ public static ArrayList<String> ngram(String data,int[] gramSizes) {
136155
* @param gramSizes
137156
* @return ngram字符串集合
138157
*/
139-
public static Set<String> ngramSet(String data,int[] gramSizes) {
158+
public static Set<String> ngramOnCharacter2Set(String data,int[] gramSizes) {
140159
// 提取ngram
141160
Set<String> list = new HashSet<String>();
142-
ngram(data, gramSizes, list);
161+
ngramOnCharacter(data, gramSizes, list);
143162
return list;
144163
}
145164

@@ -149,7 +168,7 @@ public static Set<String> ngramSet(String data,int[] gramSizes) {
149168
* @param gramSizes
150169
* @param list
151170
*/
152-
private static void ngram(String data, int[] gramSizes, Collection<String> list) {
171+
private static void ngramOnCharacter(String data, int[] gramSizes, Collection<String> list) {
153172
for (int j = 0; j < gramSizes.length; j++) {
154173
int len = gramSizes[j];
155174
if (len <= 0 || len > data.length())

fnlp-demo/src/main/java/org/fnlp/demo/nlp/tc/TextClassificationCustom1.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,18 @@ public static void main(String[] args) throws Exception {
6666
Pipe targetpp = new Target2Label(af.DefaultLabelAlphabet());
6767

6868
//建立pipe组合
69-
SeriesPipes pp = new SeriesPipes(new Pipe[]{ngrampp,targetpp,indexpp});
69+
SeriesPipes pp = new SeriesPipes(new Pipe[]{targetpp,ngrampp,indexpp});
7070

7171
InstanceSet trainset = new InstanceSet(pp,af);
72+
7273
InstanceSet testset = new InstanceSet(pp,af);
7374

7475
//用不同的Reader读取相应格式的文件
7576
Reader reader = new DocumentReader(trainDataPath);
7677

7778
//读入数据,并进行数据处理
7879
trainset.loadThruStagePipes(reader);
79-
80+
// af.setStopIncrement(true);
8081
reader = new DocumentReader(testDataPath);
8182

8283
testset.loadThruStagePipes(reader);

0 commit comments

Comments
 (0)