Skip to content

Commit a8c816e

Browse files
committed
Merge branch 'master' of https://github.com/xpqiu/fnlp.git
2 parents cb7d951 + 8224709 commit a8c816e

File tree

2 files changed

+43
-42
lines changed

2 files changed

+43
-42
lines changed

fnlp-app/src/main/java/org/fnlp/app/lucene/SentenceTokenizer.java

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
1-
/**
2-
* This file is part of FNLP (formerly FudanNLP).
3-
*
4-
* FNLP is free software: you can redistribute it and/or modify
5-
* it under the terms of the GNU Lesser General Public License as published by
6-
* the Free Software Foundation, either version 3 of the License, or
7-
* (at your option) any later version.
8-
*
9-
* FNLP is distributed in the hope that it will be useful,
10-
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11-
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12-
* GNU Lesser General Public License for more details.
13-
*
14-
* You should have received a copy of the GNU General Public License
15-
* along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
16-
*
17-
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
18-
*/
19-
1+
/**
2+
* This file is part of FNLP (formerly FudanNLP).
3+
*
4+
* FNLP is free software: you can redistribute it and/or modify
5+
* it under the terms of the GNU Lesser General Public License as published by
6+
* the Free Software Foundation, either version 3 of the License, or
7+
* (at your option) any later version.
8+
*
9+
* FNLP is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
* GNU Lesser General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU General Public License
15+
* along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
16+
*
17+
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
18+
*/
19+
2020
package org.fnlp.app.lucene;
2121

2222
import java.io.IOException;
@@ -38,7 +38,7 @@ public final class SentenceTokenizer extends Tokenizer {
3838
/**
3939
* Space-like characters that need to be skipped, such as space, tab, newline, and carriage return.
4040
*/
41-
public static final String SPACES = "  \t\r\n";
41+
public static final String SPACES = " \t\r\n";
4242

4343
private final StringBuilder buffer = new StringBuilder();
4444

fnlp-app/src/main/java/org/fnlp/app/lucene/WordTokenFilter.java

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
1-
/**
2-
* This file is part of FNLP (formerly FudanNLP).
3-
*
4-
* FNLP is free software: you can redistribute it and/or modify
5-
* it under the terms of the GNU Lesser General Public License as published by
6-
* the Free Software Foundation, either version 3 of the License, or
7-
* (at your option) any later version.
8-
*
9-
* FNLP is distributed in the hope that it will be useful,
10-
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11-
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12-
* GNU Lesser General Public License for more details.
13-
*
14-
* You should have received a copy of the GNU General Public License
15-
* along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
16-
*
17-
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
18-
*/
19-
1+
/**
2+
* This file is part of FNLP (formerly FudanNLP).
3+
*
4+
* FNLP is free software: you can redistribute it and/or modify
5+
* it under the terms of the GNU Lesser General Public License as published by
6+
* the Free Software Foundation, either version 3 of the License, or
7+
* (at your option) any later version.
8+
*
9+
* FNLP is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
* GNU Lesser General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU General Public License
15+
* along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
16+
*
17+
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
18+
*/
19+
2020
package org.fnlp.app.lucene;
2121

2222
import java.io.IOException;
@@ -81,8 +81,9 @@ public boolean incrementToken() throws IOException {
8181
posBuffer = Arrays.asList(p);
8282
tokenIter = tokenBuffer.iterator();
8383
posIter = posBuffer.iterator();
84-
idx = 0;
85-
/*
84+
// idx = 0;
85+
idx = tokStart;
86+
/*
8687
* it should not be possible to have a sentence with 0 words, check just in case.
8788
* returning EOS isn't the best either, but it's the behavior of the original code.
8889
*/
@@ -103,7 +104,7 @@ public boolean incrementToken() throws IOException {
103104
if (hasIllegalOffsets) {
104105
offsetAtt.setOffset(tokStart, tokEnd);
105106
} else {
106-
offsetAtt.setOffset(idx, end-1);
107+
offsetAtt.setOffset(idx, end);
107108
}
108109
idx = end;
109110
typeAtt.setType("word");

0 commit comments

Comments
 (0)