@@ -19,14 +19,14 @@ public class WordToSentenceProcessorTest extends TestCase {
19
19
20
20
private static final Annotator ud = new TokenizerAnnotator (false , "en" );
21
21
private static final Annotator udNL = new TokenizerAnnotator (false , "en" , "invertible,tokenizeNLs=true" );
22
- private static final Annotator wsNL = new TokenizerAnnotator ( false ,
23
- PropertiesUtils .asProperties ("tokenize.whitespace" , "true" , "invertible" , "true" , "tokenizeNLs" , "true" ));
22
+ private static final Annotator wsNL =
23
+ new TokenizerAnnotator ( false , PropertiesUtils .asProperties ("tokenize.whitespace" , "true" , "invertible" , "true" , "tokenizeNLs" , "true" ));
24
24
25
25
private static final WordToSentenceProcessor <CoreLabel > wts = new WordToSentenceProcessor <>();
26
26
private static final WordToSentenceProcessor <CoreLabel > wtsNull =
27
- new WordToSentenceProcessor <>(true ); // treat input as one sentence
28
- private static final WordToSentenceProcessor <CoreLabel > cwts = new WordToSentenceProcessor <>(
29
- "[.。]|[!?!?]+" , WordToSentenceProcessor .NewlineIsSentenceBreak .TWO_CONSECUTIVE , false );
27
+ new WordToSentenceProcessor <>(true ); // treat input as one sentence
28
+ private static final WordToSentenceProcessor <CoreLabel > cwts =
29
+ new WordToSentenceProcessor <>( "[.。]|[!?!?]+" , WordToSentenceProcessor .NewlineIsSentenceBreak .TWO_CONSECUTIVE , false );
30
30
31
31
32
32
private static void checkResult (WordToSentenceProcessor <CoreLabel > wts ,
@@ -104,103 +104,103 @@ public void testMr() {
104
104
105
105
public void testNullSplitter () {
106
106
checkResult (wtsNull , "This should be one sentence. There is no split." ,
107
- "This should be one sentence. There is no split." );
107
+ "This should be one sentence. There is no split." );
108
108
}
109
109
110
110
public void testParagraphStrategies () {
111
111
final WordToSentenceProcessor <CoreLabel > wtsNever =
112
- new WordToSentenceProcessor <>(WordToSentenceProcessor .NewlineIsSentenceBreak .NEVER );
112
+ new WordToSentenceProcessor <>(WordToSentenceProcessor .NewlineIsSentenceBreak .NEVER );
113
113
final WordToSentenceProcessor <CoreLabel > wtsAlways =
114
- new WordToSentenceProcessor <>(WordToSentenceProcessor .NewlineIsSentenceBreak .ALWAYS );
114
+ new WordToSentenceProcessor <>(WordToSentenceProcessor .NewlineIsSentenceBreak .ALWAYS );
115
115
final WordToSentenceProcessor <CoreLabel > wtsTwo =
116
- new WordToSentenceProcessor <>(WordToSentenceProcessor .NewlineIsSentenceBreak .TWO_CONSECUTIVE );
116
+ new WordToSentenceProcessor <>(WordToSentenceProcessor .NewlineIsSentenceBreak .TWO_CONSECUTIVE );
117
117
118
118
String input1 = "Depending on the options,\n this could be all sorts of things,\n \n as I like chocolate. And cookies." ;
119
119
String input2 = "Depending on the options,\n this could be all sorts of things,\n as I like chocolate. And cookies." ;
120
120
checkResult (wtsNever , input1 ,
121
- "Depending on the options,\n this could be all sorts of things,\n \n as I like chocolate." ,
122
- "And cookies." );
121
+ "Depending on the options,\n this could be all sorts of things,\n \n as I like chocolate." ,
122
+ "And cookies." );
123
123
checkResult (wtsAlways , input1 ,
124
- "Depending on the options," ,
125
- "this could be all sorts of things," ,
126
- "as I like chocolate." ,
127
- "And cookies." );
124
+ "Depending on the options," ,
125
+ "this could be all sorts of things," ,
126
+ "as I like chocolate." ,
127
+ "And cookies." );
128
128
checkResult (wtsTwo , input1 ,
129
- "Depending on the options, this could be all sorts of things," ,
130
- "as I like chocolate." ,
131
- "And cookies." );
129
+ "Depending on the options, this could be all sorts of things," ,
130
+ "as I like chocolate." ,
131
+ "And cookies." );
132
132
checkResult (wtsNever , input2 ,
133
- "Depending on the options,\n this could be all sorts of things,\n as I like chocolate." ,
134
- "And cookies." );
133
+ "Depending on the options,\n this could be all sorts of things,\n as I like chocolate." ,
134
+ "And cookies." );
135
135
checkResult (wtsAlways , input2 ,
136
- "Depending on the options," ,
137
- "this could be all sorts of things," ,
138
- "as I like chocolate." ,
139
- "And cookies." );
136
+ "Depending on the options," ,
137
+ "this could be all sorts of things," ,
138
+ "as I like chocolate." ,
139
+ "And cookies." );
140
140
checkResult (wtsTwo , input2 ,
141
- "Depending on the options,\n this could be all sorts of things,\n as I like chocolate." ,
142
- "And cookies." );
141
+ "Depending on the options,\n this could be all sorts of things,\n as I like chocolate." ,
142
+ "And cookies." );
143
143
String input3 = "Specific descriptions are absent.\n \n ''Mossy Head Industrial Park'' it says." ;
144
144
checkResult (wtsTwo , input3 ,
145
- "Specific descriptions are absent." ,
146
- "''Mossy Head Industrial Park'' it says." );
145
+ "Specific descriptions are absent." ,
146
+ "''Mossy Head Industrial Park'' it says." );
147
147
}
148
148
149
149
public void testXmlElements () {
150
150
final WordToSentenceProcessor <CoreLabel > wtsXml =
151
- new WordToSentenceProcessor <>(null , null ,null ,
152
- Generics .newHashSet (Arrays .asList ("p" , "chapter" )),
153
- WordToSentenceProcessor .NewlineIsSentenceBreak .NEVER , null , null );
151
+ new WordToSentenceProcessor <>(null , null ,null ,
152
+ Generics .newHashSet (Arrays .asList ("p" , "chapter" )),
153
+ WordToSentenceProcessor .NewlineIsSentenceBreak .NEVER , null , null );
154
154
155
155
String input1 = "<chapter>Chapter 1</chapter><p>This is text. So is this.</p> <p>One without end</p><p>Another</p><p>And another</p>" ;
156
156
checkResult (wtsXml , input1 ,
157
- "Chapter 1" ,
158
- "This is text." ,
159
- "So is this." ,
160
- "One without end" ,
161
- "Another" ,
162
- "And another" );
157
+ "Chapter 1" ,
158
+ "This is text." ,
159
+ "So is this." ,
160
+ "One without end" ,
161
+ "Another" ,
162
+ "And another" );
163
163
}
164
164
165
165
public void testRegion () {
166
166
final WordToSentenceProcessor <CoreLabel > wtsRegion =
167
- new WordToSentenceProcessor <>(WordToSentenceProcessor .DEFAULT_BOUNDARY_REGEX ,
168
- WordToSentenceProcessor .DEFAULT_BOUNDARY_FOLLOWERS_REGEX ,
169
- WordToSentenceProcessor .DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD ,
170
- Generics .newHashSet (Collections .singletonList ("p" )),
171
- "chapter|preface" , WordToSentenceProcessor .NewlineIsSentenceBreak .NEVER , null , null , false , false );
167
+ new WordToSentenceProcessor <>(WordToSentenceProcessor .DEFAULT_BOUNDARY_REGEX ,
168
+ WordToSentenceProcessor .DEFAULT_BOUNDARY_FOLLOWERS_REGEX ,
169
+ WordToSentenceProcessor .DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD ,
170
+ Generics .newHashSet (Collections .singletonList ("p" )),
171
+ "chapter|preface" , WordToSentenceProcessor .NewlineIsSentenceBreak .NEVER , null , null , false , false );
172
172
String input1 = "<title>Chris rules!</title><preface><p>Para one</p><p>Para two</p></preface>" +
173
- "<chapter><p>Text we like. Two sentences \n \n in it.</p></chapter><coda>Some more text here</coda>" ;
173
+ "<chapter><p>Text we like. Two sentences \n \n in it.</p></chapter><coda>Some more text here</coda>" ;
174
174
checkResult (wtsRegion , input1 ,
175
- "Para one" ,
176
- "Para two" ,
177
- "Text we like." ,
178
- "Two sentences in it." );
175
+ "Para one" ,
176
+ "Para two" ,
177
+ "Text we like." ,
178
+ "Two sentences in it." );
179
179
180
180
}
181
181
182
182
public void testBlankLines () {
183
183
final WordToSentenceProcessor <CoreLabel > wtsLines =
184
- new WordToSentenceProcessor <>(Generics .newHashSet (WordToSentenceProcessor .DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD ));
184
+ new WordToSentenceProcessor <>(Generics .newHashSet (WordToSentenceProcessor .DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD ));
185
185
String input1 = "Depending on the options,\n this could be all sorts of things,\n \n as I like chocolate. And cookies." ;
186
186
checkResult (wtsLines , input1 ,
187
- "Depending on the options," ,
188
- "this could be all sorts of things," ,
189
- "" ,
190
- "as I like chocolate. And cookies." );
187
+ "Depending on the options," ,
188
+ "this could be all sorts of things," ,
189
+ "" ,
190
+ "as I like chocolate. And cookies." );
191
191
String input2 = "Depending on the options,\n this could be all sorts of things,\n \n as I like chocolate. And cookies.\n " ;
192
192
checkResult (wtsLines , input2 ,
193
- "Depending on the options," ,
194
- "this could be all sorts of things," ,
195
- "" ,
196
- "as I like chocolate. And cookies." );
193
+ "Depending on the options," ,
194
+ "this could be all sorts of things," ,
195
+ "" ,
196
+ "as I like chocolate. And cookies." );
197
197
String input3 = "Depending on the options,\n this could be all sorts of things,\n \n as I like chocolate. And cookies.\n \n " ;
198
198
checkResult (wtsLines , input3 ,
199
- "Depending on the options," ,
200
- "this could be all sorts of things," ,
201
- "" ,
202
- "as I like chocolate. And cookies." ,
203
- "" );
199
+ "Depending on the options," ,
200
+ "this could be all sorts of things," ,
201
+ "" ,
202
+ "as I like chocolate. And cookies." ,
203
+ "" );
204
204
}
205
205
206
206
public void testExclamationPoint () {
@@ -225,10 +225,10 @@ public void testChinese() {
225
225
*/
226
226
public void testParagraphSeparator () {
227
227
checkResult (wts , "Hello\u2029 World." ,
228
- "Hello" , "World." );
228
+ "Hello" , "World." );
229
229
checkResult (wts , "Hello.\u2029 World." ,
230
- "Hello." , "World." );
230
+ "Hello." , "World." );
231
231
checkResult (wts , "Hello \u2029 World." ,
232
- "Hello" , "World." );
232
+ "Hello" , "World." );
233
233
}
234
234
}
0 commit comments