Skip to content

Commit 19d8b0a

Browse files
authored
Merge pull request #3 from manzurola/0.2.0
0.2.0
2 parents 4ba6c1f + 56b0c7b commit 19d8b0a

21 files changed

+572
-431
lines changed

README.md

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ Add this to the dependencies section of your `pom.xml`:
2727
<dependency>
2828
<groupId>com.github.manzurola</groupId>
2929
<artifactId>errgent</artifactId>
30-
<version>0.1.0</version>
30+
<version>0.2.0</version>
3131
</dependency>
3232
```
3333

@@ -36,23 +36,31 @@ Add this to the dependencies section of your `pom.xml`:
3636
To use Errgent in code, follow these steps:
3737

3838
```java
39-
// Get a spaCy instance
40-
SpaCy spacy = SpaCy.create(CoreNLPAdapter.create());
41-
42-
// Create an English error annotator
43-
Annotator annotator = Errant.newAnnotator("en", spacy);
44-
45-
// Create an English error generator
46-
Generator generator = Errgent.newGenerator("en", annotator);
47-
48-
// parse the doc (a utilty method)
49-
Doc target = generator.parse("My friends like to have fun.");
50-
51-
// Generate all documents that contain the specified error
52-
// (will contain "My friends like to has fun.")
53-
List<Doc> inflections = generator.generate(target, REPLACEMENT_SUBJECT_VERB_AGREEMENT);
54-
for (Doc inflection : inflections) {
55-
System.out.println(inflection.text());
39+
// Create a spacy instance (from spaCy4j)
40+
SpaCy spacy = SpaCy.create(CoreNLPAdapter.forEnglish());
41+
42+
// Instantiate a new Errgent for English
43+
Generator errgent = Errgent.forEnglish(spacy);
44+
45+
// Generate a specific grammatical error in the target doc. Since a
46+
// sentence can contain multiple errors at once, all such possible
47+
// errors are returned.
48+
List<GeneratedError> generatedErrors = errgent.generateErrors(
49+
"If I were you, I would go home.",
50+
GrammaticalError.REPLACEMENT_SUBJECT_VERB_AGREEMENT
51+
);
52+
53+
// Print out the results. The markedText() method retrieves the
54+
// erroneous text with the error marked by an asterisk on both sides.
55+
// We can also access the char offsets of the error using charStart
56+
// and charEnd methods of GeneratedError.
57+
for (GeneratedError generatedError : generatedErrors) {
58+
String text = generatedError.markedText();
59+
System.out.printf(
60+
"%s, %s%n",
61+
text,
62+
generatedError.error()
63+
);
5664
}
5765
```
5866

pom.xml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,23 @@
66

77
<groupId>com.github.manzurola</groupId>
88
<artifactId>errgent</artifactId>
9-
<version>0.1.0</version>
9+
<version>0.2.0</version>
1010

1111
<properties>
1212
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
1313
<maven.compiler.source>11</maven.compiler.source>
1414
<maven.compiler.target>11</maven.compiler.target>
1515
<junit-jupiter.version>5.5.2</junit-jupiter.version>
1616
<logback.version>1.2.0</logback.version>
17-
<errant4j.version>0.3.0</errant4j.version>
18-
<spacy4j.version>0.2.0</spacy4j.version>
17+
<errant4j.version>0.4.0</errant4j.version>
18+
<spacy4j.version>0.3.0</spacy4j.version>
1919
<simplenlg.version>4.5.0</simplenlg.version>
2020
</properties>
2121

2222
<distributionManagement>
2323
<repository>
2424
<id>github</id>
25-
<name>errant4j</name>
25+
<name>errgent</name>
2626
<url>https://maven.pkg.github.com/manzurola/errgent</url>
2727
</repository>
2828
</distributionManagement>
Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,16 @@
11
package com.github.manzurola.errgent.core;
22

33
import com.github.manzurola.errant4j.core.Annotator;
4-
import com.github.manzurola.errgent.lang.en.inflector.EnInflector;
4+
import com.github.manzurola.errant4j.core.Errant;
5+
import com.github.manzurola.errgent.lang.en.EnInflector;
6+
import com.github.manzurola.spacy4j.api.SpaCy;
57

6-
import java.util.Map;
7-
import java.util.function.Function;
8+
public interface Errgent {
89

9-
public final class Errgent {
10-
11-
private static final Map<String, Function<Annotator, Generator>> generators;
12-
13-
static {
14-
generators = Map.of(
15-
"en", annotator -> new GeneratorImpl(annotator, new EnInflector())
10+
static Generator forEnglish(SpaCy spaCy) {
11+
Annotator annotator = Errant.forEnglish(spaCy);
12+
return new GeneratorImpl(
13+
new EnInflector(), annotator
1614
);
1715
}
18-
19-
private Errgent() {
20-
}
21-
22-
public static Generator newGenerator(String language, Annotator annotator) {
23-
if (generators.containsKey(language)) {
24-
return generators.get(language).apply(annotator);
25-
} else {
26-
throw new IllegalArgumentException(String.format("Unsupported Errgent language %s", language));
27-
}
28-
}
2916
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
package com.github.manzurola.errgent.core;
2+
3+
import com.github.manzurola.errant4j.core.errors.GrammaticalError;
4+
5+
import java.util.Objects;
6+
import java.util.function.Function;
7+
8+
public final class GeneratedError {
9+
10+
private final String text;
11+
private final int charStart;
12+
private final int charEnd;
13+
private final GrammaticalError grammaticalError;
14+
15+
public GeneratedError(
16+
String text,
17+
int charStart,
18+
int charEnd,
19+
GrammaticalError grammaticalError
20+
) {
21+
this.text = Objects.requireNonNull(text);
22+
Objects.checkFromToIndex(charStart, charEnd, text.length());
23+
this.charStart = charStart;
24+
this.charEnd = charEnd;
25+
this.grammaticalError = Objects.requireNonNull(grammaticalError);
26+
}
27+
28+
public final String text() {
29+
return text;
30+
}
31+
32+
public final int charStart() {
33+
return charStart;
34+
}
35+
36+
public final int charEnd() {
37+
return charEnd;
38+
}
39+
40+
public final GrammaticalError error() {
41+
return grammaticalError;
42+
}
43+
44+
public final String markedText() {
45+
return markedText(s -> "*" + s + "*");
46+
}
47+
48+
public final String markedText(Function<String, String> errorDecorator) {
49+
String decoratedMistake = errorDecorator.apply(errorSpan());
50+
return new StringBuilder(text)
51+
.replace(charStart, charEnd, decoratedMistake)
52+
.toString();
53+
}
54+
55+
/**
56+
* Get the text span that is marked as error. A utility method that replaces
57+
* text.substring(charStart, charEnd).
58+
*/
59+
public final String errorSpan() {
60+
return text.substring(charStart, charEnd);
61+
}
62+
63+
@Override
64+
public final boolean equals(Object o) {
65+
if (this == o) {
66+
return true;
67+
}
68+
if (o == null || getClass() != o.getClass()) {
69+
return false;
70+
}
71+
GeneratedError that = (GeneratedError) o;
72+
return charStart == that.charStart &&
73+
charEnd == that.charEnd &&
74+
text.equals(that.text) &&
75+
grammaticalError == that.grammaticalError;
76+
}
77+
78+
@Override
79+
public final int hashCode() {
80+
return Objects.hash(charStart, charEnd, text, grammaticalError);
81+
}
82+
83+
@Override
84+
public final String toString() {
85+
return "[" +
86+
text +
87+
", " +
88+
error() +
89+
" at " +
90+
charStart +
91+
", " +
92+
charEnd +
93+
" ]";
94+
}
95+
96+
}
Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package com.github.manzurola.errgent.core;
22

3-
import com.github.manzurola.errant4j.core.GrammaticalError;
3+
import com.github.manzurola.errant4j.core.errors.GrammaticalError;
44
import com.github.manzurola.spacy4j.api.containers.Doc;
55

66
import java.util.List;
@@ -11,20 +11,30 @@
1111
public interface Generator {
1212

1313
/**
14-
* Utility method to apply NLP to a text using the underlying spacy instance.
14+
* Generate inflected docs with the specified grammatical error. A utility
15+
* method that filters results from {@link Generator#generateErrors(String)}
16+
* that contain the specified error.
1517
*
16-
* @param text the text to parse
17-
* @return a parsed Doc object
18+
* @param sourceText the source text from which grammatically incorrect variants
19+
* will be produced.
20+
* @return a list of {@link GeneratedError} objects containing the specified
21+
* grammatical error. Returns an empty list if no matching errors could be
22+
* produced.
1823
*/
19-
Doc parse(String text);
24+
List<GeneratedError> generateErrors(
25+
String sourceText,
26+
GrammaticalError error
27+
);
2028

2129
/**
22-
* Generate inflected docs with the specified grammatical error.
30+
* Generate inflected docs with all possible grammatical errors.
2331
*
24-
* @param target the target from which grammatically incorrect variances will be produced.
25-
* @return a list of {@link Doc} objects containing the specified grammatical error. Returns an empty list if no
26-
* matching errors could be produced.
32+
* @param sourceText the source text from which grammatically incorrect variants
33+
* will be produced.
34+
* @return a list of {@link GeneratedError} objects, each containing a
35+
* grammatical error. Returns an empty list if no matching errors could be
36+
* produced.
2737
*/
28-
List<Doc> generate(Doc target, GrammaticalError error);
38+
List<GeneratedError> generateErrors(String sourceText);
2939

3040
}
Lines changed: 61 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,88 @@
11
package com.github.manzurola.errgent.core;
22

3+
import com.github.manzurola.errant4j.core.Annotation;
34
import com.github.manzurola.errant4j.core.Annotator;
4-
import com.github.manzurola.errant4j.core.GrammaticalError;
5-
import com.github.manzurola.errgent.core.inflect.Inflection;
6-
import com.github.manzurola.errgent.core.inflect.InflectionFactory;
7-
import com.github.manzurola.errgent.core.inflect.Inflector;
5+
import com.github.manzurola.errant4j.core.errors.GrammaticalError;
6+
import com.github.manzurola.errgent.core.inflection.Inflector;
87
import com.github.manzurola.spacy4j.api.containers.Doc;
8+
import com.github.manzurola.spacy4j.api.containers.Span;
99
import org.slf4j.Logger;
1010
import org.slf4j.LoggerFactory;
1111

1212
import java.util.List;
13-
import java.util.function.Predicate;
13+
import java.util.Optional;
1414
import java.util.stream.Collectors;
1515

1616
public final class GeneratorImpl implements Generator {
1717
private final Logger logger = LoggerFactory.getLogger(this.getClass());
18-
private final Annotator annotator;
1918
private final Inflector inflector;
19+
private final Annotator annotator;
2020

21-
public GeneratorImpl(Annotator annotator, Inflector inflector) {
21+
public GeneratorImpl(Inflector inflector, Annotator annotator) {
2222
this.annotator = annotator;
2323
this.inflector = inflector;
24+
logger.info("Loaded grammatical error generator");
2425
}
2526

2627
@Override
27-
public final Doc parse(String text) {
28-
return annotator.parse(text);
28+
public List<GeneratedError> generateErrors(
29+
String sourceText, GrammaticalError error
30+
) {
31+
return generateErrors(sourceText)
32+
.stream()
33+
.filter(generatedError -> error.equals(generatedError.error()))
34+
.collect(Collectors.toList());
2935
}
3036

3137
@Override
32-
public List<Doc> generate(Doc target, GrammaticalError error) {
33-
InflectionFactory inflectionFactory = new InflectionFactory(annotator, target);
34-
return target
35-
.stream()
36-
.parallel()
37-
.flatMap(token -> inflector.inflect(token, inflectionFactory))
38-
.filter(filter(List.of(error)))
39-
.map(Inflection::doc)
40-
.collect(Collectors.toList());
38+
public final List<GeneratedError> generateErrors(String sourceText) {
39+
final Doc sourceDoc = annotator.parse(sourceText);
40+
return sourceDoc
41+
.tokens()
42+
.stream()
43+
.flatMap(inflector::inflectToken)
44+
.distinct()
45+
.parallel()
46+
.map(inflection -> inflection.applyTo(sourceDoc.text(), annotator))
47+
.map(inflectedDoc -> annotateSingleError(inflectedDoc, sourceDoc))
48+
.filter(Optional::isPresent)
49+
.map(Optional::get)
50+
.collect(Collectors.toList());
51+
}
52+
53+
private Optional<GeneratedError> annotateSingleError(
54+
final Doc inflectedDoc,
55+
final Doc originalDoc
56+
) {
57+
List<Annotation> annotations = annotator
58+
.annotate(
59+
inflectedDoc.tokens(),
60+
originalDoc.tokens()
61+
)
62+
.stream()
63+
.filter(annotation -> !annotation.error().isNone())
64+
.collect(Collectors.toList());
65+
66+
if (annotations.size() > 1) {
67+
return Optional.empty();
68+
}
69+
70+
return Optional.of(markError(annotations.get(0), inflectedDoc));
4171
}
4272

43-
private Predicate<Inflection> filter(List<GrammaticalError> errors) {
44-
return inflection -> inflection.errors()
45-
.stream()
46-
.anyMatch(annotation -> errors.contains(annotation.grammaticalError()));
73+
private GeneratedError markError(Annotation annotation, Doc inflectedDoc) {
74+
String generatedText = inflectedDoc.text();
75+
Span span = inflectedDoc.spanOf(
76+
annotation.sourcePosition(),
77+
annotation.sourcePosition() +
78+
annotation.sourceTokens().size()
79+
);
80+
int charStart = span.startChar();
81+
int charEnd = span.endChar();
82+
GrammaticalError grammaticalError = annotation.error();
83+
return new GeneratedError(
84+
generatedText, charStart, charEnd, grammaticalError
85+
);
4786
}
4887

4988
}

0 commit comments

Comments
 (0)