Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion data/edu/stanford/nlp/process/ptblexer.gold
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,8 @@ by
%
.
<p>
C'mon
C'm
on
Fred
,
you
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ public class CoreMapExpressionExtractor<T extends MatchedExpression> {
private boolean keepTags = false;
/* Collapses extraction rules - use with care */
private final boolean collapseExtractionRules;
private final boolean keepNestedMatches;
private final boolean keepOverlappingMatches;
private final Class<CoreAnnotation<List<? extends CoreMap>>> tokensAnnotationKey;
private final Map<Integer, Stage<T>> stages;

Expand Down Expand Up @@ -136,11 +138,16 @@ public CoreMapExpressionExtractor(Env env) {
this.tokensAnnotationKey = EnvLookup.getDefaultTokensAnnotationKey(env);
if (env != null) {
this.collapseExtractionRules = Objects.equals((Boolean) env.get("collapseExtractionRules"), true);
this.keepNestedMatches = Objects.equals((Boolean) env.get("keepNestedMatches"), true);
this.keepOverlappingMatches = Objects.equals((Boolean) env.get("keepOverlappingMatches"), true);

if (env.get("verbose") != null)
verbose = (env.get("verbose") != null) &&
Objects.equals((Boolean) env.get("verbose"), true);
} else {
this.collapseExtractionRules = false;
this.keepNestedMatches = false;
this.keepOverlappingMatches = false;
}
}

Expand Down Expand Up @@ -433,13 +440,13 @@ private Pair<List<? extends CoreMap>, List<T>> applyCompositeRule(
annotateExpressions(merged, newExprs);
newExprs = MatchedExpression.removeNullValues(newExprs);
if ( ! newExprs.isEmpty()) {
newExprs = MatchedExpression.removeNested(newExprs);
newExprs = MatchedExpression.removeOverlapping(newExprs);
newExprs = this.keepNestedMatches ? newExprs : MatchedExpression.removeNested(newExprs);
newExprs = this.keepOverlappingMatches ? newExprs : MatchedExpression.removeOverlapping(newExprs);
merged = MatchedExpression.replaceMerged(merged, newExprs);
// Favor newly matched expressions over older ones
newExprs.addAll(matchedExpressions);
matchedExpressions = MatchedExpression.removeNested(newExprs);
matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
matchedExpressions = this.keepNestedMatches ? newExprs : MatchedExpression.removeNested(newExprs);
matchedExpressions = this.keepOverlappingMatches ? matchedExpressions : MatchedExpression.removeOverlapping(matchedExpressions);
} else {
extracted = false;
}
Expand Down Expand Up @@ -486,8 +493,8 @@ public List<T> extractExpressions(CoreMap annotation) {
}
annotateExpressions(annotation, matchedExpressions);
matchedExpressions = MatchedExpression.removeNullValues(matchedExpressions);
matchedExpressions = MatchedExpression.removeNested(matchedExpressions);
matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
matchedExpressions = this.keepNestedMatches ? matchedExpressions : MatchedExpression.removeNested(matchedExpressions);
matchedExpressions = this.keepOverlappingMatches ? matchedExpressions : MatchedExpression.removeOverlapping(matchedExpressions);
}

List<? extends CoreMap> merged = MatchedExpression.replaceMergedUsingTokenOffsets(annotation.get(tokensAnnotationKey), matchedExpressions);
Expand Down
4 changes: 4 additions & 0 deletions src/edu/stanford/nlp/ling/tokensregex/MatchedExpression.java
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ public String getText() {
return text;
}

public String getName() {
return this.extractFunc.name;
}

public CoreMap getAnnotation() {
return annotation;
}
Expand Down
10 changes: 7 additions & 3 deletions src/edu/stanford/nlp/parser/metrics/EvaluateTreebank.java
Original file line number Diff line number Diff line change
Expand Up @@ -681,8 +681,10 @@ public double testOnTreebank(List<Pair<ParserQuery, Tree>> testTreebank) {
}

public double testOnTreebank(EvaluationDataset testTreebank) {
log.info("Testing on treebank");
Timing treebankTotalTimer = new Timing();
if (summary || !op.testOptions.quietEvaluation) {
log.info("Testing on treebank");
}
Timing treebankTotalTimer = (summary || !op.testOptions.quietEvaluation) ? new Timing() : null;
TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
TreebankLangParserParams tlpParams = op.tlpParams;
TreebankLanguagePack tlp = op.langpack();
Expand Down Expand Up @@ -729,7 +731,9 @@ public double testOnTreebank(EvaluationDataset testTreebank) {
}

//Done parsing...print the results of the evaluations
treebankTotalTimer.done("Testing on treebank");
if (treebankTotalTimer != null) {
treebankTotalTimer.done("Testing on treebank");
}
if (op.testOptions.quietEvaluation) {
pwErr = tlpParams.pw(System.err);
}
Expand Down
4 changes: 2 additions & 2 deletions src/edu/stanford/nlp/process/PTBLexer.flex
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ SUBSUPNUM = [\u207A\u207B\u208A\u208B]?([\u2070\u00B9\u00B2\u00B3\u2074-\u2079]+
FRAC = ({DIGIT}{1,4}[- \u00A0])?{DIGIT}{1,4}(\\?\/|\u2044){DIGIT}{1,3}(,{DIGIT}{3}|{DIGIT})?
FRAC2 = [\u00BC\u00BD\u00BE\u2150-\u215E\u2189]
/* # is here for historical reasons -- old UK ASCII-equivalent used # for pound mark. Bit ugly now. Allow $$$ */
DOLSIGN = ([A-Z]*\$|#|\$\$\$)
DOLSIGN = ([A-Z]*\$|#|\$\$+)
/* Currency: These are cent, pound, currency, yen; CP1252 euro; ECU and many other currency simples including Euro;
armenian dram, afghani, bengali rupee, thai bhat; full-wdith dollar, cent pound, yen, won */
DOLSIGN2 = [\u00A2-\u00A5\u0080\u20A0-\u20BF\u058F\u060B\u09F2\u09F3\u0AF1\u0BF9\u0E3F\u17DB\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]
Expand Down Expand Up @@ -804,7 +804,7 @@ ASSIMILATIONS3 = cannot|'twas|dunno|['’]d['’]ve
/* Assimilations2 leave 2 chars behind after division */
/* "nno" is a remnant after pushing back from dunno in ASSIMILATIONS3 */
/* Include splitting some apostrophe-less negations, but not ones like "wont" that are also words. */
ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno|aint|dont|doesnt|didnt|theyre
ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno|aint|dont|doesnt|didnt|theyre|c{APOS}mon

/* CP1252: dagger, double dagger, per mille, bullet, small tilde, trademark */
CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
Expand Down
Loading