Skip to content

Commit 3cf716e

Browse files
aitelint and LionbridgeCS2 authored
[.NET] Japanese CJK configuration + DateExtractor refinements (#2575)
* Japanese CJK configuration + DateExtractor support
* Fixed named group in JavaScript
* Localized example comments in Korean config files
* Removed duplicate test cases

Co-authored-by: LionbridgeCS2 <v-Fabrizio.Sorba@lionbridge.com>
1 parent 9778199 commit 3cf716e

66 files changed

Lines changed: 2927 additions & 6726 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.NET/Microsoft.Recognizers.Definitions.Common/Chinese/DateTimeDefinitions.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ public static class DateTimeDefinitions
4141
public static readonly string DateThisRegex = $@"(这个|这一个|这|这一|本){WeekDayRegex}";
4242
public static readonly string DateLastRegex = $@"(上一个|上个|上一|上|最后一个|最后)(的)?{WeekDayRegex}";
4343
public static readonly string DateNextRegex = $@"(下一个|下个|下一|下)(的)?{WeekDayRegex}";
44+
public const string SpecialMonthRegex = @"^[.]";
45+
public const string SpecialYearRegex = @"^[.]";
4446
public const string SpecialDayRegex = @"(最近|前天|后天|昨天|明天|今天|今日|明日|昨日|大后天|大前天|後天|大後天)";
4547
public const string SpecialDayWithNumRegex = @"^[.]";
4648
public static readonly string WeekDayOfMonthRegex = $@"((({MonthRegex}|{MonthNumRegex})的\s*)(?<cardinal>第一个|第二个|第三个|第四个|第五个|最后一个)\s*{WeekDayRegex})";
@@ -75,6 +77,7 @@ public static class DateTimeDefinitions
7577
public static readonly string MonthSuffixRegex = $@"(?<msuf>({RelativeMonthRegex}|{MonthRegex}))";
7678
public static readonly string SimpleCasesRegex = $@"((从)\s*)?(({YearRegex}|{DatePeriodYearInCJKRegex})\s*)?{MonthSuffixRegex}({DatePeriodDayRegexInCJK}|{DayRegex})\s*{DatePeriodTillRegex}\s*({DatePeriodDayRegexInCJK}|{DayRegex})((\s+|\s*,\s*){YearRegex})?";
7779
public static readonly string YearAndMonth = $@"({DatePeriodYearInCJKRegex}|{YearRegex})\s*{MonthRegex}";
80+
public static readonly string SimpleYearAndMonth = $@"({YearNumRegex}[/\\\-]{MonthNumRegex}\b$)";
7881
public static readonly string PureNumYearAndMonth = $@"({YearRegexInNumber}\s*[-\.\/]\s*{MonthNumRegex})|({MonthNumRegex}\s*\/\s*{YearRegexInNumber})";
7982
public static readonly string OneWordPeriodRegex = $@"(((?<yearrel>(明|今|去)年)\s*)?{MonthRegex}|({DatePeriodThisRegex}|{DatePeriodLastRegex}|{DatePeriodNextRegex})(?<halfTag>半)?\s*(周末|周|月|年)|周末|(今|明|去|前|后)年(\s*{HalfYearRegex})?)";
8083
public static readonly string WeekOfMonthRegex = $@"(?<wom>{MonthSuffixRegex}的(?<cardinal>第一|第二|第三|第四|第五|最后一)\s*周\s*)";
@@ -86,6 +89,8 @@ public static class DateTimeDefinitions
8689
public static readonly string YearToYearSuffixRequired = $@"({DateRangePrepositions})({DatePeriodYearInCJKRegex}|{YearRegex})\s*({DatePeriodTillSuffixRequiredRegex})\s*({DatePeriodYearInCJKRegex}|{YearRegex})\s*(之间|之内|期间|中间|间)";
8790
public static readonly string MonthToMonth = $@"({DateRangePrepositions})({MonthRegex}){DatePeriodTillRegex}({MonthRegex})";
8891
public static readonly string MonthToMonthSuffixRequired = $@"({DateRangePrepositions})({MonthRegex}){DatePeriodTillSuffixRequiredRegex}({MonthRegex})\s*(之间|之内|期间|中间|间)";
92+
public const string DayToDay = @"^[.]";
93+
public const string DayRegexForPeriod = @"^[.]";
8994
public const string PastRegex = @"(?<past>(之前|前|上|近|过去))";
9095
public const string FutureRegex = @"(?<future>(之后|之後|后|後|(?<![一两几]\s*)下|未来(的)?))";
9196
public const string SeasonRegex = @"(?<season>春|夏|秋|冬)(天|季)?";

.NET/Microsoft.Recognizers.Definitions.Common/Japanese/DateTimeDefinitions.cs

Lines changed: 194 additions & 68 deletions
Large diffs are not rendered by default.

.NET/Microsoft.Recognizers.Definitions.Common/Japanese/NumbersDefinitions.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ public static class NumbersDefinitions
142142
public const string PointRegexStr = @"[\..・]";
143143
public static readonly string AllFloatRegex = $@"{NegativeNumberTermsRegex}?{AllIntRegex}\s*{PointRegexStr}\s*[一二三四五六七八九](\s*{ZeroToNineIntegerRegex})*";
144144
public static readonly string NumbersWithAllowListRegex = $@"(?<!(離は))({NegativeNumberTermsRegex}?({NotSingleRegex}|{SingleRegex})(?!({AllIntRegex}*([、.]{ZeroToNineIntegerRegex}+)*|{AllFloatRegex})*\s*{PercentageRegex}+))(?!(\s*{AllMultiplierLookupRegex}))";
145-
public static readonly string NumbersAggressiveRegex = $@"(({AllIntRegex})(?!({AllIntRegex}*([、.]{ZeroToNineIntegerRegex}+)*|{AllFloatRegex})*(\s*{PercentageRegex})?))";
145+
public static readonly string NumbersAggressiveRegex = $@"(({AllIntRegex})(?!({AllIntRegex}|([、.]{ZeroToNineIntegerRegex})|{AllFloatRegex}|\s*{PercentageRegex})))";
146146
public static readonly string PointRegex = $@"{PointRegexStr}";
147147
public static readonly string DoubleSpecialsChars = $@"((?<!({ZeroToNineFullHalfRegex}+[\..]{ZeroToNineFullHalfRegex}*))({NegativeNumberTermsRegexNum}\s*)?{ZeroToNineFullHalfRegex}+[\..,]{ZeroToNineFullHalfRegex}+(?!({ZeroToNineFullHalfRegex}*[\..,]{ZeroToNineFullHalfRegex}+)))(?=\b|\D)(?!\s*{AllMultiplierLookupRegex})";
148148
public static readonly string DoubleRoundNumberSpecialsChars = $@"(?<!(({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})+[\..・,]({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})*))(({NegativeNumberTermsRegexNum}|{NegativeNumberTermsRegex})\s*)?({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})+[\..・,]({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})+(?!({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})*[\..・,]({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})+)";

.NET/Microsoft.Recognizers.Definitions.Common/Korean/DateTimeDefinitions.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ public static class DateTimeDefinitions
4141
public static readonly string DateThisRegex = $@"(이번\s?주?)\s*{WeekDayRegex}";
4242
public static readonly string DateLastRegex = $@"((저번|지난)\s?주?)\s*{WeekDayRegex}";
4343
public static readonly string DateNextRegex = $@"(다음\s?주?)\s*{WeekDayRegex}";
44+
public const string SpecialMonthRegex = @"^[.]";
45+
public const string SpecialYearRegex = @"^[.]";
4446
public const string SpecialDayRegex = @"(최근|그저께|그제|((내일)?\s?모레)|그끄저께|어제|내일|오늘|금일|작일|익일|당일|명일|전일)";
4547
public const string SpecialDayWithNumRegex = @"^[.]";
4648
public static readonly string WeekDayOfMonthRegex = $@"((({MonthRegex}|{MonthNumRegex}(월|달))의?\s*)?(?<cardinal>첫\s?번?째|두\s?번째|둘째|세\s?번째|셋째|네\s?번째|넷째|다섯\s?번?째|다섯째|여섯\s?번?째|여섯째|마지막)\s*{WeekDayRegex})";
@@ -75,6 +77,7 @@ public static class DateTimeDefinitions
7577
public static readonly string MonthSuffixRegex = $@"(?<msuf>({RelativeMonthRegex}|{MonthRegex}))";
7678
public static readonly string SimpleCasesRegex = $@"((从)\s*)?(({YearRegex}|{DatePeriodYearInCJKRegex})\s*)?{MonthSuffixRegex}({DatePeriodDayRegexInCJK}|{DayRegex})\s*{DatePeriodTillRegex}\s*({DatePeriodDayRegexInCJK}|{DayRegex})((\s+|\s*,\s*){YearRegex})?";
7779
public static readonly string YearAndMonth = $@"({DatePeriodYearInCJKRegex}|{YearRegex})\s*{MonthRegex}";
80+
public static readonly string SimpleYearAndMonth = $@"({YearNumRegex}[/\\\-]{MonthNumRegex}\b$)";
7881
public static readonly string PureNumYearAndMonth = $@"({YearRegexInNumber}\s*[-\.\/]\s*{MonthNumRegex})|({MonthNumRegex}\s*\/\s*{YearRegexInNumber})";
7982
public static readonly string OneWordPeriodRegex = $@"(((?<yearrel>(明|今|去)年)\s*)?{MonthRegex}|({DatePeriodThisRegex}|{DatePeriodLastRegex}|{DatePeriodNextRegex})(?<halfTag>半)?\s*(周末|周|月|年)|周末|(今|明|去|前|后)年(\s*{HalfYearRegex})?)";
8083
public static readonly string WeekOfMonthRegex = $@"(?<wom>{MonthSuffixRegex}的(?<cardinal>첫\s?번?째|두번째|둘째|세번째|셋째|네번째|넷째|마지막)\s*주\s*)";
@@ -86,6 +89,8 @@ public static class DateTimeDefinitions
8689
public static readonly string YearToYearSuffixRequired = $@"({DateRangePrepositions})({DatePeriodYearInCJKRegex}|{YearRegex})\s*({DatePeriodTillSuffixRequiredRegex})\s*({DatePeriodYearInCJKRegex}|{YearRegex})\s*(之间|之内|期间|中间|间)";
8790
public static readonly string MonthToMonth = $@"({DateRangePrepositions})({MonthRegex}){DatePeriodTillRegex}({MonthRegex})";
8891
public static readonly string MonthToMonthSuffixRequired = $@"({DateRangePrepositions})({MonthRegex}){DatePeriodTillSuffixRequiredRegex}({MonthRegex})\s*(之间|之内|期间|中间|间)";
92+
public const string DayToDay = @"^[.]";
93+
public const string DayRegexForPeriod = @"^[.]";
8994
public const string PastRegex = @"(?<past>(之前|前|上|近|过去))";
9095
public const string FutureRegex = @"(?<future>(之后|之後|后|後|(?<![一两几]\s*)下|未来(的)?))";
9196
public const string SeasonRegex = @"(?<season>春|夏|秋|冬)(天|季)?";

.NET/Microsoft.Recognizers.Text.DataDrivenTests/TestHelpers.cs

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
using Microsoft.Recognizers.Text.DateTime.German;
1414
using Microsoft.Recognizers.Text.DateTime.Hindi;
1515
using Microsoft.Recognizers.Text.DateTime.Italian;
16+
using Microsoft.Recognizers.Text.DateTime.Japanese;
1617
using Microsoft.Recognizers.Text.DateTime.Korean;
1718
using Microsoft.Recognizers.Text.DateTime.Portuguese;
1819
using Microsoft.Recognizers.Text.DateTime.Spanish;
@@ -569,59 +570,58 @@ public static IDateTimeExtractor GetJapaneseExtractor(DateTimeExtractors extract
569570
switch (extractorName)
570571
{
571572
case DateTimeExtractors.Date:
572-
return new DateTime.Japanese.JapaneseDateExtractorConfiguration();
573+
return new BaseCJKDateExtractor(new JapaneseDateExtractorConfiguration(defaultConfig));
573574
case DateTimeExtractors.Time:
574-
return new DateTime.Japanese.JapaneseTimeExtractorConfiguration();
575+
return new BaseCJKTimeExtractor(new JapaneseTimeExtractorConfiguration(defaultConfig));
575576
case DateTimeExtractors.DatePeriod:
576-
return new DateTime.Japanese.JapaneseDatePeriodExtractorConfiguration();
577+
return new BaseCJKDatePeriodExtractor(new JapaneseDatePeriodExtractorConfiguration(defaultConfig));
577578
case DateTimeExtractors.TimePeriod:
578-
return new DateTime.Japanese.JapaneseTimePeriodExtractorConfiguration();
579+
return new BaseCJKTimePeriodExtractor(new JapaneseTimePeriodExtractorConfiguration(defaultConfig));
579580
case DateTimeExtractors.DateTime:
580-
return new DateTime.Japanese.JapaneseDateTimeExtractorConfiguration();
581+
return new BaseCJKDateTimeExtractor(new JapaneseDateTimeExtractorConfiguration(defaultConfig));
581582
case DateTimeExtractors.DateTimePeriod:
582-
return new DateTime.Japanese.JapaneseDateTimePeriodExtractorConfiguration();
583+
return new BaseCJKDateTimePeriodExtractor(new JapaneseDateTimePeriodExtractorConfiguration(defaultConfig));
583584
case DateTimeExtractors.Duration:
584-
return new DateTime.Japanese.JapaneseDurationExtractorConfiguration();
585+
return new BaseCJKDurationExtractor(new JapaneseDurationExtractorConfiguration(defaultConfig));
585586
case DateTimeExtractors.Holiday:
586-
return new BaseHolidayExtractor(new DateTime.Japanese.JapaneseHolidayExtractorConfiguration(defaultConfig));
587+
return new BaseCJKHolidayExtractor(new JapaneseHolidayExtractorConfiguration(defaultConfig));
587588
case DateTimeExtractors.Set:
588-
return new DateTime.Japanese.JapaneseSetExtractorConfiguration();
589+
return new BaseCJKSetExtractor(new JapaneseSetExtractorConfiguration(defaultConfig));
589590
case DateTimeExtractors.Merged:
590-
return new DateTime.Japanese.JapaneseMergedExtractorConfiguration(defaultConfig);
591+
return new BaseCJKMergedDateTimeExtractor(new JapaneseMergedExtractorConfiguration(defaultConfig));
591592
case DateTimeExtractors.MergedSkipFromTo:
592-
return new DateTime.Japanese.JapaneseMergedExtractorConfiguration(skipConfig);
593+
return new BaseCJKMergedDateTimeExtractor(new JapaneseMergedExtractorConfiguration(skipConfig));
593594
}
594595

595596
throw new Exception($"Extractor '{extractorName}' for Japanese not supported");
596597
}
597598

598599
public static IDateTimeParser GetJapaneseParser(DateTimeParsers parserName)
599600
{
600-
601-
var config = new BaseDateTimeOptionsConfiguration(Culture.Japanese, DateTimeOptions.None);
601+
var config = new JapaneseCommonDateTimeParserConfiguration(new BaseDateTimeOptionsConfiguration(Culture.Japanese, DateTimeOptions.None));
602602

603603
switch (parserName)
604604
{
605605
case DateTimeParsers.Date:
606-
return new DateTime.Japanese.JapaneseDateParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
606+
return new BaseCJKDateParser(new JapaneseDateParserConfiguration(config));
607607
case DateTimeParsers.Time:
608-
return new DateTime.Japanese.JapaneseTimeParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
608+
return new BaseCJKTimeParser(new JapaneseTimeParserConfiguration(config));
609609
case DateTimeParsers.DatePeriod:
610-
return new DateTime.Japanese.JapaneseDatePeriodParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
610+
return new BaseCJKDatePeriodParser(new JapaneseDatePeriodParserConfiguration(config));
611611
case DateTimeParsers.TimePeriod:
612-
return new DateTime.Japanese.JapaneseTimePeriodParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
612+
return new BaseCJKTimePeriodParser(new JapaneseTimePeriodParserConfiguration(config));
613613
case DateTimeParsers.DateTime:
614-
return new DateTime.Japanese.JapaneseDateTimeParser(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
614+
return new BaseCJKDateTimeParser(new JapaneseDateTimeParserConfiguration(config));
615615
case DateTimeParsers.DateTimePeriod:
616-
return new DateTime.Japanese.JapaneseDateTimePeriodParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
616+
return new BaseCJKDateTimePeriodParser(new JapaneseDateTimePeriodParserConfiguration(config));
617617
case DateTimeParsers.Duration:
618-
return new DateTime.Japanese.JapaneseDurationParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
618+
return new BaseCJKDurationParser(new JapaneseDurationParserConfiguration(config));
619619
case DateTimeParsers.Holiday:
620-
return new DateTime.Japanese.JapaneseHolidayParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
620+
return new BaseCJKHolidayParser(new JapaneseHolidayParserConfiguration(config));
621621
case DateTimeParsers.Set:
622-
return new DateTime.Japanese.JapaneseSetParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
622+
return new BaseCJKSetParser(new JapaneseSetParserConfiguration(config));
623623
case DateTimeParsers.Merged:
624-
return new FullDateTimeParser(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
624+
return new BaseCJKMergedDateTimeParser(new JapaneseMergedParserConfiguration(config));
625625
}
626626

627627
throw new Exception($"Parser '{parserName}' for Japanese not supported");

.NET/Microsoft.Recognizers.Text.DateTime/Chinese/Extractors/ChineseDatePeriodExtractorConfiguration.cs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ public class ChineseDatePeriodExtractorConfiguration : BaseDateTimeOptionsConfig
2525
// 2017.12, 2017-12, 2017/12, 12/2017
2626
public static readonly Regex PureNumYearAndMonth = new Regex(DateTimeDefinitions.PureNumYearAndMonth, RegexFlags);
2727

28+
public static readonly Regex SimpleYearAndMonth = new Regex(DateTimeDefinitions.SimpleYearAndMonth, RegexFlags);
29+
2830
public static readonly Regex OneWordPeriodRegex = new Regex(DateTimeDefinitions.OneWordPeriodRegex, RegexFlags);
2931

3032
public static readonly Regex WeekOfMonthRegex = new Regex(DateTimeDefinitions.WeekOfMonthRegex, RegexFlags);
@@ -41,6 +43,10 @@ public class ChineseDatePeriodExtractorConfiguration : BaseDateTimeOptionsConfig
4143

4244
public static readonly Regex MonthToMonthSuffixRequired = new Regex(DateTimeDefinitions.MonthToMonthSuffixRequired, RegexFlags);
4345

46+
public static readonly Regex DayToDay = new Regex(DateTimeDefinitions.DayToDay, RegexFlags);
47+
48+
public static readonly Regex DayRegexForPeriod = new Regex(DateTimeDefinitions.DayRegexForPeriod, RegexFlags);
49+
4450
public static readonly Regex PastRegex = new Regex(DateTimeDefinitions.PastRegex, RegexFlags);
4551

4652
public static readonly Regex FutureRegex = new Regex(DateTimeDefinitions.FutureRegex, RegexFlags);
@@ -51,6 +57,10 @@ public class ChineseDatePeriodExtractorConfiguration : BaseDateTimeOptionsConfig
5157

5258
public static readonly Regex DecadeRegex = new Regex(DateTimeDefinitions.DecadeRegex, RegexFlags);
5359

60+
public static readonly Regex SpecialMonthRegex = new Regex(DateTimeDefinitions.SpecialMonthRegex, RegexFlags);
61+
62+
public static readonly Regex SpecialYearRegex = new Regex(DateTimeDefinitions.SpecialYearRegex, RegexFlags);
63+
5464
public static readonly Regex DayRegex = new Regex(DateTimeDefinitions.DayRegex, RegexFlags);
5565
public static readonly Regex DayRegexInCJK = new Regex(DateTimeDefinitions.DatePeriodDayRegexInCJK, RegexFlags);
5666
public static readonly Regex MonthNumRegex = new Regex(DateTimeDefinitions.MonthNumRegex, RegexFlags);
@@ -80,6 +90,8 @@ public class ChineseDatePeriodExtractorConfiguration : BaseDateTimeOptionsConfig
8090
YearAndMonth,
8191
PureNumYearAndMonth,
8292
YearInCJKRegex,
93+
SpecialMonthRegex,
94+
SpecialYearRegex,
8395
WeekOfMonthRegex,
8496
SeasonWithYear,
8597
QuarterRegex,

0 commit comments

Comments
 (0)