|
| 1 | +/* |
| 2 | + * NOTICE |
| 3 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | + * you may not use this file except in compliance with the License. |
| 5 | + * You may obtain a copy of the License at |
| 6 | + * |
| 7 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | + * |
| 9 | + * Unless required by applicable law or agreed to in writing, software |
| 10 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | + * See the License for the specific language governing permissions and |
| 13 | + * limitations under the License. |
| 14 | + * |
| 15 | + * ************************************************************************** |
| 16 | + * NOTICE |
| 17 | + * This software was produced for the U. S. Government under Contract No. |
| 18 | + * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer |
| 19 | + * Software and Noncommercial Computer Software Documentation Clause |
| 20 | + * 252.227-7014 (JUN 1995) |
| 21 | + * |
| 22 | + * (c) 2009-2013 The MITRE Corporation. All Rights Reserved. |
| 23 | + * ************************************************************************** |
| 24 | + */ |
| 25 | + |
| 26 | +// ALL Patterns below - defines, rules, etc. -- are for MATCHING. |
| 27 | +// Parsing of actual fields named in defines is done after matches are found. |
| 28 | +// Validation of parsed fields is last. |
| 29 | + |
| 30 | +# Well-known month abbreviations. |
| 31 | +#DEFINE MON_ABBREV JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEPT?|OCT|NOV|DEC |
| 32 | + |
| 33 | +# A name starting with 3 ASCII letters as above, but followed by other letters, possibly not English or ASCII. |
| 34 | +# Detection of month/day/year patterns with non-English month names is only a coincidence if they share a common prefix. |
| 35 | +# Locales for date patterns and language options could be explored further, but that is beyond the scope of this file. |
| 36 | +#DEFINE MON_NAME JAN\w{0,6}|FEB\w{0,6}|MAR\w{0,2}|APR\w{0,3}|MAY|JUN\w{0,2}|JUL\w{0,2}|AUG\w{0,3}|SEP\w{0,6}|OCT\w{0,6}|NOV\w{0,6}|DEC\w{0,6} |
| 37 | +#DEFINE DAY_ENUM th|nd|rd|st |
| 38 | + |
| 39 | +# Fixed length fields |
| 40 | +// In all practicality, year is 1xxx or 2xxx. Years 0001 to 0999 not really considered. |
| 41 | +#DEFINE YEAR [12]\d{3} |
| 42 | +#DEFINE YY '?\d\d |
| 43 | + |
| 44 | +// YEARYY matches either a 2-digit year, optionally preceded by an apostrophe, or a 4-digit year; a 3-digit year does not match. This is only used for matching. XTemp still validates matches. |
| 45 | +// '76 |
| 46 | +// |
| 47 | +#DEFINE YEARYY '?\d{2}|\d{4} |
| 48 | +#DEFINE MM [01]\d |
| 49 | +#DEFINE DD [0-3]\d |
| 50 | +#DEFINE SHORT_TZ [A-Z] |
| 51 | + |
| 52 | +#DEFINE hh [0-2]\d |
| 53 | +#DEFINE mm [0-5]\d |
| 54 | +#DEFINE ss [0-5]\d |
| 55 | + |
| 56 | + |
| 57 | +# Variable length |
| 58 | +#DEFINE DOM [0-3]?\d |
| 59 | +#DEFINE MONTH [01]?\d |
| 60 | +#DEFINE LONG_TZ [A-Z]{3,5} |
| 61 | +# Variable length Day/Month digits: |
| 62 | +#DEFINE DM1 [0-3]?\d |
| 63 | +#DEFINE DM2 [0-3]?\d |
| 64 | + |
| 65 | +#DEFINE OF of|Of|OF |
| 66 | +# Do not use DSEP? (that is an optional separator.) Most field-separated patterns would be too noisy. |
| 67 | +#DEFINE DSEP1 [-/.] |
| 68 | +#DEFINE DSEP2 [-/.] |
| 69 | + |
| 70 | +// ........................................ |
| 71 | +// Month, Day, Year patterns, MDY |
| 72 | +// ........................................ |
| 73 | +// FORM: DATE: MM/DD/YY |
| 74 | +#CLASS MDY opensextant.extractors.xtemporal.DateTimeMatch |
| 75 | +#RULE MDY 01 \b<DM1>/<DM2>/<YY>\b |
| 76 | +#TEST MDY 01 12/30/90 |
| 77 | +#TEST MDY 01 DATE: 12/30/90 |
| 78 | +#TEST MDY 01 DATE: 30/07/90 Jul 30, European locale |
| 79 | +#TEST MDY 01 30/12/15 Dec 30, European locale |
| 80 | +#TEST MDY 01 30/14/15 FAIL 14 is invalid |
| 81 | +#TEST MDY 01 13/30/90 FAIL bad MON |
| 82 | +#TEST MDY 01 12/32/90 FAIL bad DOM |
| 83 | +#TEST MDY 01 12/30/01 |
| 84 | +#TEST MDY 01 12/30/00 |
| 85 | +#TEST MDY 01 12/30/55 |
| 86 | +#TEST MDY 01 1/30/55 |
| 87 | +#TEST MDY 01 12/30/15 |
| 88 | +#TEST MDY 01 12/1/15 |
| 89 | +#TEST MDY 01 12/01/15 |
| 90 | +#TEST MDY 01 15/01/15 Jan 15, 2015 |
| 91 | + |
| 92 | + |
| 93 | +// FORM: DATE: MM/DD/YYYY |
| 94 | +#RULE MDY 02 \b<DM1>/<DM2>/<YEAR>\b |
| 95 | +#TEST MDY 02 12/30/1990 |
| 96 | +#TEST MDY 02 DATE: 12/30/1990 |
| 97 | +#TEST MDY 02 13/30/1990 FAIL bad MON |
| 98 | +#TEST MDY 02 12/32/1990 FAIL bad DOM |
| 99 | +#TEST MDY 02 12/30/2001 |
| 100 | +#TEST MDY 02 12/30/0000 FAIL bad YYYY |
| 101 | +#TEST MDY 02 12/30/1955 |
| 102 | +#TEST MDY 02 12/30/1915 |
| 103 | + |
| 104 | +// FORM: MMM DD, YYYY or MMM DD YYYY, MMM DD, YY, etc. |
| 105 | +#RULE MDY 03 \b<MON_NAME>[\s.]+<DOM>[\s,.]+<YEARYY>\b |
| 106 | +#TEST MDY 03 DEC 30, 1990 |
| 107 | +#TEST MDY 03 DEC 30 1990 |
| 108 | +#TEST MDY 03 DEC.30 1990 |
| 109 | +#TEST MDY 03 DEC. 30 1990 |
| 110 | +#TEST MDY 03 DEC.30.1990 |
| 111 | +#TEST MDY 03 DEC 30 90 |
| 112 | +#TEST MDY 03 DEC 30 990 FAIL bad year |
| 113 | +#TEST MDY 03 DEC 00 1990 FAIL bad DOM |
| 114 | +#TEST MDY 03 DEC 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected |
| 115 | +#TEST MDY 03 DECEMBER 30 1990 |
| 116 | +#TEST MDY 03 DECMEBER 30 90 |
| 117 | +#TEST MDY 03 DECIEMBRE 30 1990 |
| 118 | +#TEST MDY 03 DECIEMBRE 00 1990 # FAIL no 00 day |
| 119 | +#TEST MDY 03 DECEMBER 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected |
| 120 | + |
| 121 | + |
| 122 | +// FORM: MMM, YYYY or Month, YYYY -- the comma is optional; a 4-digit year is required. |
| 123 | +#RULE MDY 04 \b<MON_NAME>[\s,.]+<YEAR>\b |
| 124 | +#TEST MDY 04 DEC 1990 |
| 125 | +#TEST MDY 04 DEC, 1990 |
| 126 | +#TEST MDY 04 DEC. 1990 |
| 127 | +#TEST MDY 04 DECEMBER, 1990 |
| 128 | +#TEST MDY 04 DECIEMBRE, 1990 |
| 129 | +#TEST MDY 04 DÉCEMBRE, 1990 |
| 130 | +#TEST MDY 04 DÉC, 1990 |
| 131 | + |
| 132 | +// FORM: MMM of YYYY |
| 133 | +#RULE MDY 04a \b<MON_NAME>\s+<OF>\s+<YEAR>\b |
| 134 | +#TEST MDY 04a DEC of 1990 |
| 135 | +#TEST MDY 04a DECEMBER of 1990 |
| 136 | + |
| 137 | +#RULE MDY 05 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY>\b |
| 138 | +#TEST MDY 05 30 DEC 1990 |
| 139 | +#TEST MDY 05 30 DEC 90 |
| 140 | +#TEST MDY 05 01 DEC 00 |
| 141 | +#TEST MDY 05 01 DEC 02 |
| 142 | +#TEST MDY 05 30 DECEMBER 1990 |
| 143 | +#TEST MDY 05 30 DECMEBER 1990 |
| 144 | +#TEST MDY 05 30 DECIEMBRE 1990 |
| 145 | + |
| 146 | +#RULE MDY 06a \b<MON_NAME>[\s.]+<DOM><DAY_ENUM>[\s,]+<YEAR>\b |
| 147 | +#TEST MDY 06a September 19th, 2017 |
| 148 | +#TEST MDY 06a September 19th, 17 # FAIL |
| 149 | +#TEST MDY 06a September 19 th, 17 # FAIL |
| 150 | +#TEST MDY 06a Sept. 19th, 2017 |
| 151 | +#TEST MDY 06a Sept 19th, 2017 |
| 152 | +#TEST MDY 06a Sept 1st, 2017 |
| 153 | +#TEST MDY 06a Sept 23rd, 2017 |
| 154 | +#TEST MDY 06a Sept 15th, 2017 |
| 155 | +#TEST MDY 06a Sept 22nd, 2017 |
| 156 | + |
| 157 | +#RULE MDY 06b \b<DOM><DAY_ENUM>\s+<OF>?\s*<MON_NAME>[\s,]+<YEAR>\b |
| 158 | +#TEST MDY 06b 19th September, 2017 |
| 159 | +#TEST MDY 06b 19th of September, 2017 |
| 160 | + |
| 161 | +#CLASS DMYT opensextant.extractors.xtemporal.DateTimeMatch |
| 162 | +#RULE DMYT 01 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY> <hh>:?<mm>\b |
| 163 | +#TEST DMYT 01 30 DEC 1990 0400 |
| 164 | +#TEST DMYT 01 30 DEC 90 0400 |
| 165 | +#TEST DMYT 01 11 JUN 14 1815 06:15 PM, 11 JUNE 2014 |
| 166 | +#TEST DMYT 01 25 March 2012 04:00 |
| 167 | +#TEST DMYT 01 25 March, 2012 04:00 |
| 168 | + |
| 169 | + |
| 170 | + |
| 171 | +// FORM: DATE: DD-MON-YYYY |
| 172 | +#CLASS DMY opensextant.extractors.xtemporal.DateTimeMatch |
| 173 | +#RULE DMY 01 \b<DOM>-<MON_NAME>-<YEARYY>\b |
| 174 | +#TEST DMY 01 12-DEC-90 |
| 175 | +#TEST DMY 01 12-DEC-1990 |
| 176 | + |
| 177 | +// FORM: DATE: DD MON YYYY or DDMONYYYY -- whitespace between fields is optional; a 2-digit year also matches. |
| 178 | +#RULE DMY 02 \b<DOM>\s*<MON_NAME>\s*<YEARYY>\b |
| 179 | +#TEST DMY 02 12 DEC 90 |
| 180 | +#TEST DMY 02 12 DEC 1990 |
| 181 | +#TEST DMY 02 12DEC90 |
| 182 | +#TEST DMY 02 12DEC1990 |
| 183 | +#TEST DMY 02 12MARCH1999 |
| 184 | +#TEST DMY 02 12FEBBRAIO1999 |
| 185 | +#TEST DMY 02 12JUL1999 |
| 186 | +#TEST DMY 02 12JULIO1999 |
| 187 | + |
| 188 | +// FORM: DATE: YYYY-MM-DD as it appears in free text of documents. |
| 189 | +// The limitations of this pattern are related to how it was used. |
| 190 | +// This is a relatively modern format; was this format used in text in 1700s? |
| 191 | +#CLASS YMD opensextant.extractors.xtemporal.DateTimeMatch |
| 192 | +#RULE YMD 01 \b<YEAR><DSEP1><MM><DSEP2><DOM>\b |
| 193 | +#TEST YMD 01 2001-11-11 |
| 194 | +#TEST YMD 01 0001-04-34 # FAIL |
| 195 | +#TEST YMD 01 1001-04-30 # FAIL |
| 196 | +#TEST YMD 01 2001-04-30 |
| 197 | +#TEST YMD 01 1990-04-30 |
| 198 | +#TEST YMD 01 1790-04-30 # FAIL -- 1800 01 01 is earliest date for this pattern. |
| 199 | +#TEST YMD 01 a2001-04-30 # FAIL |
| 200 | +#TEST YMD 01 c2001-04-30 # FAIL |
| 201 | +#TEST YMD 01 42001-04-30 # FAIL |
| 202 | + |
| 203 | + |
| 204 | +// ........................................ |
| 205 | +// DATE TIME PATTERNS, DTM |
| 206 | +// ........................................ |
| 207 | +#CLASS DTM opensextant.extractors.xtemporal.DateTimeMatch |
| 208 | + |
| 209 | +// FORM: A|O|P|R DDHHMMZ MMM YY |
| 210 | +#RULE DTM 01 \b<DD><hh><mm><SHORT_TZ>\s*<MON_ABBREV>\s*<YY>\b |
| 211 | +#TEST DTM 01 A 301400Z DEC 90 |
| 212 | +#TEST DTM 01 R 301400Z DEC 90 |
| 213 | +#TEST DTM 01 A 351400Z DEC 90 # FAIL day out of range |
| 214 | + |
| 215 | +// FORM: YYYYMMDDTHHMMZ |
| 216 | +#RULE DTM 02 \b<YEAR><MM><DD>T<hh><mm><SHORT_TZ>\b |
| 217 | +#TEST DTM 02 20101230T1400Z |
| 218 | + |
| 219 | +// FORM: YYYYMMDDTHHMM ZZZ |
| 220 | +#RULE DTM 02a \b<YEAR><MM><DD>T<hh><mm> <LONG_TZ>\b |
| 221 | +#TEST DTM 02a 20101230T1400 EST # UTC-0500. parses to 1900 UTC |
| 222 | +#TEST DTM 02a 20101230T1400 BNT # UTC+0800. parses to 0600 UTC |
| 223 | +#TEST DTM 02a 20101230T1400 XXX # FAIL -- invalid TZ |
| 224 | +#TEST DTM 02a 20101230T1400 PST # UTC-0800. parses to UTC |
| 225 | + |
| 226 | +// FORM: YYYY-MM-DDTHH:MM:SS ... ISO Time. ISO 8601 uses "-", not "/". But should be validated in normalization. |
| 227 | +// DTM 04 is collapsed into this pattern. |
| 228 | +#RULE DTM 03 \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm>:<ss> |
| 229 | +#TEST DTM 03 2010-12-30T14:00:01:12 |
| 230 | +#TEST DTM 03 2010-12-30T14:00:02 |
| 231 | + |
| 232 | +// FORM: YYYY-MM-DDTHH:MM ... ISO Time. See 03 above. |
| 233 | +#RULE DTM 03b \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm> |
| 234 | +#TEST DTM 03b 2010-12-30T14:01:11:12 |
| 235 | +#TEST DTM 03b 2010-12-30T14:02:12 |
| 236 | +#TEST DTM 03b 2010-12/30T14:03:13 # FAIL |
| 237 | + |
| 238 | +// FORM: MM/DD/YY* HH:MM:SS. |
| 239 | +// TODO: 12-hour clock time and detect PM/AM. This HH:MM is only 24 hour clock. |
| 240 | +#RULE DTM 05a \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm>:<ss> |
| 241 | +#TEST DTM 05a 12-30-20 14:00:00:12 # extra data associated with value? |
| 242 | +#TEST DTM 05a 12-30-2020 14:00:01 |
| 243 | +#TEST DTM 05a 12/30-2020 14:00:02 # FAIL Test mixed punctuation. |
| 244 | +#TEST DTM 05a 12/30/2020 14:00:02 # Test mixed punctuation. |
| 245 | +#TEST DTM 05a 12/30/20 14:00 # Test mixed punctuation. |
| 246 | + |
| 247 | +// FORM: MM/DD/YY* HH:MM. |
| 248 | +// TODO: 12-hour clock time and detect PM/AM. This HH:MM is only 24 hour clock. |
| 249 | +#RULE DTM 05b \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm> |
| 250 | +#TEST DTM 05b 12-30-20 14:00:00:12 # extra data associated with value? |
| 251 | +#TEST DTM 05b 12-30-2020 14:01:01 |
| 252 | +#TEST DTM 05b 12/30-2020 14:02:02 # FAIL Test mixed punctuation. |
| 253 | +#TEST DTM 05b 12/30/2020 14:03:02 # Test mixed punctuation. |
| 254 | +#TEST DTM 05b 12/30/20 14:04 # Test mixed punctuation. |
| 255 | +#TEST DTM 05b 12.30.20 14:04 # European convention for Date. |
| 256 | +#TEST DTM 05b 2.30.20 14:04 # FAIL No 30 FEB. European convention for Date. Non-zero padded |
| 257 | +#TEST DTM 05b 2.30/2020 14:04 # FAIL No 30 FEB., Mixed Separators. European convention for Date. Non-zero padded. |
| 258 | +#TEST DTM 05b 4.30.20 14:04 # European convention for Date. Non-zero padded |
| 259 | +#TEST DTM 05b 4.30/2020 14:04 # FAIL Mixed Separators. European convention for Date. Non-zero padded. |
| 260 | + |
| 261 | + |
| 262 | +// An 8-digit date is great when used in short spans of text, but as a general pattern this matches any 8-digit number. |
| 263 | +// Not very accurate on data or large texts. |
| 264 | +// FORM: YYYYMMDD |
| 265 | +//#RULE DTM xx \b<YEAR><MM><DD>\b |
| 266 | +//#TEST DTM xx 20101230 |
| 267 | + |
0 commit comments