Skip to content

Commit 2a77424

Browse files
committed
Release of python v1.6.2
1 parent 5991d29 commit 2a77424

3 files changed

Lines changed: 897 additions & 0 deletions

File tree

src/main/python/MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
include LICENSE
2+
# NOTE - all items here are copied from Java src/main/resources (ant script)
3+
# except -- *_py.cfg for xtemp and xcoord extractors
24
include opensextant/resources/geocoord_patterns_py.cfg
35
include opensextant/resources/country-names-2021.csv
46
include opensextant/resources/country-names-2015.csv
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
/*
2+
* NOTICE
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*
15+
* **************************************************************************
16+
* NOTICE
17+
* This software was produced for the U. S. Government under Contract No.
18+
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
19+
* Software and Noncommercial Computer Software Documentation Clause
20+
* 252.227-7014 (JUN 1995)
21+
*
22+
* (c) 2009-2013 The MITRE Corporation. All Rights Reserved.
23+
* **************************************************************************
24+
*/
25+
26+
// ALL Patterns below - defines, rules, etc. -- are for MATCHING.
27+
// Parsing of actual fields named in defines is done after matches are found.
28+
// Validation of parsed fields is last.
29+
30+
# Well-known month abbreviations.
31+
#DEFINE MON_ABBREV JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEPT?|OCT|NOV|DEC
32+
33+
# A name starting with 3 ASCII letters as above, but followed by other letters, possibly not English or ASCII.
34+
# Month/day/year patterns with non-English month names are detected only by coincidence, when the name shares a common prefix with the English names above.
35+
# Locales for date patterns and language options could be explored further, but that is beyond the current scope.
36+
#DEFINE MON_NAME JAN\w{0,6}|FEB\w{0,6}|MAR\w{0,2}|APR\w{0,3}|MAY|JUN\w{0,2}|JUL\w{0,2}|AUG\w{0,3}|SEP\w{0,6}|OCT\w{0,6}|NOV\w{0,6}|DEC\w{0,6}
37+
#DEFINE DAY_ENUM th|nd|rd|st
38+
39+
# Fixed length fields
40+
// In practice, a year is 1xxx or 2xxx; years 0001 to 0999 are not considered.
41+
#DEFINE YEAR [12]\d{3}
42+
#DEFINE YY '?\d\d
43+
44+
// YEARYY is a 2-digit year (optionally preceded by an apostrophe) or a 4-digit year; 3-digit years are not matched. This is only used for matching; XTemp still validates matches.
45+
// '76
46+
//
47+
#DEFINE YEARYY '?\d{2}|\d{4}
48+
#DEFINE MM [01]\d
49+
#DEFINE DD [0-3]\d
50+
#DEFINE SHORT_TZ [A-Z]
51+
52+
#DEFINE hh [0-2]\d
53+
#DEFINE mm [0-5]\d
54+
#DEFINE ss [0-5]\d
55+
56+
57+
# Variable length
58+
#DEFINE DOM [0-3]?\d
59+
#DEFINE MONTH [01]?\d
60+
#DEFINE LONG_TZ [A-Z]{3,5}
61+
# Variable length Day/Month digits:
62+
#DEFINE DM1 [0-3]?\d
63+
#DEFINE DM2 [0-3]?\d
64+
65+
#DEFINE OF of|Of|OF
66+
# Do not use DSEP? (that is an optional separator.) Most field-separated patterns would be too noisy.
67+
#DEFINE DSEP1 [-/.]
68+
#DEFINE DSEP2 [-/.]
69+
70+
// ........................................
71+
// Month, Day, Year patterns, MDY
72+
// ........................................
73+
// FORM: DATE: MM/DD/YY
74+
#CLASS MDY opensextant.extractors.xtemporal.DateTimeMatch
75+
#RULE MDY 01 \b<DM1>/<DM2>/<YY>\b
76+
#TEST MDY 01 12/30/90
77+
#TEST MDY 01 DATE: 12/30/90
78+
#TEST MDY 01 DATE: 30/07/90 Jul 30, European locale
79+
#TEST MDY 01 30/12/15 Dec 30, European locale
80+
#TEST MDY 01 30/14/15 FAIL 14 is invalid
81+
#TEST MDY 01 13/30/90 FAIL bad MON
82+
#TEST MDY 01 12/32/90 FAIL bad DOM
83+
#TEST MDY 01 12/30/01
84+
#TEST MDY 01 12/30/00
85+
#TEST MDY 01 12/30/55
86+
#TEST MDY 01 1/30/55
87+
#TEST MDY 01 12/30/15
88+
#TEST MDY 01 12/1/15
89+
#TEST MDY 01 12/01/15
90+
#TEST MDY 01 15/01/15 Jan 15, 2015
91+
92+
93+
// FORM: DATE: MM/DD/YYYY
94+
#RULE MDY 02 \b<DM1>/<DM2>/<YEAR>\b
95+
#TEST MDY 02 12/30/1990
96+
#TEST MDY 02 DATE: 12/30/1990
97+
#TEST MDY 02 13/30/1990 FAIL bad MON
98+
#TEST MDY 02 12/32/1990 FAIL bad DOM
99+
#TEST MDY 02 12/30/2001
100+
#TEST MDY 02 12/30/0000 FAIL bad YYYY
101+
#TEST MDY 02 12/30/1955
102+
#TEST MDY 02 12/30/1915
103+
104+
// FORM: MMM DD, YYYY or MMM DD YYYY, MMM DD, YY, etc.
105+
#RULE MDY 03 \b<MON_NAME>[\s.]+<DOM>[\s,.]+<YEARYY>\b
106+
#TEST MDY 03 DEC 30, 1990
107+
#TEST MDY 03 DEC 30 1990
108+
#TEST MDY 03 DEC.30 1990
109+
#TEST MDY 03 DEC. 30 1990
110+
#TEST MDY 03 DEC.30.1990
111+
#TEST MDY 03 DEC 30 90
112+
#TEST MDY 03 DEC 30 990 FAIL bad year
113+
#TEST MDY 03 DEC 00 1990 FAIL bad DOM
114+
#TEST MDY 03 DEC 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected
115+
#TEST MDY 03 DECEMBER 30 1990
116+
#TEST MDY 03 DECMEBER 30 90
117+
#TEST MDY 03 DECIEMBRE 30 1990
118+
#TEST MDY 03 DECIEMBRE 00 1990 # FAIL no 00 day
119+
#TEST MDY 03 DECEMBER 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected
120+
121+
122+
// FORM: MMM, YYYY or Month, YYYY comma optional. 4-digit year required
123+
#RULE MDY 04 \b<MON_NAME>[\s,.]+<YEAR>\b
124+
#TEST MDY 04 DEC 1990
125+
#TEST MDY 04 DEC, 1990
126+
#TEST MDY 04 DEC. 1990
127+
#TEST MDY 04 DECEMBER, 1990
128+
#TEST MDY 04 DECIEMBRE, 1990
129+
#TEST MDY 04 DÉCEMBRE, 1990
130+
#TEST MDY 04 DÉC, 1990
131+
132+
// FORM: MMM of YYYY
133+
#RULE MDY 04a \b<MON_NAME>\s+<OF>\s+<YEAR>\b
134+
#TEST MDY 04a DEC of 1990
135+
#TEST MDY 04a DECEMBER of 1990
136+
137+
#RULE MDY 05 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY>\b
138+
#TEST MDY 05 30 DEC 1990
139+
#TEST MDY 05 30 DEC 90
140+
#TEST MDY 05 01 DEC 00
141+
#TEST MDY 05 01 DEC 02
142+
#TEST MDY 05 30 DECEMBER 1990
143+
#TEST MDY 05 30 DECMEBER 1990
144+
#TEST MDY 05 30 DECIEMBRE 1990
145+
146+
#RULE MDY 06a \b<MON_NAME>[\s.]+<DOM><DAY_ENUM>[\s,]+<YEAR>\b
147+
#TEST MDY 06a September 19th, 2017
148+
#TEST MDY 06a September 19th, 17 # FAIL
149+
#TEST MDY 06a September 19 th, 17 # FAIL
150+
#TEST MDY 06a Sept. 19th, 2017
151+
#TEST MDY 06a Sept 19th, 2017
152+
#TEST MDY 06a Sept 1st, 2017
153+
#TEST MDY 06a Sept 23rd, 2017
154+
#TEST MDY 06a Sept 15th, 2017
155+
#TEST MDY 06a Sept 22nd, 2017
156+
157+
#RULE MDY 06b \b<DOM><DAY_ENUM>\s+<OF>?\s*<MON_NAME>[\s,]+<YEAR>\b
158+
#TEST MDY 06b 19th September, 2017
159+
#TEST MDY 06b 19th of September, 2017
160+
161+
#CLASS DMYT opensextant.extractors.xtemporal.DateTimeMatch
162+
#RULE DMYT 01 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY> <hh>:?<mm>\b
163+
#TEST DMYT 01 30 DEC 1990 0400
164+
#TEST DMYT 01 30 DEC 90 0400
165+
#TEST DMYT 01 11 JUN 14 1815 06:15 PM, 11 JUNE 2014
166+
#TEST DMYT 01 25 March 2012 04:00
167+
#TEST DMYT 01 25 March, 2012 04:00
168+
169+
170+
171+
// FORM: DATE: DD-MON-YYYY
172+
#CLASS DMY opensextant.extractors.xtemporal.DateTimeMatch
173+
#RULE DMY 01 \b<DOM>-<MON_NAME>-<YEARYY>\b
174+
#TEST DMY 01 12-DEC-90
175+
#TEST DMY 01 12-DEC-1990
176+
177+
// FORM: DATE: DD MON YYYY
178+
#RULE DMY 02 \b<DOM>\s*<MON_NAME>\s*<YEARYY>\b
179+
#TEST DMY 02 12 DEC 90
180+
#TEST DMY 02 12 DEC 1990
181+
#TEST DMY 02 12DEC90
182+
#TEST DMY 02 12DEC1990
183+
#TEST DMY 02 12MARCH1999
184+
#TEST DMY 02 12FEBBRAIO1999
185+
#TEST DMY 02 12JUL1999
186+
#TEST DMY 02 12JULIO1999
187+
188+
// FORM: DATE: YYYY-MM-DD as it appears in free text of documents.
189+
// The limitations of this pattern are related to how it was used.
190+
// This is a relatively modern format; was this format used in text in the 1700s?
191+
#CLASS YMD opensextant.extractors.xtemporal.DateTimeMatch
192+
#RULE YMD 01 \b<YEAR><DSEP1><MM><DSEP2><DOM>\b
193+
#TEST YMD 01 2001-11-11
194+
#TEST YMD 01 0001-04-34 # FAIL
195+
#TEST YMD 01 1001-04-30 # FAIL
196+
#TEST YMD 01 2001-04-30
197+
#TEST YMD 01 1990-04-30
198+
#TEST YMD 01 1790-04-30 # FAIL -- 1800 01 01 is earliest date for this pattern.
199+
#TEST YMD 01 a2001-04-30 # FAIL
200+
#TEST YMD 01 c2001-04-30 # FAIL
201+
#TEST YMD 01 42001-04-30 # FAIL
202+
203+
204+
// ........................................
205+
// DATE TIME PATTERNS, DTM
206+
// ........................................
207+
#CLASS DTM opensextant.extractors.xtemporal.DateTimeMatch
208+
209+
// FORM: A|O|P|R DDHHMMZ MMM YY
210+
#RULE DTM 01 \b<DD><hh><mm><SHORT_TZ>\s*<MON_ABBREV>\s*<YY>\b
211+
#TEST DTM 01 A 301400Z DEC 90
212+
#TEST DTM 01 R 301400Z DEC 90
213+
#TEST DTM 01 A 351400Z DEC 90 # FAIL day out of range
214+
215+
// FORM: YYYYMMDDTHHMMZ
216+
#RULE DTM 02 \b<YEAR><MM><DD>T<hh><mm><SHORT_TZ>\b
217+
#TEST DTM 02 20101230T1400Z
218+
219+
// FORM: YYYYMMDDTHHMM ZZZ
220+
#RULE DTM 02a \b<YEAR><MM><DD>T<hh><mm> <LONG_TZ>\b
221+
#TEST DTM 02a 20101230T1400 EST # UTC-0500. parses to 1900 UTC
222+
#TEST DTM 02a 20101230T1400 BNT # UTC+0800. parses to 0600 UTC
223+
#TEST DTM 02a 20101230T1400 XXX # FAIL -- invalid TZ
224+
#TEST DTM 02a 20101230T1400 PST # UTC-0800. parses to UTC
225+
226+
// FORM: YYYY-MM-DDTHH:MM:SS ... ISO Time. ISO 8601 uses "-", not "/", but that should be validated in normalization.
227+
// DTM 04 is collapsed into this pattern.
228+
#RULE DTM 03 \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm>:<ss>
229+
#TEST DTM 03 2010-12-30T14:00:01:12
230+
#TEST DTM 03 2010-12-30T14:00:02
231+
232+
// FORM: YYYY-MM-DDTHH:MM ... ISO Time. See 03 above.
233+
#RULE DTM 03b \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm>
234+
#TEST DTM 03b 2010-12-30T14:01:11:12
235+
#TEST DTM 03b 2010-12-30T14:02:12
236+
#TEST DTM 03b 2010-12/30T14:03:13 # FAIL
237+
238+
// FORM: MM/DD/YY* HH:MM:SS.
239+
// TODO: support 12-hour clock time and detect AM/PM. This HH:MM pattern covers the 24-hour clock only.
240+
#RULE DTM 05a \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm>:<ss>
241+
#TEST DTM 05a 12-30-20 14:00:00:12 # extra data associated with value?
242+
#TEST DTM 05a 12-30-2020 14:00:01
243+
#TEST DTM 05a 12/30-2020 14:00:02 # FAIL Test mixed punctuation.
244+
#TEST DTM 05a 12/30/2020 14:00:02 # Test mixed punctuation.
245+
#TEST DTM 05a 12/30/20 14:00 # Test mixed punctuation.
246+
247+
// FORM: MM/DD/YY* HH:MM.
248+
// TODO: support 12-hour clock time and detect AM/PM. This HH:MM pattern covers the 24-hour clock only.
249+
#RULE DTM 05b \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm>
250+
#TEST DTM 05b 12-30-20 14:00:00:12 # extra data associated with value?
251+
#TEST DTM 05b 12-30-2020 14:01:01
252+
#TEST DTM 05b 12/30-2020 14:02:02 # FAIL Test mixed punctuation.
253+
#TEST DTM 05b 12/30/2020 14:03:02 # Test mixed punctuation.
254+
#TEST DTM 05b 12/30/20 14:04 # Test mixed punctuation.
255+
#TEST DTM 05b 12.30.20 14:04 # European convention for Date.
256+
#TEST DTM 05b 2.30.20 14:04 # FAIL No 30 FEB. European convention for Date. Non-zero padded
257+
#TEST DTM 05b 2.30/2020 14:04 # FAIL No 30 FEB., Mixed Separators. European convention for Date. Non-zero padded.
258+
#TEST DTM 05b 4.30.20 14:04 # European convention for Date. Non-zero padded
259+
#TEST DTM 05b 4.30/2020 14:04 # FAIL Mixed Separators. European convention for Date. Non-zero padded.
260+
261+
262+
// An 8-digit date is great when used in short spans of text, but as a general pattern it matches any 8-digit number.
263+
// Not very accurate on data or large texts.
264+
// FORM: YYYYMMDD
265+
//#RULE DTM xx \b<YEAR><MM><DD>\b
266+
//#TEST DTM xx 20101230
267+

0 commit comments

Comments
 (0)