Skip to content

Commit 1881b3f

Browse files
committed
Manage duplicates through better validation. Could use rules or a preference order of match-pattern families...
1 parent a2894d9 commit 1881b3f

3 files changed

Lines changed: 47 additions & 26 deletions

File tree

src/main/python/opensextant/extractors/xcoord.py

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,10 @@ def normalize(self):
119119
class DMSOrdinate:
120120
SYMBOLS = {"°", "º", "'", "\"", ":", "lat", "lon", "geo", "coord", "deg"}
121121

122-
def __init__(self, axis: str, text: str, slots=None):
122+
def __init__(self, axis: str, text: str, fam: str, slots=None):
123123
self.axis = axis
124124
self.text = text
125+
self.pattern_family = fam
125126
self.slots = slots
126127
self.degrees = None
127128
self.min = None
@@ -134,7 +135,7 @@ def __init__(self, axis: str, text: str, slots=None):
134135
self.normalize()
135136

136137
def is_valid(self):
137-
if not self.degrees:
138+
if self.degrees is None:
138139
return False
139140
# Must have degrees, in range for the axis
140141
if self.axis == "lat":
@@ -144,9 +145,9 @@ def is_valid(self):
144145
if not -180 < self.degrees < 180:
145146
return False
146147
# Min and Secs must be in range if specified
147-
if self.min and not 0 < self.min < 60:
148+
if self.min is not None and not 0 <= self.min < 60:
148149
return False
149-
if self.seconds and not 0 < self.seconds < 60:
150+
if self.seconds is not None and not 0 <= self.seconds < 60:
150151
return False
151152

152153
return True
@@ -216,6 +217,14 @@ def _digest_slots(self, axis):
216217
"""
217218
Fields or slots are named xxxLatxx or xxxLonxx
218219
"""
220+
if self.pattern_family == "DMS":
221+
min_sec_sep = self.slots.get(f"ms{axis}Sep")
222+
deg_min_sep = self.slots.get(f"dm{axis}Sep")
223+
if min_sec_sep and deg_min_sep and min_sec_sep == "." and min_sec_sep != deg_min_sep:
224+
# valid coordinate, but separators like "DD MM.ss" suggest more DM pattern
225+
# whereas "DD.MM.SS" with consistent separators is DMS.
226+
return
227+
219228
# DEGREES
220229
deg = self.get_int(f"deg{axis}", "deg")
221230
deg2 = self.get_int(f"dmsDeg{axis}", "deg")
@@ -310,6 +319,7 @@ def __init__(self, *args, **kwargs):
310319
self.lat_ordinate = None
311320
self.lon_ordinate = None
312321
self.filter = None
322+
self.pattern_family = self.pattern_id.split("-", 1)[0]
313323

314324
def __str__(self):
315325
return f"{self.text}"
@@ -361,10 +371,11 @@ def filter_out(self, mgrs: GeocoordMatch) -> tuple:
361371
# - is not a recent date;
362372
# - is not a rate ('NNN per LB');
363373
# - is not time with 'sec'
374+
# Lexical filters:
364375
if not mgrs.is_valid:
376+
# parsed earlier as invalid.
365377
return True, "invalid"
366378

367-
# Lexical filters:
368379
if not (mgrs.text.isupper() and len(mgrs.text.replace(" ", "")) > 6):
369380
return True, "lexical"
370381
parts = set(mgrs.text.split())
@@ -405,18 +416,20 @@ def filter_out(self, dms: GeocoordMatch) -> tuple:
405416
Easy filter -- if punctuation matches, this is an easy pattern to ignore.
406417
:return: True if filtered out, false positive.
407418
"""
408-
if not dms.is_valid:
409-
return True, "invalid"
410-
if dms.text[0].isalpha():
419+
if dms.is_valid:
420+
if dms.text[0].isalpha():
421+
return False, None
422+
for fmt in self.date_formats:
423+
try:
424+
dt = arrow.get(dms.text, fmt)
425+
# Recency matters not. Tests are literal date formats
426+
return True, "date"
427+
except Exception as err:
428+
pass
429+
# Not filtered. Is valid.
411430
return False, None
412-
for fmt in self.date_formats:
413-
try:
414-
dt = arrow.get(dms.text, fmt)
415-
# Recency matters not. Tests are literal date formats
416-
return True, "date"
417-
except Exception as err:
418-
pass
419-
return False, None
431+
# Filter out. invalid.
432+
return True, "invalid"
420433

421434

422435
mgrs_filter = MGRSFilter()
@@ -524,8 +537,8 @@ def normalize(self):
524537
# < hemiLonPre >\s? < degLon > < dmLonSep >\s? < minLon > < fractMinLon >? < msLonSep >?
525538

526539
# TODO: conditions that invalidate this pattern?
527-
self.lat_ordinate = DMSOrdinate("lat", self.text, slots=self.attributes())
528-
self.lon_ordinate = DMSOrdinate("lon", self.text, slots=self.attributes())
540+
self.lat_ordinate = DMSOrdinate("lat", self.text, self.pattern_family, slots=self.attributes())
541+
self.lon_ordinate = DMSOrdinate("lon", self.text, self.pattern_family, slots=self.attributes())
529542
self._make_coordinate()
530543
self.validate()
531544

@@ -537,8 +550,8 @@ def __init__(self, *args, **kwargs):
537550

538551
def normalize(self):
539552
GeocoordMatch.normalize(self)
540-
self.lat_ordinate = DMSOrdinate("lat", self.text, slots=self.attributes())
541-
self.lon_ordinate = DMSOrdinate("lon", self.text, slots=self.attributes())
553+
self.lat_ordinate = DMSOrdinate("lat", self.text, self.pattern_family, slots=self.attributes())
554+
self.lon_ordinate = DMSOrdinate("lon", self.text, self.pattern_family, slots=self.attributes())
542555
self._make_coordinate()
543556
self.validate()
544557

@@ -571,7 +584,7 @@ def validate(self):
571584

572585
def normalize(self):
573586
GeocoordMatch.normalize(self)
574-
self.lat_ordinate = DMSOrdinate("lat", self.text, slots=self.attributes())
575-
self.lon_ordinate = DMSOrdinate("lon", self.text, slots=self.attributes())
587+
self.lat_ordinate = DMSOrdinate("lat", self.text, self.pattern_family, slots=self.attributes())
588+
self.lon_ordinate = DMSOrdinate("lon", self.text, self.pattern_family, slots=self.attributes())
576589
self._make_coordinate()
577590
self.validate()

src/main/python/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
setup(
1616
name='opensextant',
17-
version='1.6.0',
17+
version='1.6.2',
1818

1919
description='OpenSextant APIs and Utilities',
2020
long_description=long_description,
Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
1-
21
from opensextant.extractors.xcoord import XCoord
32

43
tester = XCoord(debug=True)
54
# focused tests:
65
mgrs = "10 JAN 94"
76
matches = tester.extract(mgrs)
87
for m in matches:
9-
print (m, m.filtered_out)
8+
print(m, m.filtered_out)
9+
10+
dms = ["'18 51.1S 34 38.8W'",
11+
"08 00.4S 30 35.2W", # DM
12+
"08.00.4S 30.35.2W" # DMS pattern
13+
]
14+
for text in dms:
15+
matches = tester.extract(text)
16+
for m in matches:
17+
print(f"Text=[{m.text}]", m.pattern_id, "Filtered=", m.filtered_out, "Duplicate", m.is_duplicate)
1018

1119
results = XCoord(debug=True).default_tests()
1220
for res in results:
13-
print(res)
21+
print(res)

0 commit comments

Comments
 (0)