# -*- coding: utf-8 -*-
"""Extract dates and date ranges from German free-text date descriptions."""

import calendar
import datetime
import re


class DateExtractionException(ValueError):
    """Raised when a string cannot be interpreted as a date or date range.

    Derives from ValueError so that callers which caught the ValueError that
    ``datetime.date`` raises for impossible dates keep working, while callers
    catching DateExtractionException now see those failures as well.
    """


# Regex fragments.  ``_DATE`` captures day, month and an optional year
# (0-4 digits); ``_MIDDLE`` deliberately contains two groups (outer and
# inner), which the group numbers used in ``extract`` account for.
_DATE = r'(\d{1,2})\.(\d{1,2})\.(\d{0,4})'
_TIMERANGE = r'(\d{1,2}[\.\:]\d{1,2}\s*-\s*\d{1,2}[\.\:]\d{1,2}\s*(Uhr))?'
_MIDDLE = r'((Anfang|Mitte|Ende)?)'
_MONTH_NAMES = (
    'Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
    'August', 'September', 'Oktober', 'November', 'Dezember',
)
_MONTH = '(' + '|'.join(_MONTH_NAMES) + ')?'

# The patterns are tried in exactly this order by ``extract``; the order
# matters because several of them accept an optional prefix.
_SINGLE_RE = re.compile(r'^(am)?\s*' + _DATE + r'$')
_RANGE_RE = re.compile(
    r'^(von|ab|seit)?\s*' + _DATE + r'\s*(bis|-)\s*' + _DATE + r',?\s*(.*)$')
_SINCE_RE = re.compile(r'^(ab|seit)?\s*' + _DATE + r'$')
_UNTIL_RE = re.compile(r'^(bis)?\s*' + _DATE + r'$')
_SINGLE_TIME_RE = re.compile(
    r'^(am)?\s*' + _DATE + r'\s*,\s*' + _TIMERANGE + r'$')
_UNTIL_ESTIMATED_RE = re.compile(
    r'^(ab|seit)?\s*' + _DATE + r'\s*bis voraussichtlich\s*' + _MIDDLE
    + r'\s*' + _MONTH + r'\s*(\d{0,4})(.*)$')
_SINCE_ESTIMATED_RE = re.compile(
    r'^voraussichtlich ab\s*' + _MIDDLE + r'\s*' + _MONTH
    + r'\s*(\d{0,4})\s*bis\s*' + _DATE + r'$')


def _parts(day, month, year):
    """Turn the three captured strings of ``_DATE`` into (year, month, day).

    ``year`` becomes None when no year was written, so that a written year
    such as ``01`` can no longer be confused with "missing".
    """
    return (int(year) if year else None, int(month), int(day))


def _estimated_parts(year_text, month_name, position, start):
    """Translate a vague date ("Ende März 2016") into (year, month, day).

    ``position`` is 'Anfang', 'Mitte', 'Ende' or empty/None; ``start`` tells
    whether the date opens (True) or closes (False) a range and decides the
    default when no position is given.  A day of None stands for "last day
    of the month" and is resolved once the year is known, so that February
    gets 29 days in leap years.
    """
    year = int(year_text) if year_text else None
    if month_name:
        month = _MONTH_NAMES.index(month_name) + 1
        if position == 'Anfang':
            day = 1
        elif position == 'Mitte':
            day = 15
        elif position == 'Ende':
            day = None
        else:
            day = 1 if start else None
        return (year, month, day)
    # No month given: the position refers to the year as a whole.
    if position == 'Mitte':
        return (year, 7, 1)
    if position == 'Anfang' or (position != 'Ende' and start):
        return (year, 1, 1)
    return (year, 12, 31)


def _make_date(year, month, day):
    """Return the date, or None when it does not exist in the calendar.

    ``day`` may be None, meaning the last day of ``month`` in ``year``.
    """
    try:
        if day is None:
            day = calendar.monthrange(year, month)[1]
        return datetime.date(year, month, day)
    except ValueError:
        return None


def _resolve(since, until, today, raw):
    """Fill in missing or abbreviated years and return (since, until) dates.

    ``since`` and ``until`` are (year, month, day) tuples as produced by
    ``_parts`` / ``_estimated_parts``, or None for an open end.  ``raw`` is
    the original input, used as the exception argument.

    Rules:
    * Years below 100 are moved into the century of ``today``.
    * With only one bound, a missing year becomes ``today.year``.
    * With both bounds, a missing end year becomes this year or last year,
      whichever puts the end date closer to ``today`` (last year on a tie);
      a missing start year is taken from the end date, stepping back one
      year when the start would otherwise lie after the end.
    * If an explicitly dated start lies after the end, the end is moved
      forward by one year (once).
    """
    century = (today.year // 100) * 100

    def full_year(year):
        if year is not None and year < 100:
            return year + century
        return year

    def build(year, month, day):
        result = _make_date(year, month, day)
        if result is None:
            raise DateExtractionException(raw)
        return result

    def closest_to_today(month, day):
        # Candidates that do not exist (29 February in a non-leap year) are
        # skipped instead of aborting the whole extraction.
        options = []
        for year in (today.year - 1, today.year):
            candidate = _make_date(year, month, day)
            if candidate is not None:
                options.append((abs((candidate - today).days), year, candidate))
        if not options:
            raise DateExtractionException(raw)
        # Tuple ordering prefers the smaller distance, then the earlier year.
        return min(options)[2]

    if since is None or until is None:
        year, month, day = until if since is None else since
        year = full_year(year)
        resolved = build(today.year if year is None else year, month, day)
        return (None, resolved) if since is None else (resolved, None)

    since_year, since_month, since_day = since
    until_year, until_month, until_day = until
    since_year = full_year(since_year)
    until_year = full_year(until_year)

    if until_year is None:
        until_date = closest_to_today(until_month, until_day)
    else:
        until_date = build(until_year, until_month, until_day)

    if since_year is None:
        since_date = _make_date(until_date.year, since_month, since_day)
        if since_date is None or since_date > until_date:
            since_date = build(until_date.year - 1, since_month, since_day)
    else:
        since_date = build(since_year, since_month, since_day)
        if since_date > until_date:
            until_date = build(until_date.year + 1, until_month, until_day)

    return since_date, until_date


def extract(dateString, today=None):
    """Return a dict with the keys ``since``, ``until`` and optionally ``note``.

    Args:
        dateString: German date description, e.g. ``'am 10.05.2013'``,
            ``'von 10.05. bis 15.06.2013'``, ``'ab 10.05.'``,
            ``'bis 10.05.2013'``, ``'30.11.2014, 14.00 - 18.30 Uhr'`` or
            ``'06.08.2015 bis voraussichtlich Ende März 2016'``.
        today: reference date used to complete missing or two-digit years;
            defaults to ``datetime.date.today()``.  Pass it explicitly to get
            reproducible results.

    Returns:
        ``{'since': date or None, 'until': date or None}``; ``note`` is added
        for a matched time range (the time range text) and for estimated
        start/end dates.

    Raises:
        DateExtractionException: if the string matches none of the supported
            forms or names a date that does not exist.

    >>> today = datetime.date(2015, 1, 15)
    >>> r = extract('am 10.05.2013', today)
    >>> r['since'], r['until']
    (datetime.date(2013, 5, 10), datetime.date(2013, 5, 10))
    >>> r = extract('1.5.', today)
    >>> r['since'], r['until']
    (datetime.date(2015, 5, 1), datetime.date(2015, 5, 1))
    >>> r = extract('10.05.13 - 15.06.13', today)
    >>> r['since'], r['until']
    (datetime.date(2013, 5, 10), datetime.date(2013, 6, 15))
    >>> r = extract('10.10.2014 - 15.02.2014', today)
    >>> r['since'], r['until']
    (datetime.date(2014, 10, 10), datetime.date(2015, 2, 15))
    >>> r = extract('10.10. - 15.02.2015', today)
    >>> r['since'], r['until']
    (datetime.date(2014, 10, 10), datetime.date(2015, 2, 15))
    >>> r = extract('ab 10.05.', today)
    >>> r['since'], r['until']
    (datetime.date(2015, 5, 10), None)
    >>> r = extract('bis 10.05.', today)
    >>> r['since'], r['until']
    (None, datetime.date(2015, 5, 10))
    >>> r = extract('30.11.2014, 14.00 - 18.30 Uhr', today)
    >>> r['since'], r['until'], r['note']
    (datetime.date(2014, 11, 30), datetime.date(2014, 11, 30), '14.00 - 18.30 Uhr')
    >>> r = extract('voraussichtlich ab Anfang 2015 bis 06.08.2016', today)
    >>> r['since'], r['until'], r['note']
    (datetime.date(2015, 1, 1), datetime.date(2016, 8, 6), 'voraussichtliches Startdatum')
    >>> r = extract('06.08.2015 bis voraussichtlich November 2016', today)
    >>> r['since'], r['until'], r['note']
    (datetime.date(2015, 8, 6), datetime.date(2016, 11, 30), 'voraussichtliches Enddatum')
    >>> r = extract('06.08.2015 bis voraussichtlich 2016', today)
    >>> r['since'], r['until']
    (datetime.date(2015, 8, 6), datetime.date(2016, 12, 31))
    """
    if today is None:
        today = datetime.date.today()

    def result(since, until, note=None):
        since_date, until_date = _resolve(since, until, today, dateString)
        data = {'since': since_date, 'until': until_date}
        if note is not None:
            data['note'] = note
        return data

    found = _SINGLE_RE.match(dateString)
    if found:
        day = _parts(*found.group(2, 3, 4))
        return result(day, day)

    found = _RANGE_RE.match(dateString)
    if found:
        return result(_parts(*found.group(2, 3, 4)),
                      _parts(*found.group(6, 7, 8)))

    found = _SINCE_RE.match(dateString)
    if found:
        return result(_parts(*found.group(2, 3, 4)), None)

    found = _UNTIL_RE.match(dateString)
    if found:
        return result(None, _parts(*found.group(2, 3, 4)))

    found = _SINGLE_TIME_RE.match(dateString)
    if found:
        day = _parts(*found.group(2, 3, 4))
        # Group 5 is None when only a trailing comma followed the date.
        return result(day, day, found.group(5))

    found = _UNTIL_ESTIMATED_RE.match(dateString)
    if found:
        until = _estimated_parts(
            found.group(8), found.group(7), found.group(6), False)
        return result(_parts(*found.group(2, 3, 4)), until,
                      'voraussichtliches Enddatum')

    found = _SINCE_ESTIMATED_RE.match(dateString)
    if found:
        since = _estimated_parts(
            found.group(4), found.group(3), found.group(2), True)
        return result(since, _parts(*found.group(5, 6, 7)),
                      'voraussichtliches Startdatum')

    raise DateExtractionException(dateString)


if __name__ == "__main__":
    import doctest
    doctest.testmod()
    # NOTE: the year-less cases in date.py.test expect results in 2015 and
    # call extract() without ``today``, so they only pass when run in 2015.
    doctest.testfile("date.py.test")