|
7 | 7 |
|
8 | 8 | use clap::{Arg, ArgAction, Command}; |
9 | 9 | use jiff::fmt::strtime; |
10 | | -use jiff::tz::TimeZone; |
| 10 | +use jiff::tz::{TimeZone, TimeZoneDatabase}; |
11 | 11 | use jiff::{Timestamp, Zoned}; |
12 | 12 | #[cfg(all(unix, not(target_os = "macos"), not(target_os = "redox")))] |
13 | 13 | use libc::clock_settime; |
14 | 14 | #[cfg(all(unix, not(target_os = "redox")))] |
15 | 15 | use libc::{CLOCK_REALTIME, clock_getres, timespec}; |
| 16 | +use std::collections::HashMap; |
16 | 17 | use std::fs::File; |
17 | 18 | use std::io::{BufRead, BufReader}; |
18 | 19 | use std::path::PathBuf; |
| 20 | +use std::sync::OnceLock; |
19 | 21 | use uucore::error::FromIo; |
20 | 22 | use uucore::error::{UResult, USimpleError}; |
21 | 23 | use uucore::translate; |
@@ -446,13 +448,136 @@ fn make_format_string(settings: &Settings) -> &str { |
446 | 448 | } |
447 | 449 | } |
448 | 450 |
|
/// Hand-picked `(abbreviation, IANA zone)` pairs for abbreviations that are
/// shared by several regions.
///
/// `build_tz_abbrev_map` seeds the lookup map with these pairs before it adds
/// any key derived from the time zone database, and derived keys never
/// overwrite an existing entry, so the choices below always win.
///
/// Readings chosen (intended to match GNU `date`):
/// - `CST`/`CDT`: US Central time, not China or Cuba
/// - `EST`/`EDT`: US Eastern time, not Australian Eastern time
/// - `MST`/`MDT`: US Mountain time, not Malaysia
/// - `PST`/`PDT`: US Pacific time
/// - `IST`: India, not Israel or Ireland
/// - `GMT`: treated as an alias for `UTC`
///
/// Any abbreviation missing from this table is resolved only if
/// `build_tz_abbrev_map` derives a matching key from a zone identifier.
static PREFERRED_TZ_MAPPINGS: &[(&str, &str)] = &[
    // Universal time; unambiguous but very commonly written.
    ("UTC", "UTC"),
    ("GMT", "UTC"),
    // United States zones, standard and daylight spellings.
    ("PST", "America/Los_Angeles"),
    ("PDT", "America/Los_Angeles"),
    ("MST", "America/Denver"),
    ("MDT", "America/Denver"),
    ("CST", "America/Chicago"), // also used by China and Cuba
    ("CDT", "America/Chicago"),
    ("EST", "America/New_York"), // also used by Australia
    ("EDT", "America/New_York"),
    // Other heavily overloaded abbreviations.
    ("IST", "Asia/Kolkata"), // also used by Israel and Ireland
];

/// Abbreviation -> IANA zone name lookup table, built at most once (on first
/// lookup) and shared for the rest of the process.
static TZ_ABBREV_CACHE: OnceLock<HashMap<String, String>> = OnceLock::new();
| 483 | + |
| 484 | +/// Build timezone abbreviation lookup map from IANA database. |
| 485 | +/// Uses preferred mappings for disambiguation, then searches all timezones. |
| 486 | +fn build_tz_abbrev_map() -> HashMap<String, String> { |
| 487 | + let mut map = HashMap::new(); |
| 488 | + |
| 489 | + // First, add preferred mappings (these take precedence) |
| 490 | + for (abbrev, iana) in PREFERRED_TZ_MAPPINGS { |
| 491 | + map.insert((*abbrev).to_string(), (*iana).to_string()); |
| 492 | + } |
| 493 | + |
| 494 | + // Then, try to find additional abbreviations from IANA database |
| 495 | + // This gives us broader coverage while respecting disambiguation preferences |
| 496 | + let tzdb = TimeZoneDatabase::from_env(); |
| 497 | + for tz_name in tzdb.available() { |
| 498 | + let tz_str = tz_name.as_str(); |
| 499 | + // Skip if we already have a preferred mapping for this zone |
| 500 | + if !map.values().any(|v| v == tz_str) { |
| 501 | + // For zones without preferred mappings, use last component as potential abbreviation |
| 502 | + // e.g., "Pacific/Fiji" could map to "FIJI" |
| 503 | + if let Some(last_part) = tz_str.split('/').next_back() { |
| 504 | + let potential_abbrev = last_part.to_uppercase(); |
| 505 | + // Only add if it looks like an abbreviation (2-5 uppercase chars) |
| 506 | + if potential_abbrev.len() >= 2 |
| 507 | + && potential_abbrev.len() <= 5 |
| 508 | + && potential_abbrev.chars().all(|c| c.is_ascii_uppercase()) |
| 509 | + { |
| 510 | + map.entry(potential_abbrev) |
| 511 | + .or_insert_with(|| tz_str.to_string()); |
| 512 | + } |
| 513 | + } |
| 514 | + } |
| 515 | + } |
| 516 | + |
| 517 | + map |
| 518 | +} |
| 519 | + |
| 520 | +/// Get IANA timezone name for a given abbreviation. |
| 521 | +/// Uses lazy-loaded cache with preferred mappings for disambiguation. |
| 522 | +fn tz_abbrev_to_iana(abbrev: &str) -> Option<&str> { |
| 523 | + let cache = TZ_ABBREV_CACHE.get_or_init(build_tz_abbrev_map); |
| 524 | + cache.get(abbrev).map(|s| s.as_str()) |
| 525 | +} |
| 526 | + |
| 527 | +/// Resolve timezone abbreviation in date string and replace with numeric offset. |
| 528 | +/// Returns the modified string with offset, or original if no abbreviation found. |
| 529 | +fn resolve_tz_abbreviation<S: AsRef<str>>(date_str: S) -> String { |
| 530 | + let s = date_str.as_ref(); |
| 531 | + |
| 532 | + // Look for timezone abbreviation at the end of the string |
| 533 | + // Pattern: ends with uppercase letters (2-5 chars) |
| 534 | + if let Some(last_word) = s.split_whitespace().last() { |
| 535 | + // Check if it's a potential timezone abbreviation (all uppercase, 2-5 chars) |
| 536 | + if last_word.len() >= 2 |
| 537 | + && last_word.len() <= 5 |
| 538 | + && last_word.chars().all(|c| c.is_ascii_uppercase()) |
| 539 | + { |
| 540 | + if let Some(iana_name) = tz_abbrev_to_iana(last_word) { |
| 541 | + // Try to get the timezone |
| 542 | + if let Ok(tz) = TimeZone::get(iana_name) { |
| 543 | + // Parse the date part (everything before the TZ abbreviation) |
| 544 | + let date_part = s.trim_end_matches(last_word).trim(); |
| 545 | + |
| 546 | + // Try to parse the date with UTC first to get timestamp |
| 547 | + let date_with_utc = format!("{date_part} +00:00"); |
| 548 | + if let Ok(parsed) = parse_datetime::parse_datetime(&date_with_utc) { |
| 549 | + // Create timestamp from parsed date |
| 550 | + if let Ok(ts) = Timestamp::new( |
| 551 | + parsed.timestamp(), |
| 552 | + parsed.timestamp_subsec_nanos() as i32, |
| 553 | + ) { |
| 554 | + // Get the offset for this specific timestamp in the target timezone |
| 555 | + let zoned = ts.to_zoned(tz); |
| 556 | + let offset_str = format!("{}", zoned.offset()); |
| 557 | + |
| 558 | + // Replace abbreviation with offset |
| 559 | + return format!("{date_part} {offset_str}"); |
| 560 | + } |
| 561 | + } |
| 562 | + } |
| 563 | + } |
| 564 | + } |
| 565 | + } |
| 566 | + |
| 567 | + // No abbreviation found or couldn't resolve, return original |
| 568 | + s.to_string() |
| 569 | +} |
| 570 | + |
449 | 571 | /// Parse a `String` into a `DateTime`. |
450 | 572 | /// If it fails, return a tuple of the `String` along with its `ParseError`. |
451 | 573 | // TODO: Convert `parse_datetime` to jiff and remove wrapper from chrono to jiff structures. |
452 | 574 | fn parse_date<S: AsRef<str> + Clone>( |
453 | 575 | s: S, |
454 | 576 | ) -> Result<Zoned, (String, parse_datetime::ParseDateTimeError)> { |
455 | | - match parse_datetime::parse_datetime(s.as_ref()) { |
| 577 | + // First, try to resolve any timezone abbreviations |
| 578 | + let resolved = resolve_tz_abbreviation(s.as_ref()); |
| 579 | + |
| 580 | + match parse_datetime::parse_datetime(&resolved) { |
456 | 581 | Ok(date) => { |
457 | 582 | let timestamp = |
458 | 583 | Timestamp::new(date.timestamp(), date.timestamp_subsec_nanos() as i32).unwrap(); |
|
0 commit comments