Skip to content

Commit 7795ba9

Browse files
fix: Squarespace timezone regex cannot match nested context JSON (#255)
The regex in PageVenueExtractor::extractTimezone() used `\{[^}]*"timeZone"`, which cannot cross any `}` boundary. Real Squarespace SQUARESPACE_CONTEXT JSON always nests multiple objects (betaFeatureFlags, rollups, merchandisingSettings, userAccountsSettings, …) before website.timeZone, so the pattern never matched live pages.

The cascade was: empty timezone → DateTimeParser::parseUtc() returned empty → event[startDate] empty → fragile description-text / AI-vision fallbacks → different runs produced different dates for the same show → off-by-one duplicates accumulated on the calendar (~2,245 upcoming-event dupes as of writing).

Verified against https://www.theroyalamerican.com/schedule:

    old regex /\{[^}]*"timeZone"…/s : NO MATCH
    new regex /\{.*?"timeZone"…/s : MATCHED America/New_York

Defense in depth: DateTimeParser::parseUtc() no longer silently returns an empty result when timezone is empty/invalid. It now falls back to the WP site timezone and emits a datamachine_log warning so the failure surfaces instead of cascading into date drift. All three call sites (BaseExtractor, EventImportHandler) benefit from this.

Fixes #254

Co-authored-by: homeboy-ci[bot] <266378653+homeboy-ci[bot]@users.noreply.github.com>
1 parent c22eecf commit 7795ba9

5 files changed

Lines changed: 9656 additions & 6 deletions

File tree

inc/Core/DateTimeParser.php

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ class DateTimeParser {
2727
* Use when API returns UTC times with a separate timezone field.
2828
* Example: Dice.fm returns "2026-01-04T02:30:00Z" with timezone "America/Chicago"
2929
*
30+
* When `$timezone` is empty or invalid, falls back to the site timezone
31+
* (`wp_timezone()`) and emits a `datamachine_log` warning. This prevents
32+
* upstream regex / extractor bugs from silently destroying the date and
33+
* cascading into off-by-one duplicates on the calendar. See #254.
34+
*
3035
* @param string $datetime UTC datetime string (e.g., "2026-01-04T02:30:00Z")
3136
* @param string $timezone Target IANA timezone (e.g., "America/Chicago")
3237
* @return array{date: string, time: string, timezone: string}
@@ -39,7 +44,18 @@ public static function parseUtc( string $datetime, string $timezone ): array {
3944
}
4045

4146
if ( ! self::isValidTimezone( $timezone ) ) {
42-
return $result;
47+
$fallback = self::siteTimezoneName();
48+
do_action(
49+
'datamachine_log',
50+
'warning',
51+
'DateTimeParser::parseUtc received empty/invalid timezone; falling back to site timezone',
52+
array(
53+
'datetime' => $datetime,
54+
'invalid_timezone' => $timezone,
55+
'fallback' => $fallback,
56+
)
57+
);
58+
$timezone = $fallback;
4359
}
4460

4561
try {
@@ -56,6 +72,32 @@ public static function parseUtc( string $datetime, string $timezone ): array {
5672
return $result;
5773
}
5874

75+
/**
 * Pick the timezone name used when a caller supplies no usable timezone.
 *
 * Returns the name reported by `wp_timezone()` when that function exists,
 * yields a DateTimeZone, and that zone has a non-empty name. In every other
 * case (WordPress not loaded, unexpected return value, exception) the
 * result is 'UTC', so the parser stays deterministic outside WordPress.
 *
 * NOTE(review): WordPress sites configured with a manual UTC offset may
 * report an offset-style name (e.g. "+05:30") rather than an IANA
 * identifier — confirm that callers and isValidTimezone() accept that.
 *
 * @return string Timezone name from `wp_timezone()`, or 'UTC'.
 */
private static function siteTimezoneName(): string {
	if ( ! function_exists( 'wp_timezone' ) ) {
		return 'UTC';
	}

	try {
		$site_zone = wp_timezone();
		$label     = $site_zone instanceof DateTimeZone ? $site_zone->getName() : '';
	} catch ( Exception $e ) {
		// Treat a failure exactly like an unusable zone: fall back to UTC below.
		$label = '';
	}

	return ! empty( $label ) ? $label : 'UTC';
}
100+
59101
/**
60102
* Parse local datetime that's already in venue timezone.
61103
*

inc/Steps/EventImport/Handlers/WebScraper/PageVenueExtractor.php

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,12 @@ public static function extractVenueName( string $html ): string {
381381
*/
382382
public static function extractTimezone( string $html ): string {
383383
// Squarespace context
384-
if ( preg_match( '/Static\.SQUARESPACE_CONTEXT\s*=\s*\{[^}]*"timeZone"\s*:\s*"([^"]+)"/s', $html, $matches ) ) {
384+
// Non-greedy .*? crosses nested {...} objects in SQUARESPACE_CONTEXT to reach
385+
// the first "timeZone" key (always inside the top-level `website` object). The
386+
// previous [^}]* form could not cross any `}` boundary, so it never matched
387+
// real Squarespace pages whose context JSON contains nested objects before
388+
// `website.timeZone` (e.g. betaFeatureFlags, rollups). See #254.
389+
if ( preg_match( '/Static\.SQUARESPACE_CONTEXT\s*=\s*\{.*?"timeZone"\s*:\s*"([^"]+)"/s', $html, $matches ) ) {
385390
return $matches[1];
386391
}
387392

tests/Fixtures/squarespace-royal-american.html

Lines changed: 9494 additions & 0 deletions
Large diffs are not rendered by default.

tests/Unit/DateTimeParserTest.php

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,28 @@ public function test_parse_utc_handles_different_timezones() {
3636
$this->assertEquals( '13:00', $la['time'] );
3737
}
3838

39-
public function test_parse_utc_returns_empty_for_invalid_timezone() {
39+
public function test_parse_utc_falls_back_to_site_timezone_for_invalid_timezone() {
40+
// Defense-in-depth fix from #254: invalid timezone must NOT silently destroy
41+
// the date — it must fall back to the site timezone so a missing venue
42+
// timezone cannot cascade into off-by-one duplicates.
4043
$result = DateTimeParser::parseUtc( '2026-01-15T18:00:00Z', 'Invalid/Timezone' );
4144

42-
$this->assertEquals( '', $result['date'] );
43-
$this->assertEquals( '', $result['time'] );
44-
$this->assertEquals( '', $result['timezone'] );
45+
$this->assertNotEmpty( $result['date'], 'parseUtc must not return an empty date when timezone is invalid' );
46+
$this->assertNotEmpty( $result['time'] );
47+
$this->assertNotEmpty( $result['timezone'], 'parseUtc must report the fallback timezone it used' );
48+
$this->assertTrue( DateTimeParser::isValidTimezone( $result['timezone'] ) );
49+
}
50+
51+
public function test_parse_utc_falls_back_to_site_timezone_for_empty_timezone() {
	// Royal American repro: a Squarespace show at 9pm Eastern lands on the
	// next calendar day in UTC. Before the fix an empty timezone made parseUtc
	// return empty, pushing the caller onto a fragile fallback path. Now
	// parseUtc substitutes the WP site timezone and still yields a stable date.
	$utc_start = '2026-05-16T01:00:00Z';
	$parsed    = DateTimeParser::parseUtc( $utc_start, '' );

	$this->assertNotEmpty( $parsed['date'], 'parseUtc must not silently drop the date when timezone is empty' );

	// The reported timezone must be the (valid) fallback that was actually used.
	$reported_zone = $parsed['timezone'];
	$this->assertNotEmpty( $reported_zone );
	$this->assertTrue( DateTimeParser::isValidTimezone( $reported_zone ) );
}
4662

4763
public function test_parse_utc_returns_empty_for_empty_datetime() {
@@ -50,6 +66,16 @@ public function test_parse_utc_returns_empty_for_empty_datetime() {
5066
$this->assertEquals( '', $result['date'] );
5167
}
5268

69+
public function test_parse_utc_valid_timezone_path_unchanged() {
	// Regression guard: an explicit, valid timezone must keep producing exactly
	// the output it produced before the site-timezone fallback existed.
	$expected = array(
		'date'     => '2026-05-15',
		'time'     => '21:00',
		'timezone' => 'America/New_York',
	);

	$actual = DateTimeParser::parseUtc( '2026-05-16T01:00:00Z', 'America/New_York' );

	// Checked field by field, in the order date, time, timezone.
	foreach ( $expected as $field => $value ) {
		$this->assertEquals( $value, $actual[ $field ] );
	}
}
78+
5379
public function test_parse_local_preserves_datetime() {
5480
$result = DateTimeParser::parseLocal( '2026-01-15', '19:30', 'America/Denver' );
5581

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
<?php
/**
 * PageVenueExtractor Tests
 *
 * Regression coverage for the Squarespace timezone regex fix (#254).
 *
 * @package DataMachineEvents\Tests\Unit
 */

namespace DataMachineEvents\Tests\Unit;

use WP_UnitTestCase;
use DataMachineEvents\Steps\EventImport\Handlers\WebScraper\PageVenueExtractor;

class PageVenueExtractorTest extends WP_UnitTestCase {

	/**
	 * Assert that extractTimezone() returns the expected value for a page.
	 *
	 * @param string $expected Timezone the extractor should return ('' for none).
	 * @param mixed  $html     Page markup handed to the extractor unchanged.
	 * @param string $message  Optional failure message.
	 */
	private function assertExtractedTimezone( $expected, $html, $message = '' ) {
		$this->assertEquals( $expected, PageVenueExtractor::extractTimezone( $html ), $message );
	}

	/**
	 * Nested SQUARESPACE_CONTEXT objects before `website.timeZone` must not block the match.
	 *
	 * The pre-#254 regex used `[^}]*`, which cannot cross any `}` boundary, so
	 * it never matched a real Squarespace page (the context JSON always has
	 * several nested objects ahead of `website.timeZone`). The markup below is
	 * the smallest nesting that reproduces that shape.
	 */
	public function test_extractTimezone_matches_nested_squarespace_context() {
		$page = implode(
			'',
			array(
				'<html><head><script>',
				'Static.SQUARESPACE_CONTEXT = {',
				'"betaFeatureFlags":{"foo":true,"bar":1},',
				'"rollups":{"x":{"y":1}},',
				'"website":{"timeZone":"America/New_York","other":"x"}',
				'};',
				'</script></head><body></body></html>',
			)
		);

		$this->assertExtractedTimezone(
			'America/New_York',
			$page,
			'Squarespace context with nested objects before website.timeZone must still match.'
		);
	}

	public function test_extractTimezone_matches_when_timezone_is_first_key() {
		// Flat shape: nothing is nested ahead of timeZone, and it must keep matching.
		$page = '<script>Static.SQUARESPACE_CONTEXT = {"website":{"timeZone":"America/Chicago"}};</script>';

		$this->assertExtractedTimezone( 'America/Chicago', $page );
	}

	public function test_extractTimezone_falls_back_to_generic_timezone_property() {
		// Without a SQUARESPACE_CONTEXT, the generic "timezone" JSON property
		// exposed by other platforms must still be picked up.
		$page = '<script>var config = {"timezone":"America/Denver"};</script>';

		$this->assertExtractedTimezone( 'America/Denver', $page );
	}

	public function test_extractTimezone_falls_back_to_meta_tag() {
		$page = '<html><head><meta name="timezone" content="Europe/London"></head></html>';

		$this->assertExtractedTimezone( 'Europe/London', $page );
	}

	public function test_extractTimezone_returns_empty_when_nothing_found() {
		$page = '<html><body>no timezone here</body></html>';

		$this->assertExtractedTimezone( '', $page );
	}

	public function test_extractTimezone_matches_royal_american_fixture() {
		// Snapshot of https://www.theroyalamerican.com/schedule (Squarespace 7.x)
		// taken during the #254 investigation. The full live HTML reproduces the
		// exact failure of the pre-fix regex; if this stops matching, the regex
		// has regressed.
		$snapshot_path = __DIR__ . '/../Fixtures/squarespace-royal-american.html';

		if ( ! file_exists( $snapshot_path ) ) {
			$this->markTestSkipped( 'Royal American fixture not present (optional snapshot).' );
		}

		$this->assertExtractedTimezone( 'America/New_York', file_get_contents( $snapshot_path ) );
	}
}

0 commit comments

Comments
 (0)