Skip to content

Commit ea7b4f8

Browse files
committed
fix(reviews): stabilize DOM extractor and capture full text
- Lead with structural/semantic anchors (data-review-id, ARIA roles, contributor URLs) before obfuscated class names, several of which (.CDe7pd, .wiI7pd.xwPlne, .WMbnJf, .bwb7ce, .dehysf) have rotated.
- Detect owner response via localized "Response from the owner" header regex (en/fr/es/de/it/pt/nl/pl/da/sv/tr) instead of class selectors.
- Click "More"/"Voir plus" expanders globally and await 800ms for the animated DOM update before reading textContent. Synchronous reads were silently capturing truncated text for long reviews/replies.
- Dedupe nested empty <div data-review-id> markers that double-counted every review.
- Rating extraction adds plain-text "N/5" fallback for the new format Google now uses; star aria-label remains primary.
- Time extraction uses regex with negative lookbehind so a rating glued to the time string ("4/53 months ago") no longer poisons the match.
- Fix uninitialized patterns map in extractPlaceID that would panic on the first call.
- Add ReviewId field to Review struct, populated from RPC JSON-array paths when available.
1 parent 2cd5745 commit ea7b4f8

2 files changed

Lines changed: 207 additions & 69 deletions

File tree

gmaps/entry.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ type Review struct {
5757
When string
5858
OwnerResponse string
5959
OwnerResponseTime string
60+
ReviewId string
6061
}
6162

6263
type Entry struct {
@@ -467,6 +468,33 @@ func parseReviews(reviewsI []any) []Review {
467468
}
468469
}
469470

471+
// Extract Review ID - Google stores contribution IDs at various paths.
472+
// The ID is a base64-encoded string starting with "Ch" (e.g., ChZDSUhNMG9nS0VJQ0FnSURzekpXdGNREAE).
473+
reviewId := ""
474+
reviewIdCandidatePaths := [][]int{
475+
{4}, // [el][4] - common location
476+
{0, 4}, // [el][0][4]
477+
{1, 0, 4}, // author info area
478+
{1, 4, 0}, // [1][4][0]
479+
{1, 4, 6}, // [1][4][6]
480+
{1, 4, 3}, // [1][4][3]
481+
{1, 0}, // contribution reference
482+
{2, 2, 0, 1, 0}, // timestamp/metadata area
483+
}
484+
for _, path := range reviewIdCandidatePaths {
485+
candidate := getNthElementAndCast[string](el, path...)
486+
if strings.HasPrefix(candidate, "Ch") {
487+
reviewId = candidate
488+
break
489+
}
490+
}
491+
if reviewId == "" {
492+
candidate := getNthElementAndCast[string](reviewsI, i, 4)
493+
if strings.HasPrefix(candidate, "Ch") {
494+
reviewId = candidate
495+
}
496+
}
497+
470498
// Try multiple paths for the timestamp
471499
time := getNthElementAndCast[[]any](el, 2, 2, 0, 1, 21, 6, 8)
472500
if len(time) == 0 {
@@ -528,6 +556,7 @@ func parseReviews(reviewsI []any) []Review {
528556
Description: description,
529557
OwnerResponse: ownerResponse,
530558
OwnerResponseTime: ownerResponseTime,
559+
ReviewId: reviewId,
531560
}
532561

533562
if review.Name == "" {

gmaps/reviews.go

Lines changed: 178 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,9 @@ const hexMatchPattern = `0x[0-9a-fA-F]+:0x[0-9a-fA-F]+` // Hex format place ID
198198
// extractPlaceID extracts the place ID from various Google Maps URL formats
199199
func extractPlaceID(mapURL string) (string, error) {
200200
patternsOnce.Do(func() {
201+
// Initialize the map
202+
patterns = make(map[string]*regexp.Regexp)
203+
201204
// Try multiple patterns for extracting place ID
202205
avail := []string{
203206
`!1s([^!]+)`, // Standard format: !1s0x...
@@ -429,27 +432,50 @@ func extractReviewsFromPage(ctx context.Context, page scrapemate.BrowserPage) ([
429432
}
430433

431434
// Extract reviews from the DOM - updated for Dec 2025 Google Maps structure
432-
reviewsJSON, err := page.Eval(`() => {
435+
reviewsJSON, err := page.Eval(`async () => {
433436
try {
434437
const reviews = [];
435438
436-
// Try multiple selectors for review container elements
437-
// Google Maps uses various class names that change over time
439+
// Pre-pass: click EVERY "More"/"Voir plus"/"Mehr"/etc. button across
440+
// the reviews panel to expand both review text AND owner responses.
441+
// Google's expansion is animated/async — clicking and reading in the
442+
// same synchronous tick reads the still-truncated text. We click all
443+
// expanders first, then wait 800 ms for the DOM to settle, then read.
444+
document.querySelectorAll(
445+
'button[aria-expanded="false"], button[aria-label*="More" i], button[jsaction*="expand"], .w8nwRe'
446+
).forEach(b => { try { b.click(); } catch(e) {} });
447+
await new Promise(r => setTimeout(r, 800));
448+
449+
// Selector priority: stable structural/semantic anchors first
450+
// (data-attributes, aria-*, role) then class names as fallback,
451+
// since Google's obfuscated class names rotate frequently.
438452
const reviewSelectors = [
439-
'.jftiEf', // Common review container
440-
'div[data-review-id]', // Review with ID attribute
441-
'.gws-localreviews__google-review', // Alternative format
453+
'div[data-review-id]', // STABLE: review id attribute
454+
'[jsaction*="review"]', // STABLE: jsaction-bound review block
455+
'.jftiEf', // Class fallback
456+
'.gws-localreviews__google-review', // Class fallback
442457
'[data-hveid] .review-dialog-list > div', // Search results reviews
443-
'.WMbnJf', // Another review container
444-
'.bwb7ce', // New review format
458+
'.WMbnJf', // Class fallback
459+
'.bwb7ce', // Class fallback
445460
];
446461
447462
let reviewElements = [];
448463
for (const selector of reviewSelectors) {
449464
const elements = document.querySelectorAll(selector);
450465
if (elements && elements.length > 0) {
451466
reviewElements = Array.from(elements);
452-
console.log('Found reviews with selector:', selector, 'count:', elements.length);
467+
// Dedupe: Google nests an empty <div data-review-id="..."> inside
468+
// each outer review wrapper, which double-counts. Drop any element
469+
// whose ancestor matches the same selector.
470+
reviewElements = reviewElements.filter(el => {
471+
let p = el.parentElement;
472+
while (p) {
473+
if (p.matches && p.matches(selector)) return false;
474+
p = p.parentElement;
475+
}
476+
return true;
477+
});
478+
console.log('Found reviews with selector:', selector, 'count:', reviewElements.length);
453479
break;
454480
}
455481
}
@@ -471,33 +497,45 @@ func extractReviewsFromPage(ctx context.Context, page scrapemate.BrowserPage) ([
471497
472498
for (const element of reviewElements) {
473499
try {
474-
// Author name - comprehensive selectors
500+
// Author name - lead with semantic anchors (contributor links,
501+
// aria-label on buttons), fall back to class names.
475502
const userSelectors = [
476-
'.d4r55', // Primary name class
477-
'.WNxzHc', // Alternative name
478-
'.TSUbDb a', // Link with name
479-
'.review-author', // Generic
480-
'button.al6Kxe', // Clickable name
481-
'.bHrnEe', // Another name container
503+
'a[href*="/maps/contrib/"]', // STABLE: Maps contributor profile URL
504+
'button[data-href*="/contrib/"]', // STABLE: contributor button
505+
'[aria-label*="Photo of"]', // STABLE: localized "Photo of <name>"
506+
'.d4r55', // Class fallback
507+
'.WNxzHc', // Class fallback
508+
'.TSUbDb a', // Class fallback
509+
'.review-author',
510+
'button.al6Kxe',
511+
'.bHrnEe',
482512
];
483513
let userName = '';
484514
let userUrl = '';
485515
for (const sel of userSelectors) {
486516
const el = element.querySelector(sel);
487517
if (el) {
488518
userName = el.textContent?.trim() || '';
519+
// Strip localized "Photo of " prefix if pulled from aria-label.
520+
const aria = el.getAttribute('aria-label') || '';
521+
if (!userName && aria) {
522+
userName = aria.replace(/^[^:]*\s+(?:of|de|von|di|do|da)\s+/i, '').trim();
523+
}
489524
if (el.tagName?.toLowerCase() === 'a') {
490525
userUrl = el.getAttribute('href') || '';
526+
} else if (el.getAttribute('data-href')) {
527+
userUrl = el.getAttribute('data-href') || '';
491528
}
492529
if (userName) break;
493530
}
494531
}
495532
496-
// Profile picture - multiple patterns
533+
// Profile picture - lead with src host (stable hostnames)
534+
// before class names.
497535
const profilePicSelectors = [
498-
'.NBa7we',
499-
'img[src*="googleusercontent"]',
500-
'img[src*="lh3.google"]',
536+
'img[src*="googleusercontent"]', // STABLE: Google CDN host
537+
'img[src*="lh3.google"]', // STABLE: Google CDN host
538+
'.NBa7we', // Class fallback
501539
'.review-author-photo img',
502540
];
503541
let profilePic = '';
@@ -509,66 +547,87 @@ func extractReviewsFromPage(ctx context.Context, page scrapemate.BrowserPage) ([
509547
}
510548
}
511549
512-
// Rating - try multiple approaches
550+
// Rating - try aria-label first (older format / a11y tools),
551+
// then text pattern N/5 which Google now renders as plain text
552+
// inside the metadata block (e.g. "4/5", "5/5").
513553
let rating = 0;
514554
const ratingSelectors = [
555+
'[role="img"][aria-label*="star" i]',
556+
'[role="img"][aria-label*="étoile" i]',
557+
'[role="img"][aria-label*="estrella" i]',
558+
'[role="img"][aria-label*="stern" i]',
559+
'[role="img"][aria-label]',
560+
'span[aria-label*="star" i]',
515561
'.kvMYJc',
516562
'.DU9Pgb span[aria-label]',
517-
'[role="img"][aria-label*="star"]',
518563
'.pjemBf span',
519564
'.review-score',
520565
];
521566
for (const sel of ratingSelectors) {
522567
const ratingEl = element.querySelector(sel);
523568
if (ratingEl) {
524569
const ariaLabel = ratingEl.getAttribute('aria-label') || '';
525-
// Match patterns like "5 stars", "Rated 4 out of 5", "4.5 étoiles"
526570
const match = ariaLabel.match(/(\d+(?:\.\d+)?)/);
527-
if (match) {
528-
rating = Math.round(parseFloat(match[1])) || 0;
529-
break;
530-
}
531-
// Also try counting filled stars
532-
const filledStars = element.querySelectorAll('.hCCjke.vzX5Ic, [aria-label*="star"][style*="color"]').length;
533-
if (filledStars > 0) {
534-
rating = filledStars;
535-
break;
536-
}
571+
if (match) { rating = Math.round(parseFloat(match[1])) || 0; break; }
572+
}
573+
}
574+
// Text-format fallback: the modern review card renders the rating
575+
// as "4/5" in plain text. Scan short text nodes for that pattern.
576+
if (rating === 0) {
577+
const textNodes = element.querySelectorAll('span, div');
578+
for (const n of textNodes) {
579+
const t = (n.textContent || '').trim();
580+
if (t.length > 8) continue;
581+
const m = t.match(/^(\d+(?:\.\d+)?)\s*\/\s*5$/);
582+
if (m) { rating = Math.round(parseFloat(m[1])) || 0; break; }
537583
}
538584
}
539585
540-
// Time/date - multiple selectors
541-
const timeSelectors = ['.rsqaWe', '.DU9Pgb', '.tTVLSc', '.review-date', '.dehysf'];
586+
// Time/date: extract a relative-time substring like "3 months ago"
587+
// from anywhere in the review's text content. Using a regex
588+
// extract (not a node-text match) sidesteps the problem that
589+
// Google's rating + time render as sibling spans inside one
590+
// parent whose concatenated textContent looks like
591+
// "4/53 months ago on Google" with no separator.
592+
// Negative lookbehind rejects digits/slash before the number,
593+
// otherwise "4/53 months ago" (rating concatenated with time)
594+
// would match "53 months ago". Also covers French ("il y a 2 mois").
595+
const timeExtractRegex = /(?<![\d\/])(\d+\s+(?:year|month|week|day|hour|minute)s?\s+ago|a\s+(?:year|month|week|day|hour|minute)\s+ago|just\s+now|yesterday|today|il\s+y\s+a\s+(?:un|une|\d+)\s+(?:an|mois|semaine|jour|heure|minute)s?)\b/i;
542596
let relativeTime = '';
543-
for (const sel of timeSelectors) {
544-
const el = element.querySelector(sel);
545-
if (el) {
546-
const text = el.textContent?.trim() || '';
547-
// Look for time-related text (ago, month, year, etc)
548-
if (text && (text.includes('ago') || text.includes('week') || text.includes('month') ||
549-
text.includes('year') || text.includes('day') || text.match(/\d{4}/))) {
550-
relativeTime = text;
551-
break;
597+
const timeNodes = element.querySelectorAll('span, div');
598+
for (const n of timeNodes) {
599+
const text = (n.textContent || '').trim();
600+
if (!text || text.length > 200) continue;
601+
const m = text.match(timeExtractRegex);
602+
if (m) { relativeTime = m[0]; break; }
603+
}
604+
// Class-based fallbacks if text scan missed it.
605+
if (!relativeTime) {
606+
const timeSelectors = ['.rsqaWe', '.DU9Pgb', '.tTVLSc', '.review-date', '.dehysf'];
607+
for (const sel of timeSelectors) {
608+
const el = element.querySelector(sel);
609+
if (el) {
610+
const text = (el.textContent || '').trim();
611+
const m = text.match(timeExtractRegex);
612+
if (m) { relativeTime = m[0]; break; }
552613
}
553614
}
554615
}
555616
556-
// Review text - try to expand and get full text
617+
// Review text - try to expand and get full text. Lead with
618+
// data-* anchors then class names.
557619
const textSelectors = [
558-
'.wiI7pd',
620+
'[data-expandable-section] span', // STABLE: data attribute
621+
'span[jsname]', // STABLE: jsname binding
622+
'.wiI7pd', // Class fallback (long-lived)
559623
'.MyEned span',
560624
'.review-full-text',
561625
'.Jtu6Td span',
562-
'[data-expandable-section] span',
563626
];
564627
let text = '';
565628
566-
// First try to click "More" button to expand text
567-
const moreButtons = element.querySelectorAll('.w8nwRe, button[aria-label*="More"], button[aria-expanded="false"]');
568-
for (const btn of moreButtons) {
569-
try { btn.click(); } catch(e) {}
570-
}
571-
629+
// "More" buttons were already clicked + awaited in the pre-pass
630+
// at the top of this function, so text is already expanded.
572631
for (const sel of textSelectors) {
573632
const textEl = element.querySelector(sel);
574633
if (textEl) {
@@ -587,29 +646,79 @@ func extractReviewsFromPage(ctx context.Context, page scrapemate.BrowserPage) ([
587646
}
588647
}
589648
590-
// Owner response - look for response container within the review
649+
// Owner response - prefer text-content / aria detection
650+
// over class names (most fragile selectors in this script).
651+
// Google localizes the "Response from the owner" header
652+
// across languages but always renders it as a header element
653+
// inside the review block.
591654
let ownerResponse = '';
592655
let ownerResponseTime = '';
593-
const responseSelectors = [
594-
'.CDe7pd', // Owner response container
595-
'.wiI7pd.xwPlne', // Alternative response text
596-
'.review-response', // Generic
597-
'.owner-response', // Generic
598-
];
599-
for (const sel of responseSelectors) {
600-
const responseEl = element.querySelector(sel);
601-
if (responseEl) {
602-
ownerResponse = responseEl.textContent?.trim() || '';
603-
// Try to find response time nearby
604-
const responseTimeEl = responseEl.closest('.review-response-container')?.querySelector('.rsqaWe') ||
605-
responseEl.parentElement?.querySelector('.rsqaWe, .dehysf');
606-
if (responseTimeEl) {
607-
ownerResponseTime = responseTimeEl.textContent?.trim() || '';
656+
657+
// Strategy 1 (semantic): find a header-like element whose text
658+
// matches a localized "Response from the owner" prefix, then
659+
// the response body is its sibling/next text node.
660+
const ownerHeaderRegex = /^(response from the owner|owner['']s reply|response from owner|réponse du propriétaire|respuesta del propietario|antwort des inhabers|risposta del proprietario|resposta do proprietário|antwoord van de eigenaar|odpowiedź właściciela|svar fra ejeren|ägarens svar|sahibinden yanıt)/i;
661+
const candidateHeaders = element.querySelectorAll('div, span');
662+
for (const h of candidateHeaders) {
663+
const t = (h.textContent || '').trim();
664+
// Header is short — full reply is in a different element.
665+
if (t.length > 0 && t.length < 80 && ownerHeaderRegex.test(t)) {
666+
// The response body is a sibling text container after the header.
667+
const container = h.parentElement;
668+
if (container) {
669+
// Pick the longest text descendant of the container that
670+
// isn't the header itself — that's the response body.
671+
let best = '';
672+
container.querySelectorAll('span, div').forEach(c => {
673+
if (c === h) return;
674+
const tt = (c.textContent || '').trim();
675+
if (tt.length > best.length && !ownerHeaderRegex.test(tt) && tt !== text) {
676+
best = tt;
677+
}
678+
});
679+
ownerResponse = best;
680+
}
681+
// Response time: extract via regex anywhere in the header
682+
// or its sibling nodes. The header text often looks like
683+
// "Response from the owner 3 months ago".
684+
const respTimeExtract = /(?<![\d\/])(\d+\s+(?:year|month|week|day|hour|minute)s?\s+ago|a\s+(?:year|month|week|day|hour|minute)\s+ago|il\s+y\s+a\s+(?:un|une|\d+)\s+(?:an|mois|semaine|jour|heure|minute)s?)\b/i;
685+
const headerText = (h.textContent || '').trim();
686+
const headerMatch = headerText.match(respTimeExtract);
687+
if (headerMatch) ownerResponseTime = headerMatch[0];
688+
if (!ownerResponseTime && h.parentElement) {
689+
for (const c of h.parentElement.children) {
690+
if (c === h) continue;
691+
const tt = (c.textContent || '').trim();
692+
const m = tt.match(respTimeExtract);
693+
if (m) { ownerResponseTime = m[0]; break; }
694+
}
608695
}
609696
if (ownerResponse) break;
610697
}
611698
}
612699
700+
// Strategy 2 (class fallback): class-based selectors.
701+
if (!ownerResponse) {
702+
const responseSelectors = [
703+
'.CDe7pd', // Owner response container
704+
'.wiI7pd.xwPlne', // Alternative response text
705+
'.review-response',
706+
'.owner-response',
707+
];
708+
for (const sel of responseSelectors) {
709+
const responseEl = element.querySelector(sel);
710+
if (responseEl) {
711+
ownerResponse = responseEl.textContent?.trim() || '';
712+
const responseTimeEl = responseEl.closest('.review-response-container')?.querySelector('.rsqaWe') ||
713+
responseEl.parentElement?.querySelector('.rsqaWe, .dehysf');
714+
if (responseTimeEl) {
715+
ownerResponseTime = responseTimeEl.textContent?.trim() || '';
716+
}
717+
if (ownerResponse) break;
718+
}
719+
}
720+
}
721+
613722
if (userName && (text || rating > 0)) {
614723
reviews.push({
615724
author_name: userName,

0 commit comments

Comments
 (0)