@@ -198,6 +198,9 @@ const hexMatchPattern = `0x[0-9a-fA-F]+:0x[0-9a-fA-F]+` // Hex format place ID
198198// extractPlaceID extracts the place ID from various Google Maps URL formats
199199func extractPlaceID (mapURL string ) (string , error ) {
200200 patternsOnce .Do (func () {
201+ // Initialize the map
202+ patterns = make (map [string ]* regexp.Regexp )
203+
201204 // Try multiple patterns for extracting place ID
202205 avail := []string {
203206 `!1s([^!]+)` , // Standard format: !1s0x...
@@ -429,27 +432,50 @@ func extractReviewsFromPage(ctx context.Context, page scrapemate.BrowserPage) ([
429432 }
430433
431434 // Extract reviews from the DOM - updated for Dec 2025 Google Maps structure
432- reviewsJSON , err := page .Eval (`() => {
435+ reviewsJSON , err := page .Eval (`async () => {
433436 try {
434437 const reviews = [];
435438
436- // Try multiple selectors for review container elements
437- // Google Maps uses various class names that change over time
439+ // Pre-pass: click EVERY "More"/"Voir plus"/"Mehr"/etc. button across
440+ // the reviews panel to expand both review text AND owner responses.
441+ // Google's expansion is animated/async — clicking and reading in the
442+ // same synchronous tick reads the still-truncated text. We click all
443+ // expanders first, then wait a fixed 800 ms timeout, then read.
444+ document.querySelectorAll(
445+ 'button[aria-expanded="false"], button[aria-label*="More" i], button[jsaction*="expand"], .w8nwRe'
446+ ).forEach(b => { try { b.click(); } catch(e) {} });
447+ await new Promise(r => setTimeout(r, 800));
448+
449+ // Selector priority: stable structural/semantic anchors first
450+ // (data-attributes, aria-*, role) then class names as fallback,
451+ // since Google's obfuscated class names rotate frequently.
438452 const reviewSelectors = [
439- '.jftiEf', // Common review container
440- 'div[data-review-id]', // Review with ID attribute
441- '.gws-localreviews__google-review', // Alternative format
453+ 'div[data-review-id]', // STABLE: review id attribute
454+ '[jsaction*="review"]', // STABLE: jsaction-bound review block
455+ '.jftiEf', // Class fallback
456+ '.gws-localreviews__google-review', // Class fallback
442457 '[data-hveid] .review-dialog-list > div', // Search results reviews
443- '.WMbnJf', // Another review container
444- '.bwb7ce', // New review format
458+ '.WMbnJf', // Class fallback
459+ '.bwb7ce', // Class fallback
445460 ];
446461
447462 let reviewElements = [];
448463 for (const selector of reviewSelectors) {
449464 const elements = document.querySelectorAll(selector);
450465 if (elements && elements.length > 0) {
451466 reviewElements = Array.from(elements);
452- console.log('Found reviews with selector:', selector, 'count:', elements.length);
467+ // Dedupe: Google nests an empty <div data-review-id="..."> inside
468+ // each outer review wrapper, which double-counts. Drop any element
469+ // whose ancestor matches the same selector.
470+ reviewElements = reviewElements.filter(el => {
471+ let p = el.parentElement;
472+ while (p) {
473+ if (p.matches && p.matches(selector)) return false;
474+ p = p.parentElement;
475+ }
476+ return true;
477+ });
478+ console.log('Found reviews with selector:', selector, 'count:', reviewElements.length);
453479 break;
454480 }
455481 }
@@ -471,33 +497,45 @@ func extractReviewsFromPage(ctx context.Context, page scrapemate.BrowserPage) ([
471497
472498 for (const element of reviewElements) {
473499 try {
474- // Author name - comprehensive selectors
500+ // Author name - lead with semantic anchors (contributor links,
501+ // aria-label on buttons), fall back to class names.
475502 const userSelectors = [
476- '.d4r55', // Primary name class
477- '.WNxzHc', // Alternative name
478- '.TSUbDb a', // Link with name
479- '.review-author', // Generic
480- 'button.al6Kxe', // Clickable name
481- '.bHrnEe', // Another name container
503+ 'a[href*="/maps/contrib/"]', // STABLE: Maps contributor profile URL
504+ 'button[data-href*="/contrib/"]', // STABLE: contributor button
505+ '[aria-label*="Photo of"]', // English UI only: matches aria-label "Photo of <name>"
506+ '.d4r55', // Class fallback
507+ '.WNxzHc', // Class fallback
508+ '.TSUbDb a', // Class fallback
509+ '.review-author',
510+ 'button.al6Kxe',
511+ '.bHrnEe',
482512 ];
483513 let userName = '';
484514 let userUrl = '';
485515 for (const sel of userSelectors) {
486516 const el = element.querySelector(sel);
487517 if (el) {
488518 userName = el.textContent?.trim() || '';
519+ // Strip localized "Photo of " prefix if pulled from aria-label.
520+ const aria = el.getAttribute('aria-label') || '';
521+ if (!userName && aria) {
522+ userName = aria.replace(/^[^:]*\s+(?:of|de|von|di|do|da)\s+/i, '').trim();
523+ }
489524 if (el.tagName?.toLowerCase() === 'a') {
490525 userUrl = el.getAttribute('href') || '';
526+ } else if (el.getAttribute('data-href')) {
527+ userUrl = el.getAttribute('data-href') || '';
491528 }
492529 if (userName) break;
493530 }
494531 }
495532
496- // Profile picture - multiple patterns
533+ // Profile picture - lead with src host (stable hostnames)
534+ // before class names.
497535 const profilePicSelectors = [
498- '.NBa7we',
499- 'img[src*="googleusercontent "]',
500- 'img[src*="lh3.google"]',
536+ 'img[src*="googleusercontent"]', // STABLE: Google CDN host
537+ 'img[src*="lh3.google "]', // STABLE: Google CDN host
538+ '.NBa7we', // Class fallback
501539 '.review-author-photo img',
502540 ];
503541 let profilePic = '';
@@ -509,66 +547,87 @@ func extractReviewsFromPage(ctx context.Context, page scrapemate.BrowserPage) ([
509547 }
510548 }
511549
512- // Rating - try multiple approaches
550+ // Rating - try aria-label first (older format / a11y tools),
551+ // then text pattern N/5 which Google now renders as plain text
552+ // inside the metadata block (e.g. "4/5", "5/5").
513553 let rating = 0;
514554 const ratingSelectors = [
555+ '[role="img"][aria-label*="star" i]',
556+ '[role="img"][aria-label*="étoile" i]',
557+ '[role="img"][aria-label*="estrella" i]',
558+ '[role="img"][aria-label*="stern" i]',
559+ '[role="img"][aria-label]',
560+ 'span[aria-label*="star" i]',
515561 '.kvMYJc',
516562 '.DU9Pgb span[aria-label]',
517- '[role="img"][aria-label*="star"]',
518563 '.pjemBf span',
519564 '.review-score',
520565 ];
521566 for (const sel of ratingSelectors) {
522567 const ratingEl = element.querySelector(sel);
523568 if (ratingEl) {
524569 const ariaLabel = ratingEl.getAttribute('aria-label') || '';
525- // Match patterns like "5 stars", "Rated 4 out of 5", "4.5 étoiles"
526570 const match = ariaLabel.match(/(\d+(?:\.\d+)?)/);
527- if (match) {
528- rating = Math.round(parseFloat(match[1])) || 0;
529- break;
530- }
531- // Also try counting filled stars
532- const filledStars = element.querySelectorAll('.hCCjke.vzX5Ic, [aria-label*="star"][style*="color"]').length;
533- if (filledStars > 0) {
534- rating = filledStars;
535- break;
536- }
571+ if (match) { rating = Math.round(parseFloat(match[1])) || 0; break; }
572+ }
573+ }
574+ // Text-format fallback: the modern review card renders the rating
575+ // as "4/5" in plain text. Scan short text nodes for that pattern.
576+ if (rating === 0) {
577+ const textNodes = element.querySelectorAll('span, div');
578+ for (const n of textNodes) {
579+ const t = (n.textContent || '').trim();
580+ if (t.length > 8) continue;
581+ const m = t.match(/^(\d+(?:\.\d+)?)\s*\/\s*5$/);
582+ if (m) { rating = Math.round(parseFloat(m[1])) || 0; break; }
537583 }
538584 }
539585
540- // Time/date - multiple selectors
541- const timeSelectors = ['.rsqaWe', '.DU9Pgb', '.tTVLSc', '.review-date', '.dehysf'];
586+ // Time/date: extract a relative-time substring like "3 months ago"
587+ // from anywhere in the review's text content. Using a regex
588+ // extract (not a node-text match) sidesteps the problem that
589+ // Google's rating + time render as sibling spans inside one
590+ // parent whose concatenated textContent looks like
591+ // "4/53 months ago on Google" with no separator.
592+ // Negative lookbehind rejects digits/slash before the number,
593+ // otherwise "4/53 months ago" (rating concatenated with time)
594+ // would match "53 months ago". Also covers French ("il y a 2 mois").
595+ const timeExtractRegex = /(?<![\d\/])(\d+\s+(?:year|month|week|day|hour|minute)s?\s+ago|a\s+(?:year|month|week|day|hour|minute)\s+ago|just\s+now|yesterday|today|il\s+y\s+a\s+(?:un|une|\d+)\s+(?:an|mois|semaine|jour|heure|minute)s?)\b/i;
542596 let relativeTime = '';
543- for (const sel of timeSelectors) {
544- const el = element.querySelector(sel);
545- if (el) {
546- const text = el.textContent?.trim() || '';
547- // Look for time-related text (ago, month, year, etc)
548- if (text && (text.includes('ago') || text.includes('week') || text.includes('month') ||
549- text.includes('year') || text.includes('day') || text.match(/\d{4}/))) {
550- relativeTime = text;
551- break;
597+ const timeNodes = element.querySelectorAll('span, div');
598+ for (const n of timeNodes) {
599+ const text = (n.textContent || '').trim();
600+ if (!text || text.length > 200) continue;
601+ const m = text.match(timeExtractRegex);
602+ if (m) { relativeTime = m[0]; break; }
603+ }
604+ // Class-based fallbacks if text scan missed it.
605+ if (!relativeTime) {
606+ const timeSelectors = ['.rsqaWe', '.DU9Pgb', '.tTVLSc', '.review-date', '.dehysf'];
607+ for (const sel of timeSelectors) {
608+ const el = element.querySelector(sel);
609+ if (el) {
610+ const text = (el.textContent || '').trim();
611+ const m = text.match(timeExtractRegex);
612+ if (m) { relativeTime = m[0]; break; }
552613 }
553614 }
554615 }
555616
556- // Review text - try to expand and get full text
617+ // Review text - try to expand and get full text. Lead with
618+ // data-* anchors then class names.
557619 const textSelectors = [
558- '.wiI7pd',
620+ '[data-expandable-section] span', // STABLE: data attribute
621+ 'span[jsname]', // STABLE: jsname binding
622+ '.wiI7pd', // Class fallback (long-lived)
559623 '.MyEned span',
560624 '.review-full-text',
561625 '.Jtu6Td span',
562- '[data-expandable-section] span',
563626 ];
564627 let text = '';
565628
566- // First try to click "More" button to expand text
567- const moreButtons = element.querySelectorAll('.w8nwRe, button[aria-label*="More"], button[aria-expanded="false"]');
568- for (const btn of moreButtons) {
569- try { btn.click(); } catch(e) {}
570- }
571-
629+ // "More" buttons were already clicked + awaited in the pre-pass
630+ // at the top of this function, so text is already expanded.
572631 for (const sel of textSelectors) {
573632 const textEl = element.querySelector(sel);
574633 if (textEl) {
@@ -587,29 +646,79 @@ func extractReviewsFromPage(ctx context.Context, page scrapemate.BrowserPage) ([
587646 }
588647 }
589648
590- // Owner response - look for response container within the review
649+ // Owner response - prefer text-content / aria detection
650+ // over class names (most fragile selectors in this script).
651+ // Google localizes the "Response from the owner" header
652+ // across languages but always renders it as a header element
653+ // inside the review block.
591654 let ownerResponse = '';
592655 let ownerResponseTime = '';
593- const responseSelectors = [
594- '.CDe7pd', // Owner response container
595- '.wiI7pd.xwPlne', // Alternative response text
596- '.review-response', // Generic
597- '.owner-response', // Generic
598- ];
599- for (const sel of responseSelectors) {
600- const responseEl = element.querySelector(sel);
601- if (responseEl) {
602- ownerResponse = responseEl.textContent?.trim() || '';
603- // Try to find response time nearby
604- const responseTimeEl = responseEl.closest('.review-response-container')?.querySelector('.rsqaWe') ||
605- responseEl.parentElement?.querySelector('.rsqaWe, .dehysf');
606- if (responseTimeEl) {
607- ownerResponseTime = responseTimeEl.textContent?.trim() || '';
656+
657+ // Strategy 1 (semantic): find a header-like element whose text
658+ // matches a localized "Response from the owner" prefix, then
659+ // the response body is its sibling/next text node.
660+ const ownerHeaderRegex = /^(response from the owner|owner['']s reply|response from owner|réponse du propriétaire|respuesta del propietario|antwort des inhabers|risposta del proprietario|resposta do proprietário|antwoord van de eigenaar|odpowiedź właściciela|svar fra ejeren|ägarens svar|sahibinden yanıt)/i;
661+ const candidateHeaders = element.querySelectorAll('div, span');
662+ for (const h of candidateHeaders) {
663+ const t = (h.textContent || '').trim();
664+ // Header is short — full reply is in a different element.
665+ if (t.length > 0 && t.length < 80 && ownerHeaderRegex.test(t)) {
666+ // The response body is a sibling text container after the header.
667+ const container = h.parentElement;
668+ if (container) {
669+ // Pick the longest text descendant of the container that
670+ // isn't the header itself — that's the response body.
671+ let best = '';
672+ container.querySelectorAll('span, div').forEach(c => {
673+ if (c === h) return;
674+ const tt = (c.textContent || '').trim();
675+ if (tt.length > best.length && !ownerHeaderRegex.test(tt) && tt !== text) {
676+ best = tt;
677+ }
678+ });
679+ ownerResponse = best;
680+ }
681+ // Response time: extract via regex anywhere in the header
682+ // or its sibling nodes. The header text often looks like
683+ // "Response from the owner 3 months ago".
684+ const respTimeExtract = /(?<![\d\/])(\d+\s+(?:year|month|week|day|hour|minute)s?\s+ago|a\s+(?:year|month|week|day|hour|minute)\s+ago|il\s+y\s+a\s+(?:un|une|\d+)\s+(?:an|mois|semaine|jour|heure|minute)s?)\b/i;
685+ const headerText = (h.textContent || '').trim();
686+ const headerMatch = headerText.match(respTimeExtract);
687+ if (headerMatch) ownerResponseTime = headerMatch[0];
688+ if (!ownerResponseTime && h.parentElement) {
689+ for (const c of h.parentElement.children) {
690+ if (c === h) continue;
691+ const tt = (c.textContent || '').trim();
692+ const m = tt.match(respTimeExtract);
693+ if (m) { ownerResponseTime = m[0]; break; }
694+ }
608695 }
609696 if (ownerResponse) break;
610697 }
611698 }
612699
700+ // Strategy 2 (class fallback): class-based selectors.
701+ if (!ownerResponse) {
702+ const responseSelectors = [
703+ '.CDe7pd', // Owner response container
704+ '.wiI7pd.xwPlne', // Alternative response text
705+ '.review-response',
706+ '.owner-response',
707+ ];
708+ for (const sel of responseSelectors) {
709+ const responseEl = element.querySelector(sel);
710+ if (responseEl) {
711+ ownerResponse = responseEl.textContent?.trim() || '';
712+ const responseTimeEl = responseEl.closest('.review-response-container')?.querySelector('.rsqaWe') ||
713+ responseEl.parentElement?.querySelector('.rsqaWe, .dehysf');
714+ if (responseTimeEl) {
715+ ownerResponseTime = responseTimeEl.textContent?.trim() || '';
716+ }
717+ if (ownerResponse) break;
718+ }
719+ }
720+ }
721+
613722 if (userName && (text || rating > 0)) {
614723 reviews.push({
615724 author_name: userName,
0 commit comments