Skip to content

Commit f2f5476

Browse files
committed
fix(instagram-api): XHR Accounts Center + profile-capture wedge check
Ports additional improvements from the context-gateway side that went in after the initial port on this branch. Matches the code merged in: - CG #187 (profile capture wedge check): short-circuit the profile-capture polling loop on a runtime wedge. - CG #188 (XHR Accounts Center): fetch data-sjs HTML blocks via same-origin XHR from an instagram.com anchor page instead of navigating the remote browser to each heavy Accounts Center page. Verified on staging parallel full-mode runs: acct 3 got its first advertiser via the XHR-fetched ads page; acct 1 consistently collects profile + posts + followers + following + advertisers with this code.
1 parent 5698caa commit f2f5476

1 file changed

Lines changed: 172 additions & 76 deletions

File tree

connectors/meta/instagram-api-playwright.js

Lines changed: 172 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ const markWedgeIfMatches = (err) => {
170170
const msg = err?.message || String(err);
171171
if (/Command timeout:/i.test(msg)) runtimeWedgeDetected = true;
172172
};
173+
173174
const setAuthState = async (state) => {
174175
latestAuthState = state;
175176
};
@@ -1040,22 +1041,33 @@ const handlePostLoginChallenge = async () => {
10401041
"confirm it's you",
10411042
]);
10421043
if (affirm) {
1043-
await page.evaluate(`
1044-
(() => {
1045-
const target = ${JSON.stringify(affirm)};
1046-
const btns = Array.from(document.querySelectorAll('button, [role="button"]'));
1047-
const match = btns.find((b) => (b.textContent || '').trim() === target);
1048-
if (!match) return;
1049-
try { match.focus(); } catch (_) {}
1050-
const mouseOpts = { bubbles: true, cancelable: true, view: window };
1051-
try { match.dispatchEvent(new MouseEvent('pointerdown', mouseOpts)); } catch (_) {}
1052-
try { match.dispatchEvent(new MouseEvent('mousedown', mouseOpts)); } catch (_) {}
1053-
try { match.dispatchEvent(new MouseEvent('pointerup', mouseOpts)); } catch (_) {}
1054-
try { match.dispatchEvent(new MouseEvent('mouseup', mouseOpts)); } catch (_) {}
1055-
try { match.dispatchEvent(new MouseEvent('click', mouseOpts)); } catch (_) {}
1056-
try { match.click(); } catch (_) {}
1057-
})()
1058-
`);
1044+
const escaped = affirm.replace(/"/g, '\\"');
1045+
let clickedViaPage = false;
1046+
try {
1047+
await page.click(
1048+
'button:has-text("' + escaped + '"), [role="button"]:has-text("' + escaped + '")',
1049+
{ timeout: 3000 },
1050+
);
1051+
clickedViaPage = true;
1052+
} catch (_) {}
1053+
if (!clickedViaPage) {
1054+
await page.evaluate(`
1055+
(() => {
1056+
const target = ${JSON.stringify(affirm)};
1057+
const btns = Array.from(document.querySelectorAll('button, [role="button"]'));
1058+
const match = btns.find((b) => (b.textContent || '').trim() === target);
1059+
if (!match) return;
1060+
try { match.focus(); } catch (_) {}
1061+
const mouseOpts = { bubbles: true, cancelable: true, view: window };
1062+
try { match.dispatchEvent(new MouseEvent('pointerdown', mouseOpts)); } catch (_) {}
1063+
try { match.dispatchEvent(new MouseEvent('mousedown', mouseOpts)); } catch (_) {}
1064+
try { match.dispatchEvent(new MouseEvent('pointerup', mouseOpts)); } catch (_) {}
1065+
try { match.dispatchEvent(new MouseEvent('mouseup', mouseOpts)); } catch (_) {}
1066+
try { match.dispatchEvent(new MouseEvent('click', mouseOpts)); } catch (_) {}
1067+
try { match.click(); } catch (_) {}
1068+
})()
1069+
`);
1070+
}
10591071
clicked = true;
10601072
await page.sleep(3500);
10611073
continue;
@@ -1121,23 +1133,57 @@ const handlePostLoginChallenge = async () => {
11211133
'okay',
11221134
]);
11231135
if (advance) {
1124-
await page.evaluate(`
1125-
(() => {
1126-
const target = ${JSON.stringify(advance)};
1127-
const btns = Array.from(document.querySelectorAll('button, [role="button"]'));
1128-
const match = btns.find((b) => (b.textContent || '').trim() === target);
1129-
if (!match) return;
1130-
// Try three escalating click strategies for React-rendered buttons.
1131-
try { match.focus(); } catch (_) {}
1132-
const mouseOpts = { bubbles: true, cancelable: true, view: window };
1133-
try { match.dispatchEvent(new MouseEvent('pointerdown', mouseOpts)); } catch (_) {}
1134-
try { match.dispatchEvent(new MouseEvent('mousedown', mouseOpts)); } catch (_) {}
1135-
try { match.dispatchEvent(new MouseEvent('pointerup', mouseOpts)); } catch (_) {}
1136-
try { match.dispatchEvent(new MouseEvent('mouseup', mouseOpts)); } catch (_) {}
1137-
try { match.dispatchEvent(new MouseEvent('click', mouseOpts)); } catch (_) {}
1138-
try { match.click(); } catch (_) {}
1139-
})()
1140-
`);
1136+
const escaped = advance.replace(/"/g, '\\"');
1137+
// Prefer page.click (trusted browser-level events) over evaluate-based
1138+
// dispatch. Instagram rejects synthesized clicks as untrusted. Use a
1139+
// text-match CSS selector — ":has-text()" Playwright locator syntax —
1140+
// with a short timeout so a misselected button doesn't stall the run.
1141+
let clickedViaPage = false;
1142+
try {
1143+
await page.click(
1144+
'button:has-text("' + escaped + '"), [role="button"]:has-text("' + escaped + '")',
1145+
{ timeout: 3000 },
1146+
);
1147+
clickedViaPage = true;
1148+
} catch (_) {
1149+
// Fall through to evaluate-based dispatch as a last-resort fallback.
1150+
}
1151+
// Belt-and-suspenders: also focus + Enter-key the matched button.
1152+
// Some Instagram challenge pages treat keyboard Enter as a more
1153+
// "trusted" interaction signal than synthesized mouse clicks.
1154+
try {
1155+
await page.evaluate(`
1156+
(() => {
1157+
const target = ${JSON.stringify(advance)};
1158+
const btns = Array.from(document.querySelectorAll('button, [role="button"]'));
1159+
const match = btns.find((b) => (b.textContent || '').trim() === target);
1160+
if (match && typeof match.focus === 'function') match.focus();
1161+
})()
1162+
`);
1163+
if (typeof page.keyboard?.press === 'function') {
1164+
await page.keyboard.press('Enter');
1165+
} else if (typeof page.press === 'function') {
1166+
await page.press('body', 'Enter');
1167+
}
1168+
} catch (_) {}
1169+
if (!clickedViaPage) {
1170+
await page.evaluate(`
1171+
(() => {
1172+
const target = ${JSON.stringify(advance)};
1173+
const btns = Array.from(document.querySelectorAll('button, [role="button"]'));
1174+
const match = btns.find((b) => (b.textContent || '').trim() === target);
1175+
if (!match) return;
1176+
try { match.focus(); } catch (_) {}
1177+
const mouseOpts = { bubbles: true, cancelable: true, view: window };
1178+
try { match.dispatchEvent(new MouseEvent('pointerdown', mouseOpts)); } catch (_) {}
1179+
try { match.dispatchEvent(new MouseEvent('mousedown', mouseOpts)); } catch (_) {}
1180+
try { match.dispatchEvent(new MouseEvent('pointerup', mouseOpts)); } catch (_) {}
1181+
try { match.dispatchEvent(new MouseEvent('mouseup', mouseOpts)); } catch (_) {}
1182+
try { match.dispatchEvent(new MouseEvent('click', mouseOpts)); } catch (_) {}
1183+
try { match.click(); } catch (_) {}
1184+
})()
1185+
`);
1186+
}
11411187
clicked = true;
11421188
// Give the next page/step a moment to render.
11431189
await page.sleep(3500);
@@ -1431,6 +1477,7 @@ const collectProfileViaPageCapture = async (username) => {
14311477
await page.sleep(PROFILE_CAPTURE_WAIT_MS);
14321478

14331479
for (let attempt = 0; attempt < PROFILE_CAPTURE_MAX_ATTEMPTS; attempt++) {
1480+
if (runtimeWedgeDetected) break;
14341481
await setCollectorTraceSection('profileBootstrap', {
14351482
method: 'profile_page_capture',
14361483
step: 'wait_for_profile_capture',
@@ -1440,7 +1487,13 @@ const collectProfileViaPageCapture = async (username) => {
14401487
captureKey: 'profileResponse',
14411488
});
14421489
await page.sleep(1000);
1443-
const response = await page.getCapturedResponse('profileResponse');
1490+
let response;
1491+
try {
1492+
response = await page.getCapturedResponse('profileResponse');
1493+
} catch (err) {
1494+
markWedgeIfMatches(err);
1495+
break;
1496+
}
14441497
const user =
14451498
response?.data?.data?.user ??
14461499
response?.data?.user ??
@@ -1762,60 +1815,103 @@ const fetchAccountsCenterHtml = async (path, traceSection) => {
17621815
targetUrl: fullUrl,
17631816
});
17641817
}
1765-
const reachable = await safeGoto(fullUrl);
1766-
if (!reachable) {
1767-
if (traceSection) {
1768-
await setCollectorTraceSection(traceSection, {
1769-
phase: traceSection,
1770-
method: 'accounts_center_html',
1771-
step: 'navigate_accounts_center',
1772-
status: 'error',
1773-
outcome: 'page_unreachable',
1774-
path,
1775-
targetUrl: fullUrl,
1776-
});
1818+
// Strategy: navigate to a light instagram.com page once and then XHR-fetch
1819+
// the Accounts Center path from the existing tab. This avoids letting the
1820+
// heavy Accounts Center React app load, which has been observed to wedge
1821+
// the remote browser runtime after 1-2 navigations. Cookies on the
1822+
// instagram.com domain cover accountscenter.instagram.com (subdomain).
1823+
const alreadyOnInstagram = await page.evaluate(
1824+
`(() => { try { return location.hostname.endsWith('instagram.com'); } catch (_) { return false; } })()`
1825+
).catch(() => false);
1826+
if (!alreadyOnInstagram) {
1827+
const reachable = await safeGoto('https://www.instagram.com/');
1828+
if (!reachable) {
1829+
if (traceSection) {
1830+
await setCollectorTraceSection(traceSection, {
1831+
phase: traceSection,
1832+
method: 'accounts_center_html',
1833+
step: 'navigate_accounts_center',
1834+
status: 'error',
1835+
outcome: 'anchor_page_unreachable',
1836+
path,
1837+
targetUrl: fullUrl,
1838+
});
1839+
}
1840+
throw new Error('instagram.com anchor page could not be reached');
17771841
}
1778-
throw new Error('accounts center page could not be reached for ' + path);
17791842
}
17801843
if (traceSection) {
17811844
await setCollectorTraceSection(traceSection, {
17821845
phase: traceSection,
17831846
method: 'accounts_center_html',
17841847
step: 'read_accounts_center_html',
1785-
status: 'waiting_html',
1848+
status: 'fetching_via_xhr',
17861849
path,
17871850
targetUrl: fullUrl,
17881851
});
17891852
}
1790-
await page.sleep(1500);
1791-
// Previously: `await page.evaluate('document.documentElement.outerHTML')` —
1792-
// returning the FULL Accounts Center HTML (often multi-MB) across the
1793-
// remote browser boundary. Large payloads are the documented cause of
1794-
// runtime wedges (docs/2026-04-17-instagram-runtime-root-cause-plan.md)
1795-
// and the motivation for the closed #168. We now project inside the page:
1796-
// extract the data-sjs script bodies and shallow fb_dtsg/lsd/jazoest
1797-
// tokens from the meta HTML, then stringify the HTML back on the host
1798-
// only if downstream parsers need it. Most callers immediately run
1799-
// `extractDataSjsBlocks(html)` — we return a preparsed blob so the host
1800-
// never sees the raw document.
1801-
const shrunk = await page.evaluate(`
1802-
(() => {
1803-
try {
1804-
const scripts = Array.from(document.querySelectorAll('script[type="application/json"][data-sjs]'));
1805-
const payloads = [];
1806-
for (const el of scripts) {
1807-
const text = el.textContent || '';
1808-
if (!text.includes('fxcal_settings')) continue;
1809-
payloads.push(text);
1853+
// Fetch the Accounts Center page as raw HTML via XHR. Parse data-sjs
1854+
// scripts inside page so we only return the small payloads we need.
1855+
let shrunk;
1856+
try {
1857+
shrunk = await page.evaluate(`
1858+
(async () => {
1859+
try {
1860+
const res = await fetch(${JSON.stringify(fullUrl)}, {
1861+
credentials: 'include',
1862+
headers: {
1863+
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1864+
'sec-fetch-mode': 'navigate',
1865+
'sec-fetch-site': 'same-site',
1866+
'sec-fetch-dest': 'document',
1867+
},
1868+
});
1869+
if (!res.ok) return { error: 'xhr status ' + res.status };
1870+
const html = await res.text();
1871+
const payloads = [];
1872+
const re = /<script type="application\\/json"[^>]*data-sjs[^>]*>([\\s\\S]*?)<\\/script>/g;
1873+
let m;
1874+
while ((m = re.exec(html)) !== null) {
1875+
if (!m[1].includes('fxcal_settings')) continue;
1876+
payloads.push(m[1]);
1877+
}
1878+
const metaSlice = html.slice(0, 80000);
1879+
return { payloads, metaSlice, scriptCount: payloads.length, htmlLength: html.length };
1880+
} catch (err) {
1881+
return { error: err && err.message ? err.message : String(err) };
18101882
}
1811-
const fullHtml = document.documentElement.outerHTML;
1812-
const metaSlice = fullHtml.slice(0, 80000);
1813-
return { payloads, metaSlice, scriptCount: scripts.length };
1814-
} catch (err) {
1815-
return { error: err && err.message ? err.message : String(err) };
1816-
}
1817-
})()
1818-
`);
1883+
})()
1884+
`);
1885+
} catch (err) {
1886+
markWedgeIfMatches(err);
1887+
throw err;
1888+
}
1889+
if (shrunk && shrunk.error) {
1890+
// Fall back to full navigation once if XHR failed (auth, CORS, etc.).
1891+
const reachable = await safeGoto(fullUrl);
1892+
if (!reachable) {
1893+
throw new Error('accounts center page could not be reached for ' + path);
1894+
}
1895+
await page.sleep(1500);
1896+
shrunk = await page.evaluate(`
1897+
(() => {
1898+
try {
1899+
const scripts = Array.from(document.querySelectorAll('script[type="application/json"][data-sjs]'));
1900+
const payloads = [];
1901+
for (const el of scripts) {
1902+
const text = el.textContent || '';
1903+
if (!text.includes('fxcal_settings')) continue;
1904+
payloads.push(text);
1905+
}
1906+
const fullHtml = document.documentElement.outerHTML;
1907+
const metaSlice = fullHtml.slice(0, 80000);
1908+
return { payloads, metaSlice, scriptCount: scripts.length };
1909+
} catch (err) {
1910+
return { error: err && err.message ? err.message : String(err) };
1911+
}
1912+
})()
1913+
`);
1914+
}
18191915
const payloads = (shrunk && shrunk.payloads) || [];
18201916
const metaSlice = (shrunk && shrunk.metaSlice) || '';
18211917
// Reconstruct a minimal "html" that extractDataSjsBlocks + extractMetaTokens

0 commit comments

Comments
 (0)