From bb61b43125c8427eb0226618b4b4ca3183790a94 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Tue, 2 Jun 2026 15:42:51 -0700 Subject: [PATCH] Add latest sort for Xiaohongshu search Signed-off-by: Connor1996 --- cli-manifest.json | 11 +++ clis/rednote/search.js | 2 +- clis/xiaohongshu/search.js | 116 ++++++++++++++++++++++++++++++- clis/xiaohongshu/search.test.js | 119 +++++++++++++++++++++++++++++++- 4 files changed, 244 insertions(+), 4 deletions(-) diff --git a/cli-manifest.json b/cli-manifest.json index fd1a13229..7e726c0cd 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -34317,6 +34317,17 @@ "default": 20, "required": false, "help": "Number of results" + }, + { + "name": "sort", + "type": "string", + "default": "general", + "required": false, + "help": "Sort order: general | latest", + "choices": [ + "general", + "latest" + ] } ], "columns": [ diff --git a/clis/rednote/search.js b/clis/rednote/search.js index d4706d307..9c858cf24 100644 --- a/clis/rednote/search.js +++ b/clis/rednote/search.js @@ -101,7 +101,7 @@ cli({ .map((item, i) => ({ rank: i + 1, ...item, - published_at: noteIdToDate(item.url), + published_at: item.published_at || noteIdToDate(item.url), })); }, }); diff --git a/clis/xiaohongshu/search.js b/clis/xiaohongshu/search.js index a56475bad..9a79834d2 100644 --- a/clis/xiaohongshu/search.js +++ b/clis/xiaohongshu/search.js @@ -60,6 +60,11 @@ export function stripXhsAuthorDateSuffix(value) { const stripped = text.replace(/\s*(?:\d{1,2}天前|\d+小时前|\d+分钟前|\d+秒前|刚刚|昨天|前天|\d+周前|\d+个月前|\d{1,2}-\d{1,2}|\d{4}-\d{1,2}-\d{1,2})$/u, '').trim(); return stripped || text; } +export function extractXhsPublishText(value) { + const text = (value || '').replace(/\s+/g, ' ').trim(); + const match = text.match(/(?:\d{1,2}天前|\d+小时前|\d+分钟前|\d+秒前|刚刚|昨天(?:\s+\d{1,2}:\d{2})?|前天(?:\s+\d{1,2}:\d{2})?|\d+周前|\d+个月前|\d{1,2}-\d{1,2}|\d{4}-\d{1,2}-\d{1,2})$/u); + return match ? match[0] : ''; +} /** * `page.evaluate` may return either the raw IIFE value or a * `{ session, data }` envelope depending on the browser-bridge version. @@ -80,6 +85,43 @@ function requireSearchRows(payload, phase) { } return rows; } +function requireSortOptionIndex(payload) { + const result = unwrapEvaluateResult(payload); + if (!result || typeof result !== 'object' || result.ok !== true) { + const reason = result && typeof result === 'object' && 'reason' in result ? result.reason : 'unknown'; + throw new CommandExecutionError(`Xiaohongshu search could not apply --sort latest (${reason}).`); + } + if (!Number.isSafeInteger(result.index) || result.index < 0) { + throw new CommandExecutionError('Xiaohongshu search could not apply --sort latest (invalid_option_index).'); + } + return result.index; +} +export function buildDismissKnownXhsOverlaysJs() { + return ` + (() => { + const cleanText = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const isVisible = (el) => { + const rect = el.getBoundingClientRect(); + if (rect.width <= 0 || rect.height <= 0) return false; + const style = getComputedStyle(el); + return style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'; + }; + const isBlockingNotice = (text) => /温馨提示|广告屏蔽|插件|申诉|浏览器|正常使用|风险/.test(text); + let clicked = 0; + for (const button of Array.from(document.querySelectorAll('button, [role="button"]'))) { + if (!isVisible(button)) continue; + const text = cleanText(button.innerText || button.textContent || ''); + if (text !== '我知道了' && text !== '知道了') continue; + const container = button.closest('[role="dialog"], .d-modal, .reds-modal, .el-dialog, body'); + const noticeText = cleanText(container?.innerText || ''); + if (!isBlockingNotice(noticeText)) continue; + button.click(); + clicked++; + } + return { ok: true, clicked }; + })() + `; +} export function parseLimit(raw) { const parsed = Number(raw ?? 20); if (!Number.isFinite(parsed) || !Number.isInteger(parsed)) { @@ -90,6 +132,14 @@ export function parseLimit(raw) { } return parsed; } +export function parseSort(raw) { + const value = String(raw ?? 'general').trim().toLowerCase(); + if (value === 'general' || value === '综合') + return 'general'; + if (value === 'latest' || value === '最新') + return 'latest'; + throw new ArgumentError(`--sort must be one of: general, latest, got ${JSON.stringify(raw)}`); +} /** * Build a "scroll until enough or plateaued" IIFE used in place of a fixed * `autoScroll({ times: N })`. Xiaohongshu's search results page lazy-loads @@ -174,6 +224,49 @@ export function buildScrollUntilJs(targetCount, maxScrolls = 15) { })() `; } +export function buildSearchSortOptionIndexJs(sort) { + const label = sort === 'latest' ? '最新' : '综合'; + return ` + (() => { + const targetLabel = ${JSON.stringify(label)}; + const cleanText = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const isVisible = (el) => { + const rect = el.getBoundingClientRect(); + if (rect.width <= 0 || rect.height <= 0) return false; + const style = getComputedStyle(el); + return style.display !== 'none' && style.visibility !== 'hidden'; + }; + const visibleTextIs = (el, text) => cleanText(el.innerText || el.textContent || '') === text; + const allTags = Array.from(document.querySelectorAll('.filter-panel .tags')); + if (allTags.length === 0) return { ok: false, reason: 'filter_panel_not_found' }; + let index = allTags.findIndex((el) => isVisible(el) && visibleTextIs(el, targetLabel) && !el.classList.contains('active')); + if (index < 0) { + index = allTags.findIndex((el) => isVisible(el) && visibleTextIs(el, targetLabel)); + } + if (index < 0) return { ok: false, reason: 'sort_option_not_found', label: targetLabel }; + return { ok: true, label: targetLabel, index }; + })() + `; +} +async function applySearchSort(page, sort) { + await page.evaluate(buildDismissKnownXhsOverlaysJs()); + await page.wait({ time: 0.2 }); + let lastResult = null; + for (let attempt = 0; attempt < 3; attempt++) { + await page.click('.search-layout__top .filter span'); + for (let poll = 0; poll < 5; poll++) { + await page.wait({ time: 0.2 }); + lastResult = unwrapEvaluateResult(await page.evaluate(buildSearchSortOptionIndexJs(sort))); + if (lastResult && typeof lastResult === 'object' && lastResult.ok === true) { + const optionIndex = requireSortOptionIndex(lastResult); + await page.click('.filter-panel .tags', { nth: optionIndex }); + await page.wait({ time: 1.5 }); + return; + } + } + } + requireSortOptionIndex(lastResult); +} /** * Build the search-result extraction IIFE. The web host is baked into the * `normalizeUrl` fallback so relative `/explore/...` hrefs resolve to a full @@ -192,6 +285,7 @@ export function buildSearchExtractJs(webHost) { const cleanText = (value) => (value || '').replace(/\\s+/g, ' ').trim(); const stripXhsAuthorDateSuffix = ${stripXhsAuthorDateSuffix.toString()}; + const extractXhsPublishText = ${extractXhsPublishText.toString()}; const isVisibleNote = (el) => { const rect = el.getBoundingClientRect(); if (rect.width <= 0 || rect.height <= 0) return false; @@ -225,9 +319,20 @@ export function buildSearchExtractJs(webHost) { const nameEl = el.querySelector('a.author .name, .author-name, .nick-name, .name'); const authorWrapEl = el.querySelector('a.author'); let author = cleanText(nameEl?.textContent || ''); + let publishedAt = ''; if (!author && authorWrapEl) { const nameChild = authorWrapEl.querySelector('.name'); - author = nameChild ? cleanText(nameChild.textContent || '') : stripXhsAuthorDateSuffix(authorWrapEl.textContent || ''); + const authorCandidates = Array.from(authorWrapEl.querySelectorAll('*')) + .map((node) => cleanText(node.textContent || '')) + .filter((text) => text && !extractXhsPublishText(text)); + author = nameChild ? cleanText(nameChild.textContent || '') : (authorCandidates[0] || stripXhsAuthorDateSuffix(authorWrapEl.textContent || '')); + } + if (authorWrapEl) { + const publishCandidates = Array.from(authorWrapEl.querySelectorAll('*')) + .map((node) => extractXhsPublishText(node.textContent || '')) + .filter(Boolean) + .sort((a, b) => a.length - b.length); + publishedAt = publishCandidates[0] || extractXhsPublishText(authorWrapEl.textContent || ''); } const likesEl = el.querySelector('.count, .like-count, .like-wrapper .count'); // Prefer search_result link (preserves xsec_token) over generic /explore/ link @@ -258,6 +363,7 @@ export function buildSearchExtractJs(webHost) { title, author, likes: cleanText(likesEl?.textContent || '0'), + published_at: publishedAt, url, author_url: normalizeUrl(authorLinkEl?.getAttribute('href') || ''), }); @@ -278,10 +384,12 @@ export const command = cli({ args: [ { name: 'query', required: true, positional: true, help: 'Search keyword' }, { name: 'limit', type: 'int', default: 20, help: 'Number of results' }, + { name: 'sort', type: 'string', default: 'general', choices: ['general', 'latest'], help: 'Sort order: general | latest' }, ], columns: ['rank', 'title', 'author', 'likes', 'published_at', 'url'], func: async (page, kwargs) => { const limit = parseLimit(kwargs.limit); + const sort = parseSort(kwargs.sort); const keyword = encodeURIComponent(kwargs.query); await page.goto(`https://www.xiaohongshu.com/search_result?keyword=${keyword}&source=web_search_result_notes`); // Wait for search results to render (or login wall to appear). @@ -291,6 +399,9 @@ export const command = cli({ if (waitResult === 'login_wall') { throw new AuthRequiredError('www.xiaohongshu.com', 'Xiaohongshu search results are blocked behind a login wall'); } + if (sort === 'latest') { + await applySearchSort(page, sort); + } // Extract before scrolling. Xiaohongshu uses a virtualized masonry // layout, so scrolling to the bottom can evict the initially visible // note cards from the DOM and make extraction return [] even though the @@ -321,10 +432,11 @@ export const command = cli({ .map((item, i) => ({ rank: i + 1, ...item, - published_at: noteIdToDate(item.url), + published_at: item.published_at || noteIdToDate(item.url), })); }, }); export const __test__ = { stripXhsAuthorDateSuffix, + extractXhsPublishText, }; diff --git a/clis/xiaohongshu/search.test.js b/clis/xiaohongshu/search.test.js index 88b96ab67..20b4ecf80 100644 --- a/clis/xiaohongshu/search.test.js +++ b/clis/xiaohongshu/search.test.js @@ -1,7 +1,7 @@ import { describe, expect, it, vi } from 'vitest'; import { getRegistry } from '@jackwener/opencli/registry'; import { JSDOM } from 'jsdom'; -import { __test__, buildScrollUntilJs, noteIdToDate, unwrapEvaluateResult } from './search.js'; +import { __test__, buildDismissKnownXhsOverlaysJs, buildScrollUntilJs, buildSearchSortOptionIndexJs, noteIdToDate, parseSort, unwrapEvaluateResult } from './search.js'; function markVisible(el) { el.getBoundingClientRect = () => ({ width: 100, height: 100 }); @@ -45,6 +45,16 @@ describe('xiaohongshu search', () => { }); expect(page.goto).not.toHaveBeenCalled(); }); + it('rejects invalid sort before browser navigation', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + const page = createPageMock([]); + + await expect(cmd.func(page, { query: '特斯拉', limit: 5, sort: 'date' })).rejects.toMatchObject({ + code: 'ARGUMENT', + message: expect.stringContaining('--sort'), + }); + expect(page.goto).not.toHaveBeenCalled(); + }); it('throws a clear error when the search page is blocked by a login wall', async () => { const cmd = getRegistry().get('xiaohongshu/search'); expect(cmd?.func).toBeTypeOf('function'); @@ -104,6 +114,55 @@ describe('xiaohongshu search', () => { }, ]); }); + it('applies the latest search filter through the page UI before extracting', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + const detailUrl = 'https://www.xiaohongshu.com/search_result/6a1ded130000000013020400?xsec_token=test-token&xsec_source='; + const rows = [ + { + title: '转租Santa Clara Orchard Glen公寓', + author: 'Jack Wang', + likes: '3', + url: detailUrl, + author_url: '', + }, + ]; + const page = createPageMock([ + 'content', + { ok: true, clicked: 1 }, + { session: 'site:xiaohongshu', data: { ok: true, label: '最新', index: 2 } }, + rows, + ]); + + const result = await cmd.func(page, { query: '湾区租房', limit: 1, sort: 'latest' }); + + expect(page.evaluate).toHaveBeenCalledTimes(4); + expect(String(page.evaluate.mock.calls[1][0])).toContain('广告屏蔽'); + expect(String(page.evaluate.mock.calls[2][0])).toContain('targetLabel = "最新"'); + expect(page.click).toHaveBeenNthCalledWith(1, '.search-layout__top .filter span'); + expect(page.click).toHaveBeenNthCalledWith(2, '.filter-panel .tags', { nth: 2 }); + expect(page.wait).toHaveBeenNthCalledWith(1, { time: 0.2 }); + expect(page.wait).toHaveBeenNthCalledWith(2, { time: 0.2 }); + expect(page.wait).toHaveBeenNthCalledWith(3, { time: 1.5 }); + expect(result[0]).toMatchObject({ + rank: 1, + title: '转租Santa Clara Orchard Glen公寓', + published_at: '2026-06-02', + }); + }); + it('fails typed when the latest search filter cannot be applied', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + const page = createPageMock([ + 'content', + { ok: true, clicked: 0 }, + ...Array.from({ length: 15 }, () => ({ ok: false, reason: 'sort_option_not_found' })), + ]); + + await expect(cmd.func(page, { query: '湾区租房', limit: 1, sort: 'latest' })).rejects.toMatchObject({ + code: 'COMMAND_EXEC', + message: expect.stringContaining('sort_option_not_found'), + }); + expect(page.evaluate).toHaveBeenCalledTimes(17); + }); it('fails typed instead of silently returning [] for malformed extraction payloads', async () => { const cmd = getRegistry().get('xiaohongshu/search'); const page = createPageMock([ @@ -216,10 +275,35 @@ describe('xiaohongshu search', () => { expect(result[0]).toMatchObject({ title: '数字作者测试', author: '数字3天前端', + published_at: '3天前', likes: '8', author_url: 'https://www.xiaohongshu.com/user/profile/author123', }); }); + it('does not merge an author-name trailing digit into the publish-time text', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + const dom = new JSDOM(` +
+ +
湾区找室友|7月Palo Alto附近上班女生
+ +
Wonyii_7
1小时前
+
+ 4 +
+ `, { url: 'https://www.xiaohongshu.com/search_result?keyword=test' }); + markVisible(dom.window.document.querySelector('section.note-item')); + const page = createPageMock([]); + page.evaluate.mockImplementationOnce(async () => 'content'); + page.evaluate.mockImplementationOnce(async (script) => Function('document', 'getComputedStyle', `return (${script})`)(dom.window.document, dom.window.getComputedStyle.bind(dom.window))); + + const result = await cmd.func(page, { query: '测试', limit: 1 }); + + expect(result[0]).toMatchObject({ + author: 'Wonyii_7', + published_at: '1小时前', + }); + }); }); describe('buildScrollUntilJs', () => { it('inlines the target count and default maxScrolls into the generated IIFE', () => { @@ -258,6 +342,32 @@ describe('buildScrollUntilJs', () => { expect(() => buildScrollUntilJs(10, 0)).toThrow(/maxScrolls/); }); }); +describe('parseSort', () => { + it('normalizes supported English and Chinese sort labels', () => { + expect(parseSort(undefined)).toBe('general'); + expect(parseSort('general')).toBe('general'); + expect(parseSort('综合')).toBe('general'); + expect(parseSort('latest')).toBe('latest'); + expect(parseSort('最新')).toBe('latest'); + }); + it('rejects unknown sort labels', () => { + expect(() => parseSort('date')).toThrow(/--sort/); + }); +}); +describe('buildSearchSortOptionIndexJs', () => { + it('targets the Xiaohongshu latest label for latest sort', () => { + expect(buildSearchSortOptionIndexJs('latest')).toContain('targetLabel = "最新"'); + }); + it('targets the Xiaohongshu general label for default sort', () => { + expect(buildSearchSortOptionIndexJs('general')).toContain('targetLabel = "综合"'); + }); +}); +describe('buildDismissKnownXhsOverlaysJs', () => { + it('targets common Xiaohongshu blocking notice text', () => { + expect(buildDismissKnownXhsOverlaysJs()).toContain('广告屏蔽'); + expect(buildDismissKnownXhsOverlaysJs()).toContain('我知道了'); + }); +}); describe('stripXhsAuthorDateSuffix', () => { it('only strips trailing date suffixes and preserves date-like author text', () => { expect(__test__.stripXhsAuthorDateSuffix('作者名 3天前')).toBe('作者名'); @@ -267,6 +377,13 @@ describe('stripXhsAuthorDateSuffix', () => { expect(__test__.stripXhsAuthorDateSuffix('刚刚')).toBe('刚刚'); }); }); +describe('extractXhsPublishText', () => { + it('extracts visible Xiaohongshu publish-time labels only at the end', () => { + expect(__test__.extractXhsPublishText('作者名 30分钟前')).toBe('30分钟前'); + expect(__test__.extractXhsPublishText('作者名 昨天 10:16')).toBe('昨天 10:16'); + expect(__test__.extractXhsPublishText('数字3天前端')).toBe(''); + }); +}); describe('noteIdToDate (ObjectID timestamp parsing)', () => { it('parses a known note ID to the correct China-timezone date', () => { // 0x697f6c74 = 1769958516 → 2026-02-01 in UTC+8