Skip to content

Commit 4d25b2b

Browse files
leozejiajackwener
andauthored
fix(jianyu): block inaccessible detail links and verification pages (#918)
* fix(jianyu): filter blocked detail links * fix(jianyu): keep recency filter opt-in --------- Co-authored-by: jackwener <jakevingoo@gmail.com>
1 parent a0b2155 commit 4d25b2b

5 files changed

Lines changed: 195 additions & 9 deletions

File tree

clis/jianyu/search.js

Lines changed: 145 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ const NAVIGATION_PATH_PREFIXES = [
3535
'/exhibition/',
3636
'/swordfish/page_big_pc/search/',
3737
];
38+
// Path fragments of detail pages that cannot be read without passing a
// verification step or a paid wall. Despite the name, classifyDetailStatus
// matches these as substrings of the lower-cased pathname, so every entry
// must itself be lower-case and keep its surrounding slashes.
const BLOCKED_DETAIL_PATH_PREFIXES = [
    '/nologin/content/',
    '/article/bdprivate/',
];
3842
const JIANYU_API_TYPES = ['fType', 'eType', 'vType', 'mType'];
3943
export function buildSearchUrl(query) {
4044
const url = new URL(SEARCH_ENTRY);
@@ -74,6 +78,92 @@ function isLikelyNavigationUrl(rawUrl) {
7478
return true;
7579
}
7680
}
81+
/**
 * Classify whether a search-result URL points at a detail page that can be read.
 *
 * The pathname is lower-cased and a run of trailing slashes is collapsed to one
 * before matching. Checks run in this order:
 *   1. empty input                      -> blocked / missing_url
 *   2. unparseable URL                  -> blocked / invalid_url
 *   3. path contains a blocked fragment -> blocked / verification_or_paid_wall
 *   4. path contains "/jybx/"           -> ok / jybx_detail
 *   5. isLikelyNavigationUrl(url)       -> entry_only / navigation_or_profile_entry
 *   6. anything else                    -> entry_only / non_jybx_entry
 *
 * The blocked check deliberately runs before the "/jybx/" allow check: both are
 * substring matches, so a blocked path that merely contains "/jybx/" further
 * down must not be reported as accessible.
 *
 * @param {string} rawUrl - Candidate URL taken from a search row.
 * @returns {{detail_status: string, detail_reason: string}} Status and reason.
 */
function classifyDetailStatus(rawUrl) {
    const urlText = cleanText(rawUrl);
    if (!urlText) {
        return {
            detail_status: 'blocked',
            detail_reason: 'missing_url',
        };
    }
    try {
        const parsed = new URL(urlText);
        const path = cleanText(parsed.pathname).toLowerCase().replace(/\/+$/, '/') || '/';
        // Deny-list first so it can never be overridden by the allow rule below.
        if (BLOCKED_DETAIL_PATH_PREFIXES.some((prefix) => path.includes(prefix))) {
            return {
                detail_status: 'blocked',
                detail_reason: 'verification_or_paid_wall',
            };
        }
        if (path.includes('/jybx/')) {
            return {
                detail_status: 'ok',
                detail_reason: 'jybx_detail',
            };
        }
        if (isLikelyNavigationUrl(urlText)) {
            return {
                detail_status: 'entry_only',
                detail_reason: 'navigation_or_profile_entry',
            };
        }
        return {
            detail_status: 'entry_only',
            detail_reason: 'non_jybx_entry',
        };
    }
    catch {
        // new URL() rejects relative or malformed input; such rows cannot be opened.
        return {
            detail_status: 'blocked',
            detail_reason: 'invalid_url',
        };
    }
}
122+
/**
 * Derive a notice identifier from a detail URL.
 *
 * For paths ending in `/jybx/<id>.html` the `<id>` part is returned. For any
 * other parseable URL the last non-empty path segment is returned with a
 * trailing `.htm`/`.html` extension removed.
 *
 * @param {string} rawUrl - Absolute URL of a search row.
 * @returns {string} The identifier, or '' when the input is empty, cannot be
 *   parsed as an absolute URL, or has no path segments.
 */
function extractNoticeId(rawUrl) {
    const value = cleanText(rawUrl);
    if (!value) {
        return '';
    }
    try {
        const parsed = new URL(value);
        const path = cleanText(parsed.pathname);
        const jybxMatched = path.match(/\/jybx\/([^/?#]+)\.html$/i);
        if (jybxMatched?.[1]) {
            return cleanText(jybxMatched[1]);
        }
        // Fallback: use the final path segment, ignoring empty segments produced
        // by leading, trailing or doubled slashes.
        const segments = path.split('/').filter(Boolean);
        const tail = cleanText(segments[segments.length - 1] || '');
        return cleanText(tail.replace(/\.html?$/i, ''));
    }
    catch {
        // Not an absolute URL: there is nothing reliable to derive an id from.
        return '';
    }
}
140+
/**
 * Tell whether a publish date falls inside the last `sinceDays` days.
 *
 * Both the publish date and "today" are reduced to UTC midnight, so the
 * difference is a whole number of days.
 *
 * @param {string} dateText - Raw date text; passed through normalizeDate, which
 *   is expected to yield `YYYY-MM-DD` or an empty value (TODO confirm against
 *   the helper).
 * @param {number} sinceDays - Size of the recency window in days (inclusive).
 * @param {Date} [now=new Date()] - Reference instant; injectable for tests.
 * @returns {boolean} true when the date is at most `sinceDays` days old; false
 *   for missing/unparseable dates and for dates more than one day ahead.
 */
function isWithinSinceDays(dateText, sinceDays, now = new Date()) {
    const MS_PER_DAY = 24 * 3600 * 1000;
    // A calendar date written in a timezone ahead of UTC can be one day "in
    // the future" relative to the UTC date of `now`. Presumably the site
    // publishes local (UTC+8) dates - confirm. Without this tolerance a notice
    // published today would be dropped during the hours where UTC is still on
    // the previous day.
    const MAX_FORWARD_SKEW_DAYS = 1;
    const normalized = normalizeDate(dateText);
    if (!normalized) {
        return false;
    }
    const timestamp = Date.parse(`${normalized}T00:00:00Z`);
    if (!Number.isFinite(timestamp)) {
        return false;
    }
    const today = Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate());
    const deltaDays = Math.floor((today - timestamp) / MS_PER_DAY);
    return deltaDays >= -MAX_FORWARD_SKEW_DAYS && deltaDays <= sinceDays;
}
151+
/**
 * Remove duplicate rows while keeping the first occurrence and input order.
 *
 * A row is identified by `source_id` + `notice_id` when both are present,
 * otherwise by `title` + `url`. Rows that carry none of these have no identity
 * to deduplicate on and are skipped. The original `!key` guard could never
 * trigger because the fallback key always contained the tab separator.
 *
 * @param {Array<object>} items - Rows to deduplicate; not mutated.
 * @returns {Array<object>} New array holding the first row seen for each key.
 */
function dedupeByNoticeKey(items) {
    const deduped = [];
    const seen = new Set();
    for (const item of items) {
        const source = cleanText(item.source_id || '');
        const notice = cleanText(item.notice_id || '');
        let key;
        if (source && notice) {
            key = `${source}\t${notice}`;
        }
        else {
            const title = cleanText(item.title);
            const url = cleanText(item.url);
            if (!title && !url) {
                continue;
            }
            key = `${title}\t${url}`;
        }
        if (seen.has(key)) {
            continue;
        }
        seen.add(key);
        deduped.push(item);
    }
    return deduped;
}
77167
function filterNavigationRows(query, items) {
78168
const queryTokens = cleanText(query).split(/\s+/).filter(Boolean).map((token) => token.toLowerCase());
79169
return items
@@ -86,6 +176,9 @@ function filterNavigationRows(query, items) {
86176
.filter((item) => {
87177
if (!item.title || !item.url)
88178
return false;
179+
const detailSignal = classifyDetailStatus(item.url);
180+
if (detailSignal.detail_status !== 'ok')
181+
return false;
89182
const haystack = `${item.title} ${item.contextText}`.toLowerCase();
90183
const hasQuery = queryTokens.length === 0 || queryTokens.some((token) => haystack.includes(token));
91184
const hasProcurementHint = PROCUREMENT_TITLE_HINT.test(`${item.title} ${item.contextText}`);
@@ -446,11 +539,16 @@ cli({
446539
args: [
447540
{ name: 'query', required: true, positional: true, help: 'Search keyword, e.g. "procurement"' },
448541
{ name: 'limit', type: 'int', default: 20, help: 'Number of results (max 50)' },
542+
{ name: 'since_days', type: 'int', help: 'Only keep rows published within N days' },
449543
],
450-
columns: ['rank', 'content_type', 'title', 'publish_time', 'project_code', 'budget_or_limit', 'url'],
544+
columns: ['rank', 'content_type', 'title', 'published_at', 'detail_status', 'project_code', 'budget_or_limit', 'url'],
451545
func: async (page, kwargs) => {
452546
const query = cleanText(kwargs.query);
453547
const limit = Math.max(1, Math.min(Number(kwargs.limit) || 20, 50));
548+
const rawSinceDays = Number(kwargs.since_days);
549+
const sinceDays = Number.isFinite(rawSinceDays) && rawSinceDays > 0
550+
? Math.max(1, Math.min(rawSinceDays, 3650))
551+
: null;
454552
const apiResult = await fetchJianyuApiRows(page, query, limit);
455553
const mergedRows = dedupeCandidates(filterNavigationRows(query, apiResult.rows));
456554
const extractedRows = await searchRowsFromEntries(page, {
@@ -465,21 +563,61 @@ cli({
465563
const indexedRows = await fetchDuckDuckGoIndexRows(query, limit);
466564
const filteredIndexedRows = dedupeCandidates(filterNavigationRows(query, indexedRows));
467565
if (filteredIndexedRows.length > 0) {
468-
return toProcurementSearchRecords(filteredIndexedRows, {
566+
const records = toProcurementSearchRecords(filteredIndexedRows, {
469567
site: SITE,
470568
query,
471569
limit,
472570
});
571+
const enriched = dedupeByNoticeKey(records.map((row) => {
572+
const detailSignal = classifyDetailStatus(row.url);
573+
const publishedAt = normalizeDate(row.publish_time || row.date);
574+
return {
575+
...row,
576+
source_id: SITE,
577+
notice_id: extractNoticeId(row.url),
578+
published_at: publishedAt,
579+
detail_status: detailSignal.detail_status,
580+
detail_reason: detailSignal.detail_reason,
581+
};
582+
}))
583+
.filter((row) => row.detail_status === 'ok')
584+
.filter((row) => sinceDays == null || isWithinSinceDays(row.published_at, sinceDays))
585+
.slice(0, limit)
586+
.map((row, index) => ({
587+
...row,
588+
rank: index + 1,
589+
}));
590+
return enriched;
473591
}
474592
if (apiResult.challenge || await isAuthRequired(page)) {
475593
throw new AuthRequiredError(DOMAIN, '[taxonomy=selector_drift] site=jianyu command=search blocked by human verification / access challenge');
476594
}
477595
}
478-
return toProcurementSearchRecords(rows, {
596+
const records = toProcurementSearchRecords(rows, {
479597
site: SITE,
480598
query,
481599
limit,
482600
});
601+
const enriched = dedupeByNoticeKey(records.map((row) => {
602+
const detailSignal = classifyDetailStatus(row.url);
603+
const publishedAt = normalizeDate(row.publish_time || row.date);
604+
return {
605+
...row,
606+
source_id: SITE,
607+
notice_id: extractNoticeId(row.url),
608+
published_at: publishedAt,
609+
detail_status: detailSignal.detail_status,
610+
detail_reason: detailSignal.detail_reason,
611+
};
612+
}))
613+
.filter((row) => row.detail_status === 'ok')
614+
.filter((row) => sinceDays == null || isWithinSinceDays(row.published_at, sinceDays))
615+
.slice(0, limit)
616+
.map((row, index) => ({
617+
...row,
618+
rank: index + 1,
619+
}));
620+
return enriched;
483621
},
484622
});
485623
export const __test__ = {
@@ -494,4 +632,8 @@ export const __test__ = {
494632
normalizeApiRow,
495633
fetchJianyuApiRows,
496634
collectApiRowsFromResponses,
635+
classifyDetailStatus,
636+
extractNoticeId,
637+
isWithinSinceDays,
638+
dedupeByNoticeKey,
497639
};

clis/jianyu/search.test.js

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ describe('jianyu search helpers', () => {
3131
const filtered = __test__.filterNavigationRows('电梯', [
3232
{ title: '招标公告', url: 'https://www.jianyu360.cn/list/stype/ZBGG.html', date: '' },
3333
{ title: '帮助中心', url: 'https://www.jianyu360.cn/helpCenter/index', date: '' },
34-
{ title: '某项目电梯采购公告', url: 'https://www.jianyu360.cn/notice/detail/123', date: '2026-04-07' },
34+
{ title: '某项目电梯采购公告', url: 'https://shandong.jianyu360.cn/jybx/20260407_123.html', date: '2026-04-07' },
3535
]);
3636
expect(filtered).toHaveLength(1);
3737
expect(filtered[0].title).toContain('电梯采购公告');
@@ -125,4 +125,20 @@ describe('jianyu search helpers', () => {
125125
expect(result.rows[0].title).toContain('电梯采购公告');
126126
expect(result.rows[1].title).toContain('另一条电梯采购公告');
127127
});
128+
it('classifies nologin links as blocked detail targets', () => {
129+
const signal = __test__.classifyDetailStatus('https://www.jianyu360.cn/nologin/content/ABC.html');
130+
expect(signal.detail_status).toBe('blocked');
131+
});
132+
it('extracts stable notice id from jybx urls', () => {
133+
const id = __test__.extractNoticeId('https://shandong.jianyu360.cn/jybx/20260310_26030938267551.html');
134+
expect(id).toBe('20260310_26030938267551');
135+
});
136+
it('keeps only rows inside recency window', () => {
137+
const within = __test__.isWithinSinceDays('2026-03-20', 30, new Date('2026-04-09T00:00:00Z'));
138+
const stale = __test__.isWithinSinceDays('2026-02-01', 30, new Date('2026-04-09T00:00:00Z'));
139+
const missing = __test__.isWithinSinceDays('', 30, new Date('2026-04-09T00:00:00Z'));
140+
expect(within).toBe(true);
141+
expect(stale).toBe(false);
142+
expect(missing).toBe(false);
143+
});
128144
});

clis/jianyu/shared/procurement-detail.js

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@ const RETRYABLE_DETAIL_ERROR_PATTERNS = [
77
/cannot find context with specified id/i,
88
/\[taxonomy=empty_result\]/i,
99
];
10+
// Text markers that identify a human-verification (captcha) page served in
// place of a real detail page.
// NOTE(review): this list originally held five patterns, but their bodies were
// lost and each arrived as a bare `//i,`, which JavaScript parses as a line
// comment and which leaves the array empty. The two entries below are the
// markers present on the captcha page used by the detail-runner test
// ('验证码' title, '请在下图依次点击' prompt). Restore the remaining patterns
// from the repository history.
const DETAIL_AUTH_CHALLENGE_PATTERNS = [
    /验证码/i,
    /请在下图依次点击/i,
];
1017
function isRetryableDetailError(error) {
1118
const message = error instanceof Error
1219
? cleanText(error.message)
@@ -61,6 +68,14 @@ export async function runProcurementDetail(page, { url, site, query = '', }) {
6168
const title = cleanText(row.title);
6269
const detailText = cleanText(row.detailText);
6370
const publishTime = cleanText(row.publishTime);
71+
const authGateText = cleanText(`${title} ${detailText}`);
72+
if (DETAIL_AUTH_CHALLENGE_PATTERNS.some((pattern) => pattern.test(authGateText))) {
73+
throw taxonomyError('selector_drift', {
74+
site,
75+
command: 'detail',
76+
detail: `detail page blocked by verification challenge: ${targetUrl}`,
77+
});
78+
}
6479
if (!title && !detailText) {
6580
throw taxonomyError('empty_result', {
6681
site,

clis/jianyu/shared/procurement-detail.test.js

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,16 @@ describe('procurement detail runner', () => {
6969
})).rejects.toThrow('[taxonomy=extraction_drift]');
7070
expect(attempts).toBe(1);
7171
});
72+
it('rejects captcha/verification pages as selector_drift', async () => {
73+
const page = createPage(async () => ({
74+
title: '验证码',
75+
detailText: '请在下图依次点击:槨畽黛',
76+
publishTime: '',
77+
}));
78+
await expect(runProcurementDetail(page, {
79+
url: 'https://www.jianyu360.cn/nologin/content/ABC.html',
80+
site: 'jianyu',
81+
query: '电梯',
82+
})).rejects.toThrow('[taxonomy=selector_drift]');
83+
});
7284
});

docs/adapters/browser/jianyu.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
| Command | Description |
88
|---------|-------------|
9-
| `opencli jianyu search "<query>" --limit <n>` | Search Jianyu bid notices (V2 structured contract) |
9+
| `opencli jianyu search "<query>" --limit <n> [--since_days <n>]` | Search Jianyu bid notices and keep only accessible detail links |
1010
| `opencli jianyu detail "<url>"` | Extract detail-page evidence blocks from a search URL |
1111

1212
## Usage Examples
@@ -15,8 +15,8 @@
1515
# Search by keyword
1616
opencli jianyu search "procurement" --limit 20 -f json
1717

18-
# Search another keyword with a smaller window
19-
opencli jianyu search "substation" --limit 10 -f json
18+
# Search another keyword with an explicit recency window
19+
opencli jianyu search "substation" --limit 10 --since_days 30 -f json
2020

2121
# Extract structured detail evidence
2222
opencli jianyu detail "https://www.jianyu360.cn/nologin/content/....html" -f json
@@ -29,10 +29,11 @@ opencli jianyu detail "https://www.jianyu360.cn/nologin/content/....html" -f jso
2929

3030
## Notes
3131

32-
- `search` now returns V2 fields: `publish_time`, `source_site`, `content_type`, `is_detail_page`, `snippet`, `quality_flags`, plus compatible `date/summary`.
32+
- `search` returns accessible procurement rows with `published_at`, `detail_status`, `project_code`, `budget_or_limit`, `url`, plus compatible `publish_time/date`.
33+
- `search` keeps all reachable rows by default. `--since_days` enables an explicit recency filter.
3334
- `detail` returns the same structured fields and adds `detail_text` + `evidence_blocks`.
3435
- Date fields are normalized to `YYYY-MM-DD` when date text is detectable.
35-
- Results are deduplicated by `title + url`.
36+
- Results are deduplicated by a stable notice key (`source_id` + `notice_id`) when it is available, falling back to `title + url` otherwise.
3637
- `--limit` defaults to `20` and is capped at `50`.
3738

3839
## Troubleshooting

0 commit comments

Comments
 (0)