@@ -35,6 +35,10 @@ const NAVIGATION_PATH_PREFIXES = [
3535 '/exhibition/' ,
3636 '/swordfish/page_big_pc/search/' ,
3737] ;
/**
 * Path fragments that mark a URL as a detail page we cannot read.
 * `classifyDetailStatus` reports any path containing one of these as
 * `blocked` / `verification_or_paid_wall`.
 * Frozen so the shared module-level list cannot be mutated by accident.
 */
const BLOCKED_DETAIL_PATH_PREFIXES = Object.freeze([
    '/nologin/content/',
    '/article/bdprivate/',
]);
// Type identifiers used with the Jianyu API.
// NOTE(review): the meaning of each code is not visible in this chunk — confirm against the API caller.
const JIANYU_API_TYPES = ['fType', 'eType', 'vType', 'mType'];
3943export function buildSearchUrl ( query ) {
4044 const url = new URL ( SEARCH_ENTRY ) ;
@@ -74,6 +78,92 @@ function isLikelyNavigationUrl(rawUrl) {
7478 return true ;
7579 }
7680}
/**
 * Classify a result URL by whether it points at a readable notice detail page.
 *
 * Checks run in this order and the first match wins:
 *   1. empty URL after `cleanText`            -> blocked / missing_url
 *   2. path contains `/jybx/`                 -> ok / jybx_detail
 *   3. path contains a blocked path fragment  -> blocked / verification_or_paid_wall
 *   4. `isLikelyNavigationUrl` returns true   -> entry_only / navigation_or_profile_entry
 *   5. anything else                          -> entry_only / non_jybx_entry
 * Any exception raised while parsing or classifying -> blocked / invalid_url.
 *
 * @param {unknown} rawUrl - Candidate URL; normalized with `cleanText` first.
 * @returns {{ detail_status: string, detail_reason: string }} Status/reason pair.
 */
function classifyDetailStatus(rawUrl) {
    const urlText = cleanText(rawUrl);
    if (!urlText) {
        return {
            detail_status: 'blocked',
            detail_reason: 'missing_url',
        };
    }
    try {
        const parsed = new URL(urlText);
        // Lower-case the path and collapse any run of trailing slashes to a single one
        // so the substring checks below are case-insensitive.
        const path = cleanText(parsed.pathname).toLowerCase().replace(/\/+$/, '/') || '/';
        if (path.includes('/jybx/')) {
            return {
                detail_status: 'ok',
                detail_reason: 'jybx_detail',
            };
        }
        // Despite the "prefixes" name, the fragments are matched anywhere in the path.
        if (BLOCKED_DETAIL_PATH_PREFIXES.some((prefix) => path.includes(prefix))) {
            return {
                detail_status: 'blocked',
                detail_reason: 'verification_or_paid_wall',
            };
        }
        if (isLikelyNavigationUrl(urlText)) {
            return {
                detail_status: 'entry_only',
                detail_reason: 'navigation_or_profile_entry',
            };
        }
        return {
            detail_status: 'entry_only',
            detail_reason: 'non_jybx_entry',
        };
    }
    catch {
        // Deliberate best-effort: an unparsable URL is reported as blocked, not thrown.
        return {
            detail_status: 'blocked',
            detail_reason: 'invalid_url',
        };
    }
}
/**
 * Derive a notice identifier from a URL's path.
 *
 * For paths ending in `/jybx/<id>.html` (case-insensitive) the `<id>` part is
 * returned. Otherwise the last non-empty path segment is returned with a
 * trailing `.html` / `.htm` extension removed.
 *
 * @param {unknown} rawUrl - Candidate URL; normalized with `cleanText` first.
 * @returns {string} The identifier, or `''` when the URL is empty, cannot be
 *   parsed, or has no path segments.
 */
function extractNoticeId(rawUrl) {
    const value = cleanText(rawUrl);
    if (!value) {
        return '';
    }
    try {
        const parsed = new URL(value);
        const path = cleanText(parsed.pathname);
        const jybxMatched = path.match(/\/jybx\/([^/?#]+)\.html$/i);
        if (jybxMatched?.[1]) {
            return cleanText(jybxMatched[1]);
        }
        // Fallback: use the final path segment, minus any .html/.htm suffix.
        const segments = path.split('/').filter(Boolean);
        const tail = cleanText(segments[segments.length - 1] || '');
        return cleanText(tail.replace(/\.html?$/i, ''));
    }
    catch {
        // Unparsable URLs have no usable identifier.
        return '';
    }
}
/**
 * Report whether a publish date falls within the last `sinceDays` days.
 *
 * The date text is normalized with `normalizeDate` and interpreted as midnight
 * UTC; "today" is the UTC calendar day of `now`. Dates in the future (relative
 * to that UTC day) are rejected.
 *
 * NOTE(review): publish dates are presumably local calendar dates; because both
 * sides are compared in UTC, a notice dated "today" in a timezone ahead of UTC
 * can look like a future date for a few hours — confirm whether that matters.
 *
 * @param {unknown} dateText - Raw date text handed to `normalizeDate`.
 * @param {number} sinceDays - Inclusive window size in days (0 means today only).
 * @param {Date} [now=new Date()] - Reference instant; injectable for tests.
 * @returns {boolean} `true` when 0 <= age in days <= `sinceDays`.
 */
function isWithinSinceDays(dateText, sinceDays, now = new Date()) {
    const normalized = normalizeDate(dateText);
    if (!normalized) {
        return false;
    }
    const timestamp = Date.parse(`${normalized}T00:00:00Z`);
    if (!Number.isFinite(timestamp)) {
        return false;
    }
    // A window that does not coerce to a number can never contain a date.
    const windowDays = Number(sinceDays);
    if (Number.isNaN(windowDays)) {
        return false;
    }
    const millisPerDay = 24 * 3600 * 1000;
    const today = Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate());
    const deltaDays = Math.floor((today - timestamp) / millisPerDay);
    return deltaDays >= 0 && deltaDays <= windowDays;
}
/**
 * Remove duplicate rows, keeping the first occurrence and the input order.
 *
 * A row's identity is `source_id` + `notice_id` when both are non-empty after
 * `cleanText`; otherwise it falls back to `title` + `url`. Rows with no
 * identifying text at all are dropped: the composed key always contains the
 * tab separator, so the emptiness guard needs an explicitly empty key to fire.
 *
 * @param {Iterable<object>} items - Rows to dedupe; each is read for
 *   `source_id`, `notice_id`, `title` and `url`.
 * @returns {object[]} New array holding the first row seen for each key.
 */
function dedupeByNoticeKey(items) {
    const deduped = [];
    const seen = new Set();
    for (const item of items) {
        const source = cleanText(item.source_id || '');
        const notice = cleanText(item.notice_id || '');
        let key = '';
        if (source && notice) {
            key = `${source}\t${notice}`;
        }
        else {
            const title = cleanText(item.title);
            const url = cleanText(item.url);
            // Only build a fallback key when at least one part carries text.
            if (title || url) {
                key = `${title}\t${url}`;
            }
        }
        if (!key || seen.has(key)) {
            continue;
        }
        seen.add(key);
        deduped.push(item);
    }
    return deduped;
}
77167function filterNavigationRows ( query , items ) {
78168 const queryTokens = cleanText ( query ) . split ( / \s + / ) . filter ( Boolean ) . map ( ( token ) => token . toLowerCase ( ) ) ;
79169 return items
@@ -86,6 +176,9 @@ function filterNavigationRows(query, items) {
86176 . filter ( ( item ) => {
87177 if ( ! item . title || ! item . url )
88178 return false ;
179+ const detailSignal = classifyDetailStatus ( item . url ) ;
180+ if ( detailSignal . detail_status !== 'ok' )
181+ return false ;
89182 const haystack = `${ item . title } ${ item . contextText } ` . toLowerCase ( ) ;
90183 const hasQuery = queryTokens . length === 0 || queryTokens . some ( ( token ) => haystack . includes ( token ) ) ;
91184 const hasProcurementHint = PROCUREMENT_TITLE_HINT . test ( `${ item . title } ${ item . contextText } ` ) ;
@@ -446,11 +539,16 @@ cli({
446539 args : [
447540 { name : 'query' , required : true , positional : true , help : 'Search keyword, e.g. "procurement"' } ,
448541 { name : 'limit' , type : 'int' , default : 20 , help : 'Number of results (max 50)' } ,
542+ { name : 'since_days' , type : 'int' , help : 'Only keep rows published within N days' } ,
449543 ] ,
450- columns : [ 'rank' , 'content_type' , 'title' , 'publish_time ' , 'project_code' , 'budget_or_limit' , 'url' ] ,
544+ columns : [ 'rank' , 'content_type' , 'title' , 'published_at' , 'detail_status ', 'project_code' , 'budget_or_limit' , 'url' ] ,
451545 func : async ( page , kwargs ) => {
452546 const query = cleanText ( kwargs . query ) ;
453547 const limit = Math . max ( 1 , Math . min ( Number ( kwargs . limit ) || 20 , 50 ) ) ;
548+ const rawSinceDays = Number ( kwargs . since_days ) ;
549+ const sinceDays = Number . isFinite ( rawSinceDays ) && rawSinceDays > 0
550+ ? Math . max ( 1 , Math . min ( rawSinceDays , 3650 ) )
551+ : null ;
454552 const apiResult = await fetchJianyuApiRows ( page , query , limit ) ;
455553 const mergedRows = dedupeCandidates ( filterNavigationRows ( query , apiResult . rows ) ) ;
456554 const extractedRows = await searchRowsFromEntries ( page , {
@@ -465,21 +563,61 @@ cli({
465563 const indexedRows = await fetchDuckDuckGoIndexRows ( query , limit ) ;
466564 const filteredIndexedRows = dedupeCandidates ( filterNavigationRows ( query , indexedRows ) ) ;
467565 if ( filteredIndexedRows . length > 0 ) {
468- return toProcurementSearchRecords ( filteredIndexedRows , {
566+ const records = toProcurementSearchRecords ( filteredIndexedRows , {
469567 site : SITE ,
470568 query,
471569 limit,
472570 } ) ;
571+ const enriched = dedupeByNoticeKey ( records . map ( ( row ) => {
572+ const detailSignal = classifyDetailStatus ( row . url ) ;
573+ const publishedAt = normalizeDate ( row . publish_time || row . date ) ;
574+ return {
575+ ...row ,
576+ source_id : SITE ,
577+ notice_id : extractNoticeId ( row . url ) ,
578+ published_at : publishedAt ,
579+ detail_status : detailSignal . detail_status ,
580+ detail_reason : detailSignal . detail_reason ,
581+ } ;
582+ } ) )
583+ . filter ( ( row ) => row . detail_status === 'ok' )
584+ . filter ( ( row ) => sinceDays == null || isWithinSinceDays ( row . published_at , sinceDays ) )
585+ . slice ( 0 , limit )
586+ . map ( ( row , index ) => ( {
587+ ...row ,
588+ rank : index + 1 ,
589+ } ) ) ;
590+ return enriched ;
473591 }
474592 if ( apiResult . challenge || await isAuthRequired ( page ) ) {
475593 throw new AuthRequiredError ( DOMAIN , '[taxonomy=selector_drift] site=jianyu command=search blocked by human verification / access challenge' ) ;
476594 }
477595 }
478- return toProcurementSearchRecords ( rows , {
596+ const records = toProcurementSearchRecords ( rows , {
479597 site : SITE ,
480598 query,
481599 limit,
482600 } ) ;
601+ const enriched = dedupeByNoticeKey ( records . map ( ( row ) => {
602+ const detailSignal = classifyDetailStatus ( row . url ) ;
603+ const publishedAt = normalizeDate ( row . publish_time || row . date ) ;
604+ return {
605+ ...row ,
606+ source_id : SITE ,
607+ notice_id : extractNoticeId ( row . url ) ,
608+ published_at : publishedAt ,
609+ detail_status : detailSignal . detail_status ,
610+ detail_reason : detailSignal . detail_reason ,
611+ } ;
612+ } ) )
613+ . filter ( ( row ) => row . detail_status === 'ok' )
614+ . filter ( ( row ) => sinceDays == null || isWithinSinceDays ( row . published_at , sinceDays ) )
615+ . slice ( 0 , limit )
616+ . map ( ( row , index ) => ( {
617+ ...row ,
618+ rank : index + 1 ,
619+ } ) ) ;
620+ return enriched ;
483621 } ,
484622} ) ;
485623export const __test__ = {
@@ -494,4 +632,8 @@ export const __test__ = {
494632 normalizeApiRow,
495633 fetchJianyuApiRows,
496634 collectApiRowsFromResponses,
635+ classifyDetailStatus,
636+ extractNoticeId,
637+ isWithinSinceDays,
638+ dedupeByNoticeKey,
497639} ;
0 commit comments