Skip to content

Commit a3f4a70

Browse files
committed
Fix link extraction from baidu
1 parent 66f200d commit a3f4a70

File tree

1 file changed

+54
-3
lines changed

1 file changed

+54
-3
lines changed

Tool/Sources/WebSearchService/SearchServices/HeadlessBrowserSearchService.swift

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ struct HeadlessBrowserSearchService: SearchService {
3838
case .google:
3939
return try GoogleSearchResultParser.parse(html: html)
4040
case .baidu:
41-
return BaiduSearchResultParser.parse(html: html)
41+
return await BaiduSearchResultParser.parse(html: html)
4242
case .duckDuckGo:
4343
return DuckDuckGoSearchResultParser.parse(html: html)
4444
case .bing:
@@ -85,8 +85,58 @@ enum BaiduSearchResultParser {
8585
static func validate(document: SwiftSoup.Document) -> Bool {
8686
return (try? document.select("#content_left").first()) != nil
8787
}
88+
89+
/// Resolves a Baidu result link to the URL it redirects to.
///
/// Performs a GET request against `baiduLink` with a desktop Safari
/// User-Agent and cookies enabled, letting `RedirectCapturer` record the
/// target of the most recent HTTP redirect that occurs while loading it.
///
/// - Parameter baiduLink: The link string extracted from the result page.
/// - Returns: The absolute string of the last redirect target observed, or
///   `baiduLink` unchanged when the string is not a valid URL, the request
///   fails, or no redirect happened.
static func getRealLink(from baiduLink: String) async -> String {
    guard let url = URL(string: baiduLink) else {
        return baiduLink
    }

    let config = URLSessionConfiguration.default
    config.httpShouldSetCookies = true
    config.httpCookieAcceptPolicy = .always

    var request = URLRequest(url: url)
    request.httpMethod = "GET"
    request.addValue(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
        forHTTPHeaderField: "User-Agent"
    )

    let redirectCapturer = RedirectCapturer()
    let session = URLSession(
        configuration: config,
        delegate: redirectCapturer,
        delegateQueue: nil
    )
    // A URLSession keeps a strong reference to its delegate until it is
    // invalidated. Without this, every resolved link would leak one session
    // and one RedirectCapturer for the lifetime of the process.
    defer { session.finishTasksAndInvalidate() }

    do {
        // Only the redirects matter here; the response body is discarded.
        _ = try await session.data(for: request)
        return redirectCapturer.finalURL?.absoluteString ?? baiduLink
    } catch {
        // Best effort: fall back to the original link on any request failure.
        return baiduLink
    }
}
124+
125+
/// Session task delegate that remembers where the most recent HTTP redirect
/// pointed, while still allowing the redirect to be followed.
class RedirectCapturer: NSObject, URLSessionTaskDelegate {
    /// URL of the latest redirect request seen; `nil` until a redirect occurs.
    var finalURL: URL?

    /// Records the redirect target, then lets the session follow it.
    ///
    /// Each call overwrites `finalURL`, so after a chain of redirects it
    /// holds the target of the last one.
    func urlSession(
        _ session: URLSession,
        task: URLSessionTask,
        willPerformHTTPRedirection redirectResponse: HTTPURLResponse,
        newRequest redirectedRequest: URLRequest,
        completionHandler: @escaping (URLRequest?) -> Void
    ) {
        self.finalURL = redirectedRequest.url
        // Passing the proposed request back tells the session to follow it.
        completionHandler(redirectedRequest)
    }
}
139+
static func parse(html: String) async -> WebSearchResult {
90140
let document = try? SwiftSoup.parse(html)
91141
let elements = try? document?.select("#content_left").first()?.children()
92142

@@ -97,6 +147,7 @@ enum BaiduSearchResultParser {
97147
let link = try? element.select("a").attr("href"),
98148
link.hasPrefix("http")
99149
{
150+
let realLink = await getRealLink(from: link)
100151
let title = (try? titleElement.text()) ?? ""
101152
let snippet = {
102153
let abstract = try? element.select("div[data-module=\"abstract\"]").text()
@@ -106,7 +157,7 @@ enum BaiduSearchResultParser {
106157
return (try? titleElement.nextElementSibling()?.text()) ?? ""
107158
}()
108159
results.append(WebSearchResult.WebPage(
109-
urlString: link,
160+
urlString: realLink,
110161
title: title,
111162
snippet: snippet
112163
))

0 commit comments

Comments
 (0)