-
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfetchPackageUrl.ts
More file actions
133 lines (117 loc) · 4.12 KB
/
fetchPackageUrl.ts
File metadata and controls
133 lines (117 loc) · 4.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/**
* Fetch PMC article package URL from the OA Web Service API
*
* NCBI has migrated image serving to a new CDN infrastructure where direct image URLs
* cannot be constructed. Instead, images must be downloaded from article package files
* available via the PMC Open Access FTP service.
*
* @see https://pmc.ncbi.nlm.nih.gov/tools/oa-service/
* @see https://pmc.ncbi.nlm.nih.gov/tools/ftp/
*/
import axios from "axios";
import * as xml2js from "xml2js";
/** Endpoint of the PMC OA Web Service; it is queried with an `id` parameter to look up an article's records. */
const OA_API_BASE_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi";
/**
 * Download locations and record attributes for a single PMC article, as
 * produced by `fetchPackageUrl` from an OA Web Service response.
 */
export interface PackageInfo {
/** PMC ID of the article, always carrying the "PMC" prefix. */
pmcId: string;
/** `href` of the record's link with format "tgz", with an "ftp://" scheme rewritten to "https://". */
tgzUrl?: string;
/** `href` of the record's link with format "pdf", with an "ftp://" scheme rewritten to "https://". */
pdfUrl?: string;
/** Value of the record's `license` attribute, passed through unmodified. */
license?: string;
/** Value of the record's `citation` attribute, passed through unmodified. */
citation?: string;
/** Value of the record's `retracted` attribute, passed through as a string (presumably "yes"/"no" — confirm against the OA API docs). */
retracted?: string;
}
/**
 * Normalize a PMC ID to the canonical "PMC<id>" form.
 *
 * Surrounding whitespace is removed and an existing prefix is matched
 * case-insensitively, so "123", "PMC123", "pmc123" and " PMC123 " all yield "PMC123".
 *
 * @throws Error if nothing is left once whitespace and the prefix are removed
 */
function normalizePmcId(pmcId: string): string {
  const bareId = pmcId.trim().replace(/^pmc/i, "");
  if (bareId === "") {
    throw new Error(`Invalid PMC ID: "${pmcId}"`);
  }
  return `PMC${bareId}`;
}

/**
 * Render the parsed `<error>` node of an OA response as readable text.
 *
 * xml2js represents an element that has attributes as an object (text under
 * `_`, attributes under `$`), so interpolating the node directly would
 * produce "[object Object]". Plain strings and repeated elements (arrays)
 * are handled as well.
 */
function describeOaError(error: unknown): string {
  if (typeof error === "string") {
    return error;
  }
  if (Array.isArray(error)) {
    return error.map(describeOaError).join("; ");
  }
  if (error !== null && typeof error === "object") {
    const node = error as { _?: unknown; $?: { code?: unknown } };
    const rawText = node._;
    const rawCode = node.$?.code;
    const text = typeof rawText === "string" ? rawText.trim() : "";
    const code = typeof rawCode === "string" ? rawCode : "";
    if (text && code) {
      return `${text} (code: ${code})`;
    }
    // Fall back to a JSON dump so an unexpected shape is still diagnosable.
    return text || code || JSON.stringify(error);
  }
  return String(error);
}

/**
 * Turn a link `href` into an HTTPS URL for better compatibility.
 *
 * Only a leading "ftp://" scheme is rewritten; any other URL is returned
 * unchanged. Each rewrite is logged.
 *
 * @returns The usable URL, or `undefined` when `href` is missing or empty
 */
function toHttpsUrl(href: unknown): string | undefined {
  if (typeof href !== "string" || href === "") {
    return undefined;
  }
  const httpsUrl = href.replace(/^ftp:\/\//i, "https://");
  if (httpsUrl !== href) {
    console.log(`Converted FTP to HTTPS: ${href} -> ${httpsUrl}`);
  }
  return httpsUrl;
}

/**
 * Fetch package URL for a PMC article from the OA Web Service API
 *
 * Queries the OA service for the article, reads the first record of the
 * response and collects its `license`, `citation` and `retracted` attributes
 * together with the "tgz" and "pdf" download links.
 *
 * @param pmcId - The PMC ID (with or without "PMC" prefix, any letter case)
 * @returns Package information including tar.gz URL containing images
 * @throws Error if the ID is empty, the API request fails, the API reports an
 *   error, or the article has no downloadable package in the Open Access subset
 */
export async function fetchPackageUrl(pmcId: string): Promise<PackageInfo> {
  // Ensure PMC ID has the "PMC" prefix
  const pmcIdWithPrefix = normalizePmcId(pmcId);

  try {
    const response = await axios.get(OA_API_BASE_URL, {
      params: { id: pmcIdWithPrefix },
      timeout: 10000,
    });

    // explicitArray: false makes single children plain values instead of
    // one-element arrays, so every node below may be a value OR an array.
    const parser = new xml2js.Parser({ explicitArray: false });
    const result = await parser.parseStringPromise(response.data);
    const oa = result?.OA;

    // Check if we got an error response
    if (oa?.error) {
      throw new Error(`OA API error: ${describeOaError(oa.error)}`);
    }

    // Handle both single record and array of records; only the first is used
    const records = oa?.records?.record;
    const record = Array.isArray(records) ? records[0] : records;
    if (!record) {
      throw new Error(`Article ${pmcIdWithPrefix} not found in Open Access subset`);
    }

    const packageInfo: PackageInfo = {
      pmcId: pmcIdWithPrefix,
      license: record.$?.license,
      citation: record.$?.citation,
      retracted: record.$?.retracted,
    };

    // Extract download links
    const rawLinks = record.link;
    const links = rawLinks == null ? [] : Array.isArray(rawLinks) ? rawLinks : [rawLinks];
    for (const link of links) {
      const format = link?.$?.format;
      if (format !== "tgz" && format !== "pdf") {
        continue;
      }
      const url = toHttpsUrl(link.$.href);
      if (url === undefined) {
        // A link without an href must not overwrite a URL found earlier.
        continue;
      }
      if (format === "tgz") {
        packageInfo.tgzUrl = url;
      } else {
        packageInfo.pdfUrl = url;
      }
    }

    if (!packageInfo.tgzUrl) {
      throw new Error(
        `No downloadable package found for ${pmcIdWithPrefix}. Article may not be in Open Access subset.`,
      );
    }
    return packageInfo;
  } catch (error) {
    // Transport-level failures get the article ID added for context;
    // everything else (API errors, XML parse errors) propagates untouched.
    if (axios.isAxiosError(error)) {
      throw new Error(`Failed to fetch package URL for ${pmcIdWithPrefix}: ${error.message}`);
    }
    throw error;
  }
}
/**
 * Look up package URLs for a list of PMC articles, one request at a time.
 *
 * The OA API has no batch endpoint, so each ID is fetched individually and a
 * pause is inserted between consecutive requests to stay within rate limits.
 * A failed lookup is logged and skipped, so the returned array may be shorter
 * than `pmcIds`.
 *
 * @param pmcIds - Array of PMC IDs
 * @param delayMs - Delay between requests in milliseconds (default: 334ms = ~3 req/sec)
 * @returns Package information for every ID that was fetched successfully, in input order
 */
export async function fetchPackageUrlsBatch(pmcIds: string[], delayMs: number = 334): Promise<PackageInfo[]> {
  const collected: PackageInfo[] = [];

  // Resolves once the configured throttle interval has elapsed.
  const pause = (): Promise<void> =>
    new Promise<void>((done) => {
      setTimeout(done, delayMs);
    });

  let position = 0;
  while (position < pmcIds.length) {
    const currentId = pmcIds[position];

    try {
      collected.push(await fetchPackageUrl(currentId));
    } catch (failure) {
      // One bad ID must not abort the rest of the batch.
      console.error(`Error fetching package URL for ${currentId}:`, failure);
    }

    // Throttle only when another request will follow.
    const moreToCome = position + 1 < pmcIds.length;
    if (moreToCome) {
      await pause();
    }

    position += 1;
  }

  return collected;
}