Skip to content

Commit 740b403

Browse files
authored
[enhancement] Added ability to resume from cache (#217)
Added the ability to pause the tool and resume where you left off. Read the README for more details.
1 parent 4db21a0 commit 740b403

7 files changed

Lines changed: 96 additions & 19 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this p
66

77
To see tags and releases, please go to [Tags](https://github.com/AlexJSully/Publication-Figure-Retrieval/tags) on [GitHub](https://github.com/AlexJSully/Publication-Figure-Retrieval).
88

9+
## [3.0.1] - 2024-08-26
10+
11+
Feature:
12+
13+
- Re-added the ability to resume the process if it was canceled
14+
915
## [3.0.0] - 2024-08-25
1016

1117
The `Publication Figures Web Scraper` has been renamed to `Publication Figure Retrieval` as it no longer scrapes data from the web. Instead, it retrieves data from the NCBI API. This major change was done to comply with the NCBI's terms of service and policies.

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,19 @@ Then run
3030
npm install
3131
```
3232

33-
followed by
33+
### Running locally
34+
35+
To start and run the publication figure retrieval tool, run the following command:
3436

3537
```bash
36-
npm start
38+
npm run start
3739
```
3840

39-
This tool runs within your Node.js environment. On Windows, this script may need to run in administrator mode.
41+
If you choose to cancel this process at any time, you can resume and continue where you left off by running the same command. The tool stores the already processed PMC IDs in `build/output/cache/id.json`. To reset the cache, delete the `id.json` file.
4042

4143
### Usage
4244

43-
The images are downloaded locally within the `build/processor/output` directory.
45+
The images are downloaded locally within the `build/output` directory. They are organized by species then by publication ID.
4446

4547
### API Key
4648

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "publication-figure-retriever",
3-
"version": "3.0.0",
3+
"version": "3.0.1",
44
"description": "This tool provides a method for retrieving figures from NCBI's PubMed publications using NIH APIs for open access and publicly available publications.",
55
"main": "index.ts",
66
"scripts": {

src/processor/fetchArticleDetails.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import axios from "axios";
2+
import fs from "fs";
3+
import path from "path";
24
import { fetchArticleDetails } from "./fetchArticleDetails";
35
import { parseFigures } from "./parseFigures";
46

@@ -9,9 +11,27 @@ describe("fetchArticleDetails", () => {
911
const throttle = jest.fn((fn) => fn());
1012
const pmids = ["PMC123456", "PMC654321"];
1113
const species = "Homo sapiens";
14+
const cachedIDsFilePath = path.resolve(__dirname, "../output/cache/id.json");
1215

1316
beforeEach(() => {
1417
jest.clearAllMocks();
18+
const dir = path.dirname(cachedIDsFilePath);
19+
if (!fs.existsSync(dir)) {
20+
fs.mkdirSync(dir, { recursive: true });
21+
}
22+
if (fs.existsSync(cachedIDsFilePath)) {
23+
fs.unlinkSync(cachedIDsFilePath);
24+
}
25+
// Ensure the cached ID file is empty
26+
fs.writeFileSync(cachedIDsFilePath, JSON.stringify([]));
27+
});
28+
29+
afterEach(() => {
30+
// Delete the output directory
31+
const dir = path.dirname(cachedIDsFilePath);
32+
if (fs.existsSync(dir)) {
33+
fs.rmdirSync(dir, { recursive: true });
34+
}
1535
});
1636

1737
it("should fetch article details in batches and call parseFigures", async () => {
@@ -38,4 +58,21 @@ describe("fetchArticleDetails", () => {
3858

3959
consoleErrorSpy.mockRestore();
4060
});
61+
62+
it("should cache fetched IDs and skip already cached IDs", async () => {
63+
const mockResponse = { data: "<xml>mock data</xml>" };
64+
(axios.get as jest.Mock).mockResolvedValue(mockResponse);
65+
66+
// Initial fetch to cache the IDs
67+
await fetchArticleDetails(throttle, pmids, species);
68+
69+
expect(fs.existsSync(cachedIDsFilePath)).toBe(true);
70+
const cachedIDs = JSON.parse(fs.readFileSync(cachedIDsFilePath, "utf-8"));
71+
expect(cachedIDs).toEqual(pmids);
72+
73+
// Fetch again with the same IDs, should skip fetching
74+
await fetchArticleDetails(throttle, pmids, species);
75+
76+
expect(axios.get).toHaveBeenCalledTimes(1); // Should not call axios.get again
77+
});
4178
});

src/processor/fetchArticleDetails.ts

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import axios from "axios";
2+
import fs from "fs";
3+
import path from "path";
24
import { parseFigures } from "./parseFigures";
35

46
/**
@@ -19,34 +21,64 @@ import { parseFigures } from "./parseFigures";
1921
export async function fetchArticleDetails(
2022
/** The throttling function to control the rate of API requests. */
2123
throttle: any,
22-
/** An array of PMCIDs to fetch details for. */
24+
/** An array of PMC IDs to fetch details for. */
2325
pmids: string[],
2426
/** The species name to be used in the processing of figures. */
2527
species: string,
2628
): Promise<void> {
27-
/** Number of PMCIDs per batch. */
29+
/** Number of PMC IDs per batch. */
2830
const batchSize = 50;
2931

32+
// Grab cached IDs
33+
/** Path to the cached IDs file. */
34+
const cachedIDsFilePath = path.resolve(__dirname, "../output/cache/id.json");
35+
/** Cached IDs list. */
36+
let cachedIDs: string[] = [];
37+
// Check if the cached IDs file exists
38+
if (fs.existsSync(cachedIDsFilePath)) {
39+
const data = fs.readFileSync(cachedIDsFilePath, "utf-8");
40+
cachedIDs = JSON.parse(data);
41+
} else {
42+
// Create the directory if it doesn't exist
43+
fs.mkdirSync(path.dirname(cachedIDsFilePath), { recursive: true });
44+
}
45+
46+
// Get article details based on PMC IDs
3047
for (let i = 0; i < pmids.length; i += batchSize) {
31-
// Extract a batch of 50 PMCIDs
32-
/** A batch of 50 PMCIDs. */
48+
// Extract a batch of 50 PMC IDs
3349
const batch = pmids.slice(i, i + batchSize);
34-
/** Comma-separated list of PMCIDs. */
35-
const ids = batch.join(",");
36-
/** The URL to fetch article details for the current batch. */
50+
51+
// Filter out IDs that are already cached
52+
const newBatch = batch.filter((id) => !cachedIDs.includes(id));
53+
54+
if (newBatch.length === 0) {
55+
console.log(
56+
`All IDs in ${species.replace("_", " ")} batch ${i + 1}-${i + batch.length} are already cached.`,
57+
);
58+
59+
continue;
60+
}
61+
62+
/** Comma-separated string of PMC IDs for the batch. */
63+
const ids = newBatch.join(",");
64+
/** URL for fetching article details from the NCBI API. */
3765
let url = `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=${ids}&retmode=xml`;
38-
// Check if there is a NCBI API key available and if so, add it to the URL
66+
// Add the API key if available
3967
if (process?.env?.NCBI_API_KEY) {
4068
url += `&api_key=${process.env.NCBI_API_KEY}`;
4169
}
4270

43-
console.log(`Fetching article details for batch ${i + 1}-${i + batch.length}...`);
71+
console.log(
72+
`Fetching ${species.replace("_", " ")} article details for batch ${i + 1}-${i + newBatch.length}...`,
73+
);
4474

4575
try {
46-
// Make HTTP request to fetch article details in XML format for the current batch
47-
/** The response from the API request. */
4876
const response = await throttle(async () => await axios.get(url));
4977
await parseFigures(throttle, response.data, species);
78+
79+
// Add the new IDs to the cached list and write to the file
80+
cachedIDs.push(...newBatch);
81+
fs.writeFileSync(cachedIDsFilePath, JSON.stringify(cachedIDs, null, 2));
5082
} catch (error) {
5183
console.error("Error fetching article details:", error);
5284
}

src/processor/parseFigures.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ export async function parseFigures(
5656
// Download all figures for this article
5757
for (const url of figureUrls) {
5858
// Create the directory path for species and PMC ID
59-
const outputDir = path.join(__dirname, "output", species, pmcId);
59+
const outputDir = path.join(__dirname, "../output", species, pmcId);
6060
if (!fs.existsSync(outputDir)) {
6161
fs.mkdirSync(outputDir, { recursive: true });
6262
}

0 commit comments

Comments
 (0)