Merge pull request #1107 from Parallel-in-Time/copilot/fix-ci-failure

pancetta · web-flow · commit 0a5cc094e6c3 · 2026-06-15T08:17:52.000+02:00
Fix arxivbot workflow: guard against non-BibTeX DOI responses and shell injection
diff --git a/.github/workflows/arxiv_to_publications_correct.yml b/.github/workflows/arxiv_to_publications_correct.yml
@@ -9,12 +9,14 @@ jobs:
       - uses: actions/checkout@v4
       - name: Look for bibtex entries that now have a DOI
         if: github.event.label.name == 'food for arxivbot'
+        env:
+          ISSUE_BODY: ${{ github.event.issue.body }}
         run: |
           cd bin
           python3 -m pip install --user --upgrade pip
           python3 -m pip install --user setuptools
-          python3 -m pip install --user bibtexparser
-          python3 arxiv_to_publications_correct.py -b "${{ github.event.issue.body }}" > comment.out 2>&1
+          python3 -m pip install --user requests bibtexparser
+          python3 arxiv_to_publications_correct.py -b "$ISSUE_BODY" > comment.out 2>&1
           {
             echo 'COMMENT<<GITHUB_OUTPUT_DELIMITER'
             cat comment.out
diff --git a/bin/arxiv_to_publications_correct.py b/bin/arxiv_to_publications_correct.py
@@ -32,15 +32,14 @@ def fetch_doi_content(url, accept_header, description):
 
     for url, id_db in zip(doi_list, id_list):
         print(f'Working on {id_db} with URL {url}')
-        req = fetch_doi_content(url, 'application/x-bibtex', 'BibTeX')
-        if req is None:
+        bibtex_req = fetch_doi_content(url, 'application/x-bibtex', 'BibTeX')
+        if bibtex_req is None:
             continue
-        bib = req.content.decode()
-        req = fetch_doi_content(url, 'application/json', 'metadata')
-        if req is None:
+        meta_req = fetch_doi_content(url, 'application/json', 'metadata')
+        if meta_req is None:
             continue
         try:
-            data = req.json()
+            data = meta_req.json()
         except ValueError as exc:
             print(f'Ignoring {url}, invalid metadata response: {exc}\n\n')
             continue
@@ -62,26 +61,40 @@ def fetch_doi_content(url, accept_header, description):
         if entries[id_db]["ENTRYTYPE"] != 'unpublished':
             print(f'Ignoring {id_db}, original entry in bib file was not unpublished.\n\n')
             continue
-        db.entries.remove(entries[id_db])
-
-        # Check for duplicate keys in the remaining database and add letter suffixes if needed
-        remaining = db.get_entry_dict()
-        id_orig = id
-        letters = 'bcdefghijklmnopqrstuvwxyz'
-        i = 0
-        while id in remaining:
-            print(f'Key {id} already exists, augmenting with letter suffix.')
-            id = id_orig + letters[i]
-            i += 1
 
-        if id != id_db:
-            print(f'Note: ID updated from {id_db} to {id} to reflect the publication year.')
+        # Parse the BibTeX and replace the key before modifying the database
+        try:
+            bib = bibtex_req.text
+            bType, *rest1 = bib.split("{")
+            if not rest1:
+                print(f'Ignoring {id_db}, DOI did not return valid BibTeX (no opening brace found).\n\n')
+                continue
+            oldID, *rest2 = rest1[0].split(",")
+            # Check for duplicate keys in the remaining database and add letter suffixes if needed
+            remaining = db.get_entry_dict()
+            del remaining[id_db]  # exclude the entry being replaced from duplicate check
+            id_orig = id
+            letters = 'bcdefghijklmnopqrstuvwxyz'
+            i = 0
+            while id in remaining:
+                print(f'Key {id} already exists, augmenting with letter suffix.')
+                id = id_orig + letters[i]
+                i += 1
+            if id != id_db:
+                print(f'Note: ID updated from {id_db} to {id} to reflect the publication year.')
+            bib = "{".join([bType] + [','.join([id]+rest2)] + rest1[1:])
+            bib_db = bibtexparser.loads(bib)
+            new_entries = bib_db.get_entry_list()
+            if not new_entries:
+                print(f'Ignoring {id_db}, could not parse BibTeX returned by DOI.\n\n')
+                continue
+        except Exception as exc:
+            print(f'Ignoring {id_db}, error processing BibTeX from DOI: {exc}\n\n')
+            continue
 
-        bType, *rest1 = bib.split("{")
-        oldID, *rest2 = rest1[0].split(",")
-        bib = "{".join([bType] + [','.join([id]+rest2)] + rest1[1:])
-        bib_db = bibtexparser.loads(bib)
-        db.entries.extend(bib_db.get_entry_list())
+        # Only mutate the database once we have a valid replacement entry
+        db.entries.remove(entries[id_db])
+        db.entries.extend(new_entries)
 
     if id_list:
         writer = BibTexWriter()