import re
from html import unescape


def normalize_month_fields(bib):
    """Rewrite unbraced ``month`` fields of a BibTeX string as braced literals.

    Every ``month = <value>,`` field that starts a line (leading whitespace
    allowed, field name matched case-insensitively) is rewritten to
    ``month = {<cleaned>},``.  A value that is already wrapped in braces is
    left exactly as it was found.

    Cleaning decodes HTML entities, drops surrounding quotes/braces and then
    keeps only the ASCII letters of the value (``"Jun."`` -> ``Jun``).  When
    the value contains no letters at all, its digits are kept instead, so a
    numeric month such as ``month = 6,`` becomes ``month = {6},`` rather than
    an empty ``month = {},``.

    Args:
        bib: BibTeX source text.

    Returns:
        The text with the matching month fields normalized; text without a
        matching month field is returned unchanged.
    """

    def replace_month(match):
        prefix, raw_value = match.groups()
        value = unescape(raw_value).strip()

        # Already a braced literal: keep the field byte-for-byte.
        if value.startswith("{") and value.endswith("}"):
            return match.group(0)

        if value.startswith('"') and value.endswith('"'):
            value = value[1:-1].strip()
        value = value.strip("'\"{}")

        letters = re.sub(r"[^A-Za-z]", "", value)
        # Numeric months carry no letters; fall back to the digits so the
        # month is not silently replaced by an empty value.
        cleaned = letters or re.sub(r"[^0-9]", "", value)
        return f"{prefix}{{{cleaned}}},"

    # The field must be terminated by a comma; whitespace between the value
    # and that comma is dropped by the replacement.
    return re.sub(
        r"(^\s*month\s*=\s*)([^,\n]+)\s*,",
        replace_month,
        bib,
        flags=re.MULTILINE | re.IGNORECASE,
    )