fix(charxiv): lazy-init OpenAI client and make model version configurable (#1252)

MaxwellJryao · Super User · web-flow · commit a130a0cb1c49 · 2026-03-15T17:09:07.000+08:00
- Replace module-level OpenAI client with lazy _get_client() to avoid
  SSLContext pickling errors in datasets.map() multiprocessing
- Remove num_proc=1 from dataset.map() (no longer needed)
- Add model parameter to get_reasoning_result_gpt() so MODEL_VERSION
  env var is respected for both descriptive and reasoning grading

Co-authored-by: Super User <root@TENCENT64.site>
diff --git a/lmms_eval/tasks/charxiv/reasoning_utils.py b/lmms_eval/tasks/charxiv/reasoning_utils.py
@@ -9,7 +9,7 @@
 )
 
 
-def get_reasoning_result_gpt(client, prompt, max_retries=10):
+def get_reasoning_result_gpt(client, prompt, model="gpt-4o-2024-05-13", max_retries=10):
     curr_retries = 0
     max_tokens = 256
     while curr_retries < max_retries:
@@ -22,7 +22,7 @@ def get_reasoning_result_gpt(client, prompt, max_retries=10):
                             "content": prompt,
                         }
                     ],
-                    model="gpt-4o-2024-05-13",
+                    model=model,
                     response_format={"type": "json_object"},
                     n=1,
                     max_tokens=max_tokens,
diff --git a/lmms_eval/tasks/charxiv/utils.py b/lmms_eval/tasks/charxiv/utils.py
@@ -22,7 +22,17 @@
 OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "YOUR_OPENAI_BASE_URL")
 MODEL_VERSION = os.getenv("MODEL_VERSION", "YOUR_MODEL_VERSION")
 
-client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
+# Lazy-initialize the OpenAI client to avoid creating an SSLContext at import time.
+# An SSLContext cannot be pickled, which breaks dataset.map() multiprocessing even
+# with num_proc=1 on newer versions of the `datasets` library.
+_client = None
+
+
+def _get_client():
+    global _client
+    if _client is None:
+        _client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
+    return _client
 
 
 def charxiv_reasoning_doc_to_text_cot(doc, lmms_eval_specific_kwargs=None):
@@ -53,7 +63,9 @@ def _process_row(example, indice):
         example["descriptive_a"] = example[f"descriptive_a{q_number}"]
         return {"qid": qid, **example}
 
-    dataset = dataset.map(_process_row, with_indices=True, num_proc=1)
+    # Leave num_proc unset (it defaults to None: single-process, no pickling) to
+    # avoid SSLContext serialization errors from the lazy OpenAI client in module globals.
+    dataset = dataset.map(_process_row, with_indices=True)
     return dataset
 
 
@@ -99,7 +111,7 @@ def charxiv_descriptive_aggregate_results(results):
     queries = build_descriptive_grading_queries(groups)
     combined_queries = []
     for query in tqdm(queries):
-        result = get_descriptive_result_gpt(client, query["grading_query"], len(query["resp_keys"]), model=MODEL_VERSION)
+        result = get_descriptive_result_gpt(_get_client(), query["grading_query"], len(query["resp_keys"]), model=MODEL_VERSION)
         # query contains resp_keys, grading_query, extract_answer and score
         combined_queries.append({**query, **result})
     queries = combined_queries
@@ -131,7 +143,7 @@ def charxiv_reasoning_aggregate_results(results):
         resps[result["resp_key"]] = result["resp_value"]
     queries = build_reasoning_grading_queries(data, resps)
     for figure_id, query in tqdm(queries.items()):
-        ext, scr = get_reasoning_result_gpt(client, query["grading_query"])
+        ext, scr = get_reasoning_result_gpt(_get_client(), query["grading_query"], model=MODEL_VERSION)
         queries[figure_id]["extracted_answer"] = ext
         queries[figure_id]["score"] = scr
         queries[figure_id].pop("grading_query")