Various fixes to vector space alignment (VSA); disable graph pipeline for now

clovis · clovis · commit ec5bac5e9388 · 2026-03-13T14:37:48.000-05:00
diff --git a/lib/pyproject.toml b/lib/pyproject.toml
@@ -24,7 +24,8 @@ dependencies = [
     "pystemmer",
     "lxml",
     "namedlist",
-    "sentence-transformers~=5.1",
+    "sentence-transformers~=5.3",
+    "transformers~=4.51.0",
     "lz4",
     "orjson",
     "text_preprocessing @ git+https://github.com/ARTFL-Project/text-preprocessing@v1.1.1.3#egg=text_preprocessing",
@@ -39,7 +40,6 @@ dependencies = [
     "ahocorasick-rs",
     "msgspec",
     "faiss-cpu",
-    "spacy-transformers",
     "networkx~=3.5",
     "torch-geometric~=2.7.0",
     "umap-learn~=0.5.9",
@@ -54,7 +54,7 @@ Documentation = "https://github.com/ARTFL-Project/text-pair#readme"
 "Bug Tracker" = "https://github.com/ARTFL-Project/text-pair/issues"
 
 [project.scripts]
-textpair = "textpair.__main__:main"
+textpair = "textpair.__main__:run"
 
 [project.optional-dependencies]
 cpu = [
diff --git a/lib/textpair.egg-info/entry_points.txt b/lib/textpair.egg-info/entry_points.txt
@@ -1,2 +1,2 @@
 [console_scripts]
-textpair = textpair.__main__:main
+textpair = textpair.__main__:run
diff --git a/lib/textpair/__main__.py b/lib/textpair/__main__.py
@@ -3,6 +3,7 @@
 
 import configparser
 import os
+import shutil
 import subprocess
 import sys
 
@@ -302,10 +303,10 @@ async def run_alignment(params):
     groups_file = merge_alignments(results_file, count)
 
     if params.web_app_config["skip_web_app"] is False:
-        # Build graph model and generate cluster labels
-        print(f"\n### Building Thematic Identity Graph model ###")
-        embedding_model = params.preprocessing_params["source"]["embedding_model"]
-        build_graph_and_labels(results_file, embedding_model, params.llm_params)
+        # Graph pipeline disabled for now
+        # print(f"\n### Building Thematic Identity Graph model ###")
+        # embedding_model = params.preprocessing_params["source"]["embedding_model"]
+        # build_graph_and_labels(results_file, embedding_model, params.llm_params)
 
         create_web_app(
             results_file,
@@ -383,9 +384,9 @@ async def run_vsa_similarity(params) -> None:
         output_file = os.path.join(params.output_path, "results/alignments.jsonl.lz4")
         count = get_count(os.path.join(params.output_path, "results/counts.txt"))
 
-        # Build graph model and generate cluster labels
-        embedding_model = params.preprocessing_params["source"]["embedding_model"]
-        build_graph_and_labels(output_file, embedding_model, params.llm_params)
+        # Graph pipeline disabled for now
+        # embedding_model = params.preprocessing_params["source"]["embedding_model"]
+        # build_graph_and_labels(output_file, embedding_model, params.llm_params)
 
         create_web_app(
             output_file,
@@ -407,6 +408,13 @@ async def run_vsa_similarity(params) -> None:
 async def main():
     """Main entry point for the textpair CLI."""
     params = get_config()
+
+    # Save a copy of the config file to the output directory for reproducibility
+    config_file = params.config
+    if config_file and os.path.exists(config_file):
+        os.makedirs(params.output_path, exist_ok=True)
+        shutil.copy2(config_file, os.path.join(params.output_path, f"{params.dbname}_config.ini"))
+
     if params.delete is True:
         delete_database(params.dbname)
     elif params.update_db is True:
@@ -462,7 +470,12 @@ async def main():
         await run_vsa_similarity(params)
 
 
-if __name__ == "__main__":
+def run():
+    """Sync entry point for console_scripts."""
     import asyncio
 
     asyncio.run(main())
+
+
+if __name__ == "__main__":
+    run()
diff --git a/lib/textpair/vector_space_alignment/__init__.py b/lib/textpair/vector_space_alignment/__init__.py
@@ -240,7 +240,7 @@ def generate_merged_groups():
         return count
 
     # Perform iterative merging with streaming
-    while last_count / current_count <= 1.0:
+    while last_count / current_count < 1.0:
         last_count = current_count
 
         # Alternate between temp databases
diff --git a/lib/textpair/vector_space_alignment/corpus.py b/lib/textpair/vector_space_alignment/corpus.py
@@ -453,7 +453,7 @@ def sim_function(x, y):
         )
 
         if model is None:
-            self.model = SentenceTransformer(model_name, trust_remote_code=False)
+            self.model = SentenceTransformer(model_name)
         else:
             self.model = model
 
@@ -484,7 +484,7 @@ def create_embeddings(self, text_chunks) -> torch.Tensor:
         tensor = self.model.encode(
             list(text_chunks),
             convert_to_tensor=True,
-            batch_size=512,
+            batch_size=32,
             show_progress_bar=False,
             normalize_embeddings=True,
         )
diff --git a/lib/textpair/vector_space_alignment/expansion.py b/lib/textpair/vector_space_alignment/expansion.py
@@ -328,8 +328,6 @@ async def expand_validated_matches(
     expansion_candidates = []
     final_matches = []
 
-    print("Identifying expansion candidates...", flush=True)
-
     for match in matches:
         source_sents = count_sentences_from_tokens(
             match.source.metadata["parsed_filename"],
@@ -354,31 +352,20 @@ async def expand_validated_matches(
     total_candidates = len(expansion_candidates)
 
     if expansion_candidates:
-        print(
-            f"Processing {total_candidates} expansion candidates in chunks of {chunk_size}...",
-            flush=True,
-        )
-
-        with tqdm(
-            total=total_candidates,
-            desc="Looking for potential passage expansions",
-            leave=False,
-        ) as pbar:
+        with tqdm(total=total_candidates, desc="Expanding short passages", unit="passage") as pbar:
             for i in range(0, total_candidates, chunk_size):
                 chunk = expansion_candidates[i : i + chunk_size]
                 chunk_expansion_count = await _process_expansion_chunk(chunk, evaluator)
                 expansion_count += chunk_expansion_count
+                pbar.update(len(chunk))
 
-                # Add processed matches to final results
                 for match, _, _ in chunk:
                     final_matches.append(match)
 
-                pbar.update(len(chunk))
-
-    print(
-        f"Looking for potential passage expansions: expanded {expansion_count} passages.",
-        flush=True,
-    )
+        print(
+            f"Expansion complete: {expansion_count}/{total_candidates} passages expanded.",
+            flush=True,
+        )
 
     return final_matches
 
@@ -408,8 +395,8 @@ async def _process_expansion_chunk(chunk: list[tuple[MergedGroup, int, int]], ev
     prev_pairs = [(exp["source_text"], exp["target_text"]) for exp in step1_prev_expansions]
     next_pairs = [(exp["source_text"], exp["target_text"]) for exp in step1_next_expansions]
 
-    prev_results = await evaluator.evaluate_batch(prev_pairs, batch_size=8)
-    next_results = await evaluator.evaluate_batch(next_pairs, batch_size=8)
+    prev_results = await evaluator.evaluate_batch(prev_pairs, batch_size=8, show_progress=False)
+    next_results = await evaluator.evaluate_batch(next_pairs, batch_size=8, show_progress=False)
 
     # --- Step 2: Determine winners and prepare next step ---
     step2_candidates = []
@@ -459,8 +446,8 @@ async def _process_expansion_chunk(chunk: list[tuple[MergedGroup, int, int]], ev
         )
         expansion_count += 1
 
-        # Prepare step 2 expansion
-        step2_candidates.append(_prepare_expansion_step(original_match, step=2, direction=step2_direction))
+        # Prepare step 2 expansion (+1 more sentence beyond what step 1 already added)
+        step2_candidates.append(_prepare_expansion_step(original_match, step=1, direction=step2_direction))
         step2_directions.append(step2_direction)
         step2_match_map.append(original_match)
 
@@ -469,7 +456,7 @@ async def _process_expansion_chunk(chunk: list[tuple[MergedGroup, int, int]], ev
 
     # --- Step 3: Evaluate step 2 expansions ---
     step2_pairs = [(exp["source_text"], exp["target_text"]) for exp in step2_candidates]
-    step2_results = await evaluator.evaluate_batch(step2_pairs, batch_size=8)
+    step2_results = await evaluator.evaluate_batch(step2_pairs, batch_size=8, show_progress=False)
 
     for i, (step2_score, _, _) in enumerate(step2_results):
         original_match = step2_match_map[i]
diff --git a/lib/textpair/vector_space_alignment/structures.py b/lib/textpair/vector_space_alignment/structures.py
@@ -304,13 +304,22 @@ def __init__(self, matches: Iterable[MergedGroup]):
     def match_generator(self, new_matches):
         for match in new_matches:
             dump = ENCODER.encode(match)
-            yield (self.count, dump)
+            yield (
+                self.count,
+                dump,
+                match.source.filename,
+                match.target.filename,
+                match.source.start_byte,
+                match.source.start_byte - match.source.end_byte,
+                match.target.start_byte,
+                match.target.start_byte - match.target.end_byte,
+            )
             self.count += 1
 
     def extend(self, new_matches: Iterable[MergedGroup]):
         """Add new matches to existing matches"""
         encoded_matches = self.match_generator(new_matches)
-        self.cursor.executemany("INSERT INTO matches VALUES (?, ?)", encoded_matches)
+        self.cursor.executemany("INSERT INTO matches VALUES (?, ?, ?, ?, ?, ?, ?, ?)", encoded_matches)
 
     def __save(self, matches):
         count = 0
diff --git a/lib/textpair_llm/textpair_llm/llm_evaluation.py b/lib/textpair_llm/textpair_llm/llm_evaluation.py
@@ -156,7 +156,7 @@ def stop_server(self):
             self.server_process = None
 
     async def evaluate_batch(
-        self, passage_pairs: list[tuple[str, str]], batch_size: int = 8
+        self, passage_pairs: list[tuple[str, str]], batch_size: int = 8, show_progress: bool = True
     ) -> list[tuple[float, str, str]]:
         """
         Evaluate multiple passage pairs concurrently
@@ -203,7 +203,7 @@ async def evaluate_single(session, source_text, target_text):
         results = []
         total_pairs = len(passage_pairs)
 
-        with tqdm(total=total_pairs, desc="LLM Evaluation", unit="pairs", leave=False) as pbar:
+        with tqdm(total=total_pairs, desc="LLM Evaluation", unit="pairs", leave=False, disable=not show_progress) as pbar:
             for i in range(0, len(passage_pairs), batch_size):
                 batch = passage_pairs[i : i + batch_size]
 
@@ -425,21 +425,30 @@ def _parse_llm_response(self, response: str) -> tuple[float, str, str]:
                     stance = stance_match.group(1).strip().capitalize()
                     break
 
-            # Try multiple score patterns
+            # Try multiple score patterns — ordered from most to least specific
             score_patterns = [
                 r"Score:\s*([0-9]*\.?[0-9]+)",  # "Score: 0.8"
                 r"score:\s*([0-9]*\.?[0-9]+)",  # "score: 0.8" (lowercase)
-                r"([0-9]*\.?[0-9]+)",  # Just a number anywhere
             ]
 
             score = 0.0
+            score_found = False
             for pattern in score_patterns:
                 score_match = re.search(pattern, response, re.IGNORECASE)
                 if score_match:
                     score = float(score_match.group(1))
                     score = max(0.0, min(1.0, score))  # Clamp to valid range
+                    score_found = True
                     break
 
+            # Last resort: find a decimal number (0.XX) that looks like a score,
+            # but only match numbers with a decimal point to avoid grabbing years or counts
+            if not score_found:
+                fallback_match = re.search(r"\b(0\.\d+|1\.0+)\b", response)
+                if fallback_match:
+                    score = float(fallback_match.group(1))
+                    score = max(0.0, min(1.0, score))
+
             return score, reasoning, stance
 
         except Exception as e:
@@ -521,13 +530,14 @@ def create_similarity_evaluation_prompt(source_text: str, target_text: str, cont
     First, determine if the passages address the same specific argument. Then, use the score guide below.
 
     IMPORTANT: Direct agreement and direct disagreement on the exact same point are both forms of HIGH similarity.
-    IMPORTANT: Avoid defaulting to the boundary scores of a category (like 0.40, 0.70, or 0.90). Use the full range to show nuance.
 
     Score Guide:
-    • 0.0 - 0.4: Different Subjects. The passages are about completely different topics.
-    • > 0.4 to < 0.7: Shared Subject, Different Focus. The passages are about the same broad subject (e.g., the Roman Empire) but focus on different specific arguments or aspects (e.g., one is about military tactics, the other about trade policy).
-    • 0.7 - 0.9: Shared Subject, Shared Focus. The passages address the exact same specific argument, question, or thesis. They are in direct conversation, whether they agree, disagree, or analyze it in parallel.
-    • > 0.9 - 1.0: Paraphrase. The passages make the exact same point and have nearly identical meaning.
+    • 0.00 - 0.40: Different Subjects. The passages are about completely different topics.
+    • 0.41 - 0.69: Shared Subject, Different Focus. Same broad subject (e.g., the Roman Empire) but different specific arguments (e.g., military tactics vs. trade policy).
+    • 0.70 - 0.79: Same Argument, Loose Connection. The passages address the same question or thesis but from meaningfully different angles, evidence bases, or time periods. The intellectual link is real but indirect.
+    • 0.80 - 0.89: Same Argument, Clear Engagement. The passages directly address the same specific point with overlapping evidence, reasoning, or rhetorical framing. A reader would immediately see they are in conversation.
+    • 0.90 - 0.95: Same Argument, Near-Paraphrase Framing. Very close in both content and rhetorical approach — similar structure, similar evidence, similar conclusions — but not word-for-word identical.
+    • 0.96 - 1.00: True Paraphrase. The passages make the exact same point with nearly identical meaning. Only surface wording differs.
 
     Your thought process:
     1. What is the broad subject of each passage?
@@ -536,7 +546,9 @@ def create_similarity_evaluation_prompt(source_text: str, target_text: str, cont
        - If they share the specific argument, do they Agree or Disagree?
        - If they only share the broad subject, mark as Neutral.
        - Otherwise, mark as Unrelated.
-    4. Based on that, which score category do they fall into?
+    4. Within the matching score category, calibrate precisely:
+       - Low end of range: weaker fit for that category's description.
+       - High end of range: strong fit, almost belongs in the next category up.
 
     Provide your answer in this exact format:
     Reasoning: [Your step-by-step analysis - keep concise, 2-3 sentences]

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,2 +1,2 @@` |
| 1 | 1 | `[console_scripts]` |
| 2 |  | `-textpair = textpair.__main__:main` |
|  | 2 | `+textpair = textpair.__main__:run` |