add dimension - innovation pairwise rubric #46 from sciknoworg/dev

HamedBabaei · web-flow · commit 054437faed5d · 2026-04-30T14:52:32.000+02:00
diff --git a/README.md b/README.md
@@ -105,13 +105,13 @@ from yescieval.rubric.pointwise.structural import Coherence, Relevancy, Integrat
 from yescieval.rubric.pointwise.stylistic import Cohesion, Readability, Conciseness
 # Depth rubrics
 from yescieval.rubric.pointwise.depth import MechanisticUnderstanding, CausalReasoning, TemporalPrecision
-# Gap rubrics
+# Gap rubric
 from yescieval.rubric.pointwise.gap import GapIdentification
 # Breadth rubrics
 from yescieval.rubric.pointwise.breadth import ContextCoverage, MethodCoverage, DimensionCoverage, ScaleCoverage, ScopeCoverage
 # Rigor rubrics
 from yescieval.rubric.pointwise.rigor import EpistemicCalibration, QuantitativeEvidenceAndUncertainty, ExplicitUncertainty
-# Innovation rubrics
+# Innovation rubric
 from yescieval.rubric.pointwise.innovation import StateOfTheArtAndNovelty
 ```
 
@@ -124,8 +124,10 @@ from yescieval.rubric.pairwise.depth import MechanisticUnderstanding, CausalReas
 from yescieval.rubric.pairwise.breadth import ContextCoverage, MethodCoverage, DimensionCoverage, ScaleCoverage, ScopeCoverage
 # Rigor rubrics
 from yescieval.rubric.pairwise.rigor import EpistemicCalibration, QuantitativeEvidenceAndUncertainty, ExplicitUncertainty
-# Gap rubrics
+# Gap rubric
 from yescieval.rubric.pairwise.gap import GapIdentification
+# Innovation rubric
+from yescieval.rubric.pairwise.innovation import StateOfTheArtAndNovelty
 ```
 
 A complete list of rubrics is available on the YESciEval [📚 Rubrics](https://yescieval.readthedocs.io/rubrics.html) page.
diff --git a/docs/source/rubrics.rst b/docs/source/rubrics.rst
@@ -45,8 +45,10 @@ Each rubric can be used in two ways:
         from yescieval.rubric.pairwise.breadth import ContextCoverage, MethodCoverage, DimensionCoverage, ScaleCoverage, ScopeCoverage
         # Pairwise rigor rubrics
         from yescieval.rubric.pairwise.rigor import EpistemicCalibration, QuantitativeEvidenceAndUncertainty, ExplicitUncertainty
-        # Pairwise Gap rubric
+        # Pairwise gap rubric
         from yescieval.rubric.pairwise.gap import GapIdentification
+        # Pairwise innovation rubric
+        from yescieval.rubric.pairwise.innovation import StateOfTheArtAndNovelty
 
 
 Pairwise Evaluation
@@ -426,7 +428,7 @@ Evaluates the novelty of the synthesis.
      - Is the answer identifying specific state-of-the-art and/or novel contributions
        relevant to the research question, using terms like "novel," "state-of-the-art"?
 
-.. tab:: Basic Usage
+.. tab:: Pointwise Usage
 
   .. code-block:: python
 
@@ -438,6 +440,18 @@ Evaluates the novelty of the synthesis.
      print(instruction)
      print(rubric.name)
 
+.. tab:: Pairwise Usage
+
+  .. code-block:: python
+
+     from yescieval.rubric.pairwise.innovation import StateOfTheArtAndNovelty
+
+     rubric      = StateOfTheArtAndNovelty(papers=papers, question=question, answer_a=answer_a, answer_b=answer_b)
+     instruction = rubric.instruct()
+
+     print(instruction)
+     print(rubric.name)
+  
 .. tab:: Usage with Example and Vocabulary Injectors
 
   .. code-block:: python
diff --git a/yescieval/injector/domains/ecology.py b/yescieval/injector/domains/ecology.py
@@ -511,7 +511,31 @@
                     }
                 ]
             }
-        }    
+        },
+        "Innovation": {
+            "StateOfTheArtAndNovelty": {
+                "ResponseA": [
+                    {
+                        "rating": "1",
+                        "rationale": "The response provides a general overview of ecological research and mentions common practices like field surveys and biodiversity monitoring. It does not identify any specific state-of-the-art methods or novel contributions and uses vague terms such as 'advanced techniques' without explaining their significance."
+                    },
+                    {
+                        "rating": "4",
+                        "rationale": "The response identifies specific state-of-the-art ecological approaches, such as the use of satellite remote sensing, LiDAR for vegetation structure analysis, and eDNA/metabarcoding for biodiversity assessment. It explains how these methods improve spatial resolution, allow non-invasive monitoring, and enhance detection of cryptic species, though comparisons to traditional baselines could be more detailed."
+                    }
+                ],
+                "ResponseB": [
+                    {
+                        "rating": "1",
+                        "rationale": "The response discusses ecological concepts like ecosystem balance and conservation strategies but does not mention any concrete state-of-the-art tools or innovative methods. It relies on generic statements and does not explain what is new or improved in current research."
+                    },
+                    {
+                        "rating": "4",
+                        "rationale": "The response highlights modern ecological innovations, including machine learning models for ecosystem prediction, high-frequency sensor networks for real-time monitoring, and integration of multi-source data (e.g., remote sensing and field data). It explains how these approaches improve predictive accuracy and temporal coverage, although it provides limited detail on specific benchmarks or comparisons."
+                    }
+                ]
+            }
+        }   
     }
 }
 
diff --git a/yescieval/injector/domains/nlp.py b/yescieval/injector/domains/nlp.py
@@ -517,6 +517,30 @@
                     }
                 ]
             }
+        },
+        "Innovation": {      
+            "StateOfTheArtAndNovelty": {
+                "ResponseA": [
+                    {
+                        "rating": "1",
+                        "rationale": "The response provides a general overview of NLP techniques such as tokenization, word embeddings, and neural networks. It does not mention any specific state-of-the-art models or recent innovations and uses vague terms like 'modern approaches' without explaining what makes them new or how they improve performance."
+                    },
+                    {
+                        "rating": "4",
+                        "rationale": "The response identifies concrete state-of-the-art NLP contributions, including transformer-based architectures, instruction tuning, and parameter-efficient fine-tuning methods such as LoRA. It explains how these approaches reduce training cost, improve adaptability to downstream tasks, and enable efficient deployment, although it lacks detailed comparison with earlier baselines."
+                    }
+                ],
+                "ResponseB": [
+                    {
+                        "rating": "1",
+                        "rationale": "The response discusses NLP applications like machine translation and sentiment analysis but does not identify any novel methods or recent advancements. It relies on generic descriptions and mentions 'cutting-edge models' without specifying what they are or what improvements they bring."
+                    },
+                    {
+                        "rating": "4",
+                        "rationale": "The response highlights specific innovations such as retrieval-augmented generation (RAG), reinforcement learning from human feedback (RLHF/DPO), and multimodal models that combine text and images. It explains how these methods improve factual grounding, alignment with user intent, and cross-modal understanding, though the explanation of trade-offs and benchmarks is somewhat limited."
+                    }
+                ]
+            }
         }           
     }
 }
diff --git a/yescieval/rubric/pairwise/innovation/__init__.py b/yescieval/rubric/pairwise/innovation/__init__.py
@@ -0,0 +1,3 @@
+from .novelty import StateOfTheArtAndNovelty
+
+__all__ = ["StateOfTheArtAndNovelty"]
diff --git a/yescieval/rubric/pairwise/innovation/novelty.py b/yescieval/rubric/pairwise/innovation/novelty.py
@@ -0,0 +1,85 @@
+from ....base import PairwiseRubric
+
+state_of_the_art_and_novelty_pairwise_prompt = """<Context>
+Scientific question answering and synthesis often require more than listing findings: high-quality scientific writing can surface what is genuinely innovative in the literature and explain how it differs from prior or established approaches. In synthesis settings (e.g., reports summarizing multiple papers), this is expressed by identifying specific novel contributions (e.g., new methods, new datasets, new capabilities, new theoretical framings, proof-of-concept results) and situating them relative to an implicit or explicit baseline (what was done before, what limitation is addressed, what capability is newly enabled).
+
+This rubric is not about using buzzwords like “breakthrough” or “state-of-the-art” in isolation. High scores require novelty to be concrete and meaningfully contextualized (new relative to what, and why it matters). Not every research question requires emphasizing novelty; some primarily ask for established consensus or background. In such cases, it can be appropriate to focus on established knowledge, though a strong response may still indicate whether the field is mature versus rapidly evolving.
+
+The responses may be short paragraphs or long-form reports. Innovation should be evaluated independently of presentation style or length.
+
+This rubric focuses exclusively on the presence and quality of innovation identification within the provided text of two responses that are compared—i.e., whether the responses highlight specific novel contributions and explains their significance relative to prior work—rather than merely summarizing established knowledge or using generic novelty language. Other aspects of scientific quality (such as factual accuracy, evidential grounding, completeness, or mechanistic depth) are intentionally outside its scope and are assessed by separate evaluation criteria.
+</Context>
+
+<Role>
+You are tasked as a scientific writing quality evaluator performing a pairwise comparison between two texts.
+</Role>
+
+<Task-Description>
+A user will provide:
+1) a research question, and
+2) two written responses (Response A and Response B) intended to address that question.
+
+Your task is to:
+- First, independently evaluate each response using the evaluation characteristic below.
+- Then perform a pairwise comparison of the two responses using the evaluation characteristic below.
+- Then grade each response with a comparative rating from 1 (very bad) to 5 (very good) compared to the other response and a subsequent rationale for each comparative response rating.
+- Note that it is possible for both responses to receive the same rating, if they are equally comparably clear with consistent innovation identification and provide complementary or identical insights.
+
+Your judgment must be based solely on the provided question and comparing the two responses w.r.t. addressing the question and w.r.t. each other.
+</Task-Description>
+
+<Evaluation-Characteristic>
+StateOfTheArtAndNovelty: Does the response identify specific state-of-the-art and/or novel contributions relevant to the research question (e.g., new methods, datasets, capabilities, theoretical framings, proof-of-concept results), and meaningfully contextualize them relative to prior or established work (i.e., new relative to what, and why it matters)? If novelty emphasis is not central to the question, does the response avoid forced novelty and (optionally) state that the evidence base is mature or that innovation is not the focus?
+</Evaluation-Characteristic>
+
+<Domain-Vocabulary-Examples>
+Below are terms and phrases that often co-occur with innovation claims. They are examples only: their presence is not required, and their presence alone is not sufficient for a high score.
+{NOVELTY_INDICATORS_VOCAB}
+</Domain-Vocabulary-Examples>
+
+<Rating-Scale>
+For the characteristic above, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below.
+
+StateOfTheArtAndNovelty
+Rating 1. Very bad: The response provides only established/background knowledge or a generic summary, with no identification of specific state-of-the-art or novel contributions where such identification would be relevant; or it uses novelty buzzwords (“breakthrough”, “SOTA”) without any concrete explanation.
+Rating 2. Bad: The response occasionally signals state-of-the-art or novelty, but claims are vague, generic, or weakly connected to the research question; novelty is not contextualized (no clear “new relative to what”) and/or seems forced.
+Rating 3. Moderate: The response identifies at least one potentially state-of-the-art or novel contribution, but description, relevance, or significance is partially unclear; contextualization relative to prior work is limited or inconsistent; proof-of-concept vs established advances may be conflated.
+Rating 4. Good: The response clearly highlights multiple specific state-of-the-art and/or innovative contributions relevant to the research question and provides reasonable contextualization (what limitation is addressed or what capability is newly enabled), with minor gaps in baseline comparison or scope.
+Rating 5. Very good: The response provides a coherent, well-structured account of state-of-the-art and novelty tightly aligned with the research question: it identifies multiple specific novel contributions, clearly explains how each differs from prior/established approaches (explicit or implicit baseline), and articulates why it matters (capabilities, limitations addressed, or new directions), while appropriately scoping claims (e.g., proof-of-concept vs broadly validated). If novelty emphasis is not central to the question, it avoids forced novelty and explicitly frames the maturity/innovation relevance appropriately.
+
+</Rating-Scale>
+
+<Response-Format>
+Return your evaluation strictly in JSON format:
+
+{
+  "StateOfTheArtAndNovelty": {
+    "ResponseA": {
+      "rating": "",
+      "rationale": ""
+    },
+    "ResponseB": {
+      "rating": "",
+      "rationale": ""
+    }
+  }
+}
+
+where:
+- "rating" is a number from 1 to 5.
+- "rationale" is the comparative evaluation rating justification.
+
+</Response-Format>
+
+<Example-Responses>
+{EXAMPLE_RESPONSES}
+</Example-Responses>
+
+<Note>
+Your evaluation must be based solely on the provided research question and responses. Do not reward novelty buzzwords alone. Reward specific identification of what is new, clear contextualization relative to prior work (“new compared to what”), and appropriate scoping of innovation claims. This rubric does not assess factual correctness, evidential grounding, completeness, or mechanistic depth.
+</Note>
+"""
+
+class StateOfTheArtAndNovelty(PairwiseRubric):
+    name: str = "StateOfTheArtAndNovelty"
+    system_prompt_template: str = state_of_the_art_and_novelty_pairwise_prompt

Original file line number	Diff line number	Diff line change
`@@ -517,6 +517,30 @@`
`517`	`517`	`}`
`518`	`518`	`]`
`519`	`519`	`}`
	`520`	`+ },`
	`521`	`+ "Innovation": {`
	`522`	`+ "StateOfTheArtAndNovelty": {`
	`523`	`+ "ResponseA": [`
	`524`	`+ {`
	`525`	`+ "rating": "1",`
	`526`	`+ "rationale": "The response provides a general overview of NLP techniques such as tokenization, word embeddings, and neural networks. It does not mention any specific state-of-the-art models or recent innovations and uses vague terms like 'modern approaches' without explaining what makes them new or how they improve performance."`
	`527`	`+ },`
	`528`	`+ {`
	`529`	`+ "rating": "4",`
	`530`	`+ "rationale": "The response identifies concrete state-of-the-art NLP contributions, including transformer-based architectures, instruction tuning, and parameter-efficient fine-tuning methods such as LoRA. It explains how these approaches reduce training cost, improve adaptability to downstream tasks, and enable efficient deployment, although it lacks detailed comparison with earlier baselines."`
	`531`	`+ }`
	`532`	`+ ],`
	`533`	`+ "ResponseB": [`
	`534`	`+ {`
	`535`	`+ "rating": "1",`
	`536`	`+ "rationale": "The response discusses NLP applications like machine translation and sentiment analysis but does not identify any novel methods or recent advancements. It relies on generic descriptions and mentions 'cutting-edge models' without specifying what they are or what improvements they bring."`
	`537`	`+ },`
	`538`	`+ {`
	`539`	`+ "rating": "4",`
	`540`	`+ "rationale": "The response highlights specific innovations such as retrieval-augmented generation (RAG), reinforcement learning from human feedback (RLHF/DPO), and multimodal models that combine text and images. It explains how these methods improve factual grounding, alignment with user intent, and cross-modal understanding, though the explanation of trade-offs and benchmarks is somewhat limited."`
	`541`	`+ }`
	`542`	`+ ]`
	`543`	`+ }`
`520`	`544`	`}`
`521`	`545`	`}`
`522`	`546`	`}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .novelty import StateOfTheArtAndNovelty`
	`2`	`+`
	`3`	`+__all__ = ["StateOfTheArtAndNovelty"]`