Integrate OpenAI support to extract the Package URL (PURL), as well as the affected and fixed versions.

ziadhany · ziadhany · commit 466d884878c6 · 2025-04-29T18:01:03.000+03:00
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
diff --git a/vulnerabilities/improvers/ai_summary_version.py b/vulnerabilities/improvers/ai_summary_version.py
@@ -1,83 +1,116 @@
-import json
-import re
-from pathlib import Path
 from typing import Iterable
+from typing import List
 
-import chromadb
 from django.db.models import QuerySet
-from langchain_chroma import Chroma
-from langchain_ollama import OllamaLLM
-
+from pydantic import BaseModel
+from pydantic_ai import Agent
+from pydantic_ai.models.openai import OpenAIModel
+from pydantic_ai.providers.openai import OpenAIProvider
 from univers.version_range import RANGE_CLASS_BY_SCHEMES
 
-from vulnerabilities.importer import AffectedPackage, AdvisoryData
-from vulnerabilities.improver import Inference, MAX_CONFIDENCE, Improver
+from vulnerabilities.importer import AdvisoryData
+from vulnerabilities.importer import AffectedPackage
+from vulnerabilities.improver import MAX_CONFIDENCE
+from vulnerabilities.improver import Improver
+from vulnerabilities.improver import Inference
 from vulnerabilities.improvers.default import get_exact_purls
 from vulnerabilities.models import Advisory
 from vulnerablecode.settings import env
-from langchain.prompts import PromptTemplate
 from packageurl import PackageURL
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_community.document_loaders import UnstructuredMarkdownLoader
-from tqdm import tqdm
+from pydantic.functional_validators import field_validator
+
+class Purl(BaseModel):  # structured agent output carrying a single PURL as text
+    string: str  # raw PURL text; must parse with PackageURL.from_string
+
+    @field_validator('string')
+    def check_valid_purl(cls, v: str) -> str:  # accept only what PackageURL can parse
+        try:
+            PackageURL.from_string(v)
+        except Exception as e:
+            raise ValueError(f"Invalid PURL '{v}': {e}") from e  # keep parser error as __cause__
+        return v
+
+class Versions(BaseModel):  # structured agent output: two lists of version-constraint strings
+    affected_versions: List[str]
+    fixed_versions: List[str]
+
+
+prompt_purl_extraction = """
+You are a highly specialized Vulnerability Analysis Assistant. Your task is to analyze the provided vulnerability summary or package name and extract a single valid Package URL (PURL) that conforms to the official PURL specification:
+
+**Component Definitions (Required by PURL Specification):**
+- **scheme**: Constant value `pkg`
+- **type**: Package type or protocol (e.g., maven, npm, nuget, gem, pypi, rpm, etc.) — must be a known valid type
+- **namespace**: A name prefix such as a Maven groupId, Docker image owner, or GitHub user/org (optional and type-specific)
+- **name**: Package name (required)
+- **version**: Version of the package (optional)
+- **qualifiers**: Extra data like OS, arch, etc. (optional and type-specific)
+- **subpath**: Subpath within the package (optional)
+
+**Examples of Valid PURLs:**
+- pkg:maven/org.apache.apr/apr-util@1.3.5
+- pkg:github/apache/apr-util@1.3.5
+- pkg:rpm/redhat/apr-util@1.3.5
+- pkg:deb/debian/apr-util@1.3.5
+
+**Output Instructions:**
+- Identify the most appropriate and valid PURL type for the package if possible.
+- If a valid and complete PURL can be constructed, return only:
+  `{ "string": "pkg:type/namespace/name@version?qualifiers#subpath" }`
+- If no valid PURL can be constructed or the type is unknown, return:
+  `{}`
+- Do not include any other output (no explanation, formatting, or markdown).
+"""  # plain string (no f-prefix needed): the braces above are literal JSON
+
+prompt_version_extraction = """
+        You are a highly specialized Vulnerability Analysis Assistant. Your task is to analyze the following vulnerability summary and accurately extract the affected and fixed versions of the software.
+        
+        Instructions:
+        - Affected Version: Use one of the following formats:
+          - >= <version>, <= <version>, > <version>, < <version>
+          - A specific range like <version1> - <version2>
+        - Fixed Version: Use one of the following formats:
+          - >= <version>, <= <version>, > <version>, < <version>
+          - "Not Fixed" if no fixed version is mentioned.
+        - Ensure accuracy by considering different ways affected and fixed versions might be described in the summary.
+        - Extract only version-related details without adding any extra information.
+        
+        Output Format:
+        ```json
+        {
+            "affected_versions": ["<version_condition>", "<version_condition>"],
+            "fixed_versions": ["<version_condition>", "<version_condition>"]
+        }
+        ```
+        Example:
+        {
+            "affected_versions": [">=1.2.3", "<2.0.0"],
+            "fixed_versions": ["2.0.0"]
+        }
+        
+        Return only the JSON object without any additional text.
+        """  # plain string (no f-prefix needed): the braces above are literal JSON
 
 class AISummaryImprover(Improver):
     """
     A pipeline for improving vulnerability version extraction using AI.
     This pipeline analyzes vulnerability summaries and extracts affected and fixed versions.
     """
 
-    llm = OllamaLLM(
-        model=env.str("OLLAMA_MODEL_NAME"),
-        base_url=env.str("OLLAMA_BASE_URL")
-    )
-
-    # Initialize embeddings
-    embeddings = HuggingFaceEmbeddings(
-        model_name="sentence-transformers/all-MiniLM-L6-v2",
-        model_kwargs={"device": "cpu"},
-        encode_kwargs={"normalize_embeddings": True},
-    )
-
-    # Initialize ChromaDB Client (do this once)
-    chroma_client = chromadb.PersistentClient(path="purl_index")
-
-    # Create the vector store using LangChain's Chroma integration
-    vector_db = Chroma(
-        client=chroma_client,
-        collection_name="purl_embeddings",
-        embedding_function=embeddings,
-    )
-
-    # Check if collection exists and contains documents
-    existing_docs = vector_db.get()
-    if existing_docs and existing_docs.get("documents"):
-        print(f"✅ ChromaDB collection loaded successfully! {len(existing_docs['documents'])} documents found.")
-    else:
-        print(f"⚠️ Collection not found or empty. Initializing ChromaDB.")
-
-        # Load documents
-        markdown_path = "/agent/purl_db/PURL.rst"
-        loader = UnstructuredMarkdownLoader(markdown_path)
-        docs = loader.load()  # This returns a list of Documents
-
-        if not docs:
-            print("❌ No documents loaded. Please check the file path and format.")
-        else:
-            print(f"✅ Loaded {len(docs)} documents.")
-            collection = chroma_client.get_or_create_collection(name="purl_embeddings")
-
-            # Index each document by its file name
-            for i, doc in enumerate(tqdm(docs, desc="Indexing documents")):
-                file = doc.metadata.get("source", "unknown")
-                file_name = Path(file).stem
-                collection.add(
-                    ids=[file_name],
-                    documents=[doc.page_content],
-                    metadatas=[{"file_name": file_name}],
-                )
+    # Class-body statements: the model and both agents are created once, when the class is defined.
+    openai_model = OpenAIModel('gpt-4o-mini', provider=OpenAIProvider(api_key=env.str("OPENAI_API_KEY")))
+    # ollama_model = OpenAIModel(
+    #     model_name=env.str("OLLAMA_MODEL_NAME"), provider=OpenAIProvider(openai_client=env.str("OLLAMA_BASE_URL"))
+    # )
+
+    purl_agent = Agent(
+        openai_model, system_prompt=prompt_purl_extraction, output_type=Purl
+    )
+
+    versions_agent = Agent(
+        openai_model, system_prompt=prompt_version_extraction, output_type=Versions
+    )
 
-            print("✅ Documents indexed in ChromaDB.")
 
     @property
     def interesting_advisories(self) -> QuerySet:
@@ -86,27 +119,24 @@ def interesting_advisories(self) -> QuerySet:
         )
 
     def get_inferences(self, advisory_data: AdvisoryData) -> Iterable[Inference]:
-        """
-        """
         if not advisory_data:
             return []
 
         if advisory_data.summary:
             purl = self.handler_purl(advisory_data.summary)
 
-            if not purl:
-                return
-
-            affected_version_range, fixed_version = self.handler_version_ranges(summary=advisory_data.summary,
-                                                                                supported_ecosystem=purl.type)
+            affected_version_range, fixed_version = self.handler_version_ranges(
+                summary=advisory_data.summary,
+                supported_ecosystem=purl.type
+            )
 
             affected_package = AffectedPackage(
-                package=purl,
+                package=PackageURL(type=purl.type, namespace=purl.namespace, name=purl.name),
                 affected_version_range=affected_version_range,
                 fixed_version=fixed_version,
             )
-            affected_purls, fixed_purls = get_exact_purls(affected_package)
 
+            affected_purls, fixed_purls = get_exact_purls(affected_package)
             for fixed_purl in fixed_purls:
                 yield Inference(
                     aliases=advisory_data.aliases,
@@ -120,50 +150,14 @@ def get_inferences(self, advisory_data: AdvisoryData) -> Iterable[Inference]:
 
 
     def handler_version_ranges(self, summary, supported_ecosystem):
-        """
-        """
-        version_extraction_prompt = PromptTemplate(
-            input_variables=["summary"],
-            template="""
-            You are a highly specialized Vulnerability Analysis Assistant. Your task is to analyze the following vulnerability summary and accurately extract the affected and fixed versions of the software.
-
-            **Vulnerability Summary:**
-            {summary}
-
-            Output Format:
-            ```json
-            {{
-                "affected_versions": ["<version_condition>", "<version_condition>"],
-                "fixed_versions": ["<version_condition>", "<version_condition>"]
-            }}
-            ```
-            
-            Instructions:
-            - Affected Version: Use one of the following formats:
-              - >= <version>, <= <version>, > <version>, < <version>
-              - A specific range like <version1> - <version2>
-            - Fixed Version: Use one of the following formats:
-              - >= <version>, <= <version>, > <version>, < <version>
-              - "Not Fixed" if no fixed version is mentioned.
-            - Ensure accuracy by considering different ways affected and fixed versions might be described in the summary.
-            - Extract only version-related details without adding any extra information.
-
-            Return only the JSON object without any additional text.
-            """,
-        )
-
-        version_extraction_prompt = version_extraction_prompt.format(summary=summary)
-        json_text = self.get_llm_result(prompt=version_extraction_prompt)
-
-        try:
-            match = re.search(r'```json\n(.*?)\n```', json_text, re.DOTALL).group(1)
-            json_data = json.loads(match)
-        except json.JSONDecodeError as e:
-            print("Invalid JSON:", e)
-            json_data = {}
+        """Extract affected and fixed version ranges from a vulnerability summary."""
+        result = self.versions_agent.run_sync(user_prompt=f"""
+        **Vulnerability Summary:**
+        {summary}
+        """)
 
-        affected_version_ranges = json_data.get("affected_versions", [])
-        fixed_version_ranges = json_data.get("fixed_versions", [])
+        affected_version_ranges = result.output.affected_versions
+        fixed_version_ranges = result.output.fixed_versions
 
         affected_version_objs = [RANGE_CLASS_BY_SCHEMES[supported_ecosystem].from_string(f"vers:{supported_ecosystem}/" + affected_version_range) for affected_version_range in affected_version_ranges]
         fixed_version_objs = [RANGE_CLASS_BY_SCHEMES[supported_ecosystem].from_string(f"vers:{supported_ecosystem}/" + fixed_version_version_range) for fixed_version_version_range in fixed_version_ranges]
@@ -172,46 +166,11 @@ def handler_version_ranges(self, summary, supported_ecosystem):
 
     def handler_purl(self, summary):
         """
+        Analyze the vulnerability summary and extract a valid Package URL (PURL).
+        Returns the agent's PURL string parsed into a PackageURL; there is no None fallback here.
         """
-        purl_extraction_prompt = PromptTemplate(
-            input_variables=["summary"],
-            template="""
-        You are a highly specialized Vulnerability Analysis Assistant. Your task is to analyze the provided vulnerability summary, and extract a single valid Package URL (PURL) that strictly conforms to the following specification:
-           
-        **Vulnerability Summary:**  
+        result = self.purl_agent.run_sync(user_prompt=f"""
+        **Vulnerability Summary:**
         {summary}
-        
-        **Component Definitions:**
-        - **scheme:** Must be the constant value `pkg` (required).
-        - **type:** The package type or protocol (e.g., maven, npm, nuget, gem, pypi, etc.) (required).
-        - **namespace:** A name prefix such as a Maven groupId, Docker image owner, or GitHub user/organization (optional and type-specific).
-        - **name:** The package name (required).
-        - **version:** The version of the package (optional).
-        - **qualifiers:** Extra qualifying data such as an OS, architecture, distro, etc. (optional and type-specific).
-        - **subpath:** A subpath within the package, relative to the package root (optional).
-        
-        **Important Requirements:**
-        - The components must form a hierarchy from the most significant (left) to the least significant (right).
-        - The PURL must NOT contain a URL authority (i.e., no username, password, host, or port).
-        - If a namespace segment resembles a host, its interpretation is specific to the package type.
-        
-        **Output Instructions:**
-        - If a valid PURL is extracted, return **only** the PURL (and nothing else).
-        - If no valid PURL is found, return nothing.
-        Provide the answer strictly based on the above context.
-            """,
-        )
-        # single_doc_content = self.vector_db.get()["documents"][0]
-        purl_extraction_prompt = purl_extraction_prompt.format(summary=summary)
-                                                               #context=single_doc_content)
-        llm_response = self.get_llm_result(prompt=purl_extraction_prompt)
-        purl_response = re.search(r'pkg:[a-zA-Z0-9._-]+(?:/[a-zA-Z0-9._-]+)+', llm_response).group(0)
-        return PackageURL.from_string(purl_response)
-
-    def get_llm_result(self, prompt):
-        """
-        """
-        response = self.llm.invoke(prompt)
-        cleaned_result = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()
-        print(cleaned_result)
-        return cleaned_result
+        """)
+        return PackageURL.from_string(result.output.string)