Merge pull request #1001 from juanjemdIos/fix-529

dgarijo · web-flow · commit 9d0a024582ef · 2026-06-24T16:39:07.000+02:00
solve problem with long headers and improve header analysis confidence. Fixes #529, Fixes #138
diff --git a/README.md b/README.md
@@ -95,6 +95,19 @@ We recognize the following properties:
 
 We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/)
 
+### Confidence values in header analysis
+
+When extracting metadata using header analysis, SOMEF assigns a confidence value based on the length 
+of the header. Shorter headers are more likely to be a good fit for a category, while longer headers 
+may contain additional context that makes the classification less reliable:
+
+| Header length | Confidence |
+|---------------|------------|
+| 1–3 words     | 1.0        |
+| 4–6 words     | 0.8        |
+| 7–10 words    | 0.5        |
+| 11+ words     | 0.1        |
+
 ## Documentation
 
 See full documentation at [https://somef.readthedocs.io/en/latest/](https://somef.readthedocs.io/en/latest/)
diff --git a/docs/output.md b/docs/output.md
@@ -133,6 +133,19 @@ The following table summarized the properties used to describe a `category`:
 | **source** | No | Url | URL of the source file used for the extraction. |
 | **technique** | Yes | String | Technique used for the extraction. One of the following list: Supervised classification, header analysis, regular expression, GitHub API, File exploration, Code parsing |
 
+### Confidence values in header analysis
+
+When extracting metadata using header analysis, SOMEF assigns a confidence value based on the length 
+of the header. Shorter headers are more likely to be a good fit for a category, while longer headers 
+may contain additional context that makes the classification less reliable:
+
+| Header length | Confidence |
+|---------------|------------|
+| 1–3 words     | 1.0        |
+| 4–6 words     | 0.8        |
+| 7–10 words    | 0.5        |
+| 11+ words     | 0.1        |
+
 ### Result
 Field returning the extracted output from the code repository. An example can be seen below for a citation found in BibteX format in a README file of a code repository:
 
@@ -446,6 +459,7 @@ The table below summarizes the mapping between the SOMEF internal JSON structure
 | `logo`                | `logo`                  | Project logo URL |
 | `maintainer`          | `maintainer`            | Project maintainers |
 | `name`                | `name`                  | Software name |
+| `schema:owner`        | `owner`                  | Software owner |
 | `programmingLanguage` | `programming_languages` | Languages used |
 | `readme`              | `readme_url`            | README file URL |
 | `referencePublication`| `citation` (Papers)     || References to the main publication associated with this software component (as per author preference) *1*|
diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py
@@ -329,11 +329,20 @@ def is_false_positive_header(text: str, category: str) -> bool:
 
     text_lower = text.lower()
 
+    if '?' in text or '!' in text:
+        return True
+    
     # false positives for bibliographic citations
     if category == constants.CAT_CITATION:
         for pattern in constants.NEGATIVE_PATTERNS_CITATION_HEADERS:
             if pattern in text_lower:
                 return True
+
+    if category in constants.MAX_HEADER_WORDS:
+        num_words = len(text.split())
+        if num_words > constants.MAX_HEADER_WORDS[category]:
+            return True
+        
     return False
 
 
@@ -431,6 +440,13 @@ def extract_categories(repo_data: str, repository_metadata: Result, similarity_t
         df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup']
         df = df.drop(columns=['ParentGroup'])
 
+        # Installation keywords that wordnet cannot handle correctly
+        mask = df['Group'].str.len() == 0
+        df.loc[mask, 'Group'] = df.loc[mask, 'Header'].map(
+            lambda h: [constants.CAT_INSTALLATION]
+            if any(kw in h.lower() for kw in constants.INSTALLATION_HEADER_KEYWORDS)
+            else []
+        )
         # detection for os/platform headers that wordnet cannot handle correctly
         mask = df['Group'].str.len() == 0
         df.loc[mask, 'Group'] = df.loc[mask, 'Header'].map(
@@ -494,6 +510,7 @@ def extract_categories(repo_data: str, repository_metadata: Result, similarity_t
             if row[constants.PROP_PARENT_HEADER]:
                 result[constants.PROP_PARENT_HEADER] = row[constants.PROP_PARENT_HEADER]
 
+            confidence = calculate_header_confidence(row[constants.PROP_ORIGINAL_HEADER])
             if row['Group'] == constants.CAT_LICENSE:
                 license_text = row[constants.PROP_VALUE]
                 license_info = detect_license_spdx(license_text, 'HEADER')
@@ -507,7 +524,7 @@ def extract_categories(repo_data: str, repository_metadata: Result, similarity_t
             repository_metadata.add_result(
                 row['Group'],
                 result,
-                1,
+                confidence,
                 constants.TECHNIQUE_HEADER_ANALYSIS,
                 source,
             )
@@ -613,6 +630,15 @@ def build_wordnet_groups() -> Dict[str, List]:
     return g
 
 
+def calculate_header_confidence(header: str) -> float:
+    """Returns a confidence value based on the header length."""
+    num_words = len(header.split())
+    for max_words, confidence in constants.HEADER_CONFIDENCE_THRESHOLDS:
+        if num_words <= max_words:
+            return confidence
+    return 0.1
+  
+  
 def extract_os_from_content(text: str) -> List[dict]:
     """
     Scans a text block for mentions of operating systems, platforms or runtime
@@ -655,4 +681,4 @@ def extract_os_from_content(text: str) -> List[dict]:
                     "value": name,
                 })
 
-    return results
+    return results
diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py
@@ -531,12 +531,15 @@ def test_issue_417(self):
         json_content = json.loads(data)
         issue_tracker = json_content["issueTracker"]  # JSON is in Codemeta format
      
-        #len(json_content["citation"]) 
-        #codemeta category citation is now referencePublication
+
+        # buildInstructions was previously generated from the header "Browser issues (Why can't I see
+        # the generated documentation / visualization?)", which was incorrectly classified as documentation
+        # because it contains the word "documentation". That false positive was fixed in issue #529
+        # (headers containing '?' or '!' and, for some categories, overly long headers are now discarded),
+        # so the former check `len(json_content["buildInstructions"]) > 0` is no longer asserted below.
         assert issue_tracker == 'https://github.com/dgarijo/Widoco/issues' and len(json_content["referencePublication"]) > 0 and \
             len(json_content["name"]) > 0 and len(json_content["identifier"]) > 0 and \
             len(json_content["description"]) > 0 and len(json_content["readme"]) > 0 and \
-            len(json_content["buildInstructions"]) > 0 and \
             len(json_content["softwareRequirements"]) > 0 and len(json_content["programmingLanguage"]) > 0 and \
             len(json_content["keywords"]) > 0 and len(json_content["logo"]) > 0 and \
             len(json_content["license"]) > 0 and len(json_content["dateCreated"]) > 0
diff --git a/src/somef/test/test_header_analysis.py b/src/somef/test/test_header_analysis.py
@@ -151,6 +151,54 @@ def test_extract_headers_with_separators(self):
             assert 'Funding' in headers
 
 
+    def test_issue_529(self):
+        """
+        Test that ensures long headers or headers with punctuation are not incorrectly
+        classified. 'Browser issues (Why can't I see...)' should not appear in documentation.
+        """
+        with open(test_data_path + "widoco_readme.md", "r") as data_file:
+            file_text = data_file.read()
+            json_test, results = extract_categories(file_text, Result())
+            if constants.CAT_DOCUMENTATION in json_test.results:
+                headers = [e[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "")
+                        for e in json_test.results[constants.CAT_DOCUMENTATION]]
+                assert not any("Browser issues" in h for h in headers)
+
+
+    def test_issue_529_installation(self):
+        """
+        Test that ensures installation headers that wordnet cannot handle are detected through
+        keywords. 'Importing WIDOCO as a dependency' should be classified as installation.
+        """
+        with open(test_data_path + "widoco_readme.md", "r") as data_file:
+            file_text = data_file.read()
+            json_test, results = extract_categories(file_text, Result())
+
+            assert constants.CAT_INSTALLATION in json_test.results, "No installation category found"
+            # print(json_test.results[constants.CAT_INSTALLATION])
+            if constants.CAT_INSTALLATION in json_test.results:
+                headers = [e[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "")
+                        for e in json_test.results[constants.CAT_INSTALLATION]]
+
+                assert any("Importing WIDOCO as a dependency" in h for h in headers)
+
+    def test_issue_138(self):
+        """
+        Test that ensures header analysis returns a confidence lower than 1
+        for long headers (4+ words).
+        """
+        with open(test_data_path + "widoco_readme.md", "r") as data_file:
+            file_text = data_file.read()
+            json_test, results = extract_categories(file_text, Result())
+            for category, entries in json_test.results.items():
+                if category == constants.PROP_PROVENANCE:
+                    continue
+                for entry in entries:
+                    if entry[constants.PROP_TECHNIQUE] == constants.TECHNIQUE_HEADER_ANALYSIS:
+                        header = entry[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "")
+                        if header and len(header.split()) > 3:
+                            print(f"Header: '{header}' | words: {len(header.split())} | confidence: {entry[constants.PROP_CONFIDENCE]}")
+                            assert entry[constants.PROP_CONFIDENCE] < 1.0
     def test_issue_112_similarity_threshold(self):
         """
         Checks that the similarity_threshold parameter is respected in header analysis.
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
@@ -547,6 +547,22 @@ class RepositoryType(Enum):
 DEPENDENCY_TYPE_RUNTIME = "runtime"
 DEPENDENCY_TYPE_DEVELOPMENT = "development"
 
+# Maximum number of words allowed in a header for each category (open question: should all categories share one limit?).
+# This is used in the header analysis technique: headers of a listed category with more words than its limit
+# are treated as false positives and discarded.
+MAX_HEADER_WORDS = {
+    CAT_DOCUMENTATION: 5,
+    CAT_REQUIREMENTS: 3,
+    CAT_CITATION: 5,
+}
+
+# Confidence thresholds for header analysis as (max_words, confidence) pairs, checked in order; longer headers fall back to 0.1
+HEADER_CONFIDENCE_THRESHOLDS = [
+    (3, 1.0),   # 1-3 words -> confidence 1.0
+    (6, 0.8),   # 4-6 words -> confidence 0.8
+    (10, 0.5),  # 7-10 words -> confidence 0.5
+    (11, 0.1),  # 11+ words -> confidence 0.1
+]
 # in case not exist in config file. But config file has higher priority than this default value.
 CONF_SIMILARITY_THRESHOLD = "similarity_threshold"
 CONF_DEFAULT_SIMILARITY_THRESHOLD = 0.8
@@ -561,6 +577,20 @@ class RepositoryType(Enum):
     "supported platforms", "tested on", "runs on", "environment",
 ]
 
+INSTALLATION_HEADER_KEYWORDS = [
+    "importing",
+    "downloading",
+    "download",
+    "as a dependency",
+    "as dependency",
+    "via pip",
+    "via conda",
+    "via npm",
+    "via maven",
+    "getting started",
+    "quick start",
+    "quickstart",
+]
 # Regular expressions for OS/platform detection in header analysis
 REGEXP_OS_WINDOWS = r'(?i)\bwindows\s*(\d[\d.]*\d|\d+)?'
 REGEXP_OS_MACOS = r'(?i)(?:\bmacos|\bmac\s*os|\bos\s*x|\bosx)\s*([\d.]+)?'