SoftwareUnderstanding
diff --git a/.idea/.gitignore b/.idea/.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/setup.py b/setup.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/metacheck/cli.py b/src/metacheck/cli.py
Lines changed: 14 additions & 10 deletions
diff --git a/src/metacheck/detect_pitfalls_main.py b/src/metacheck/detect_pitfalls_main.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/metacheck/scripts/pitfalls/p005.py b/src/metacheck/scripts/pitfalls/p005.py
Lines changed: 0 additions & 1 deletion
diff --git a/src/metacheck/scripts/pitfalls/p006.py b/src/metacheck/scripts/pitfalls/p006.py
Lines changed: 1 addition & 3 deletions
diff --git a/src/metacheck/scripts/pitfalls/p007.py b/src/metacheck/scripts/pitfalls/p007.py
Lines changed: 33 additions & 50 deletions
diff --git a/src/metacheck/scripts/pitfalls/p009.py b/src/metacheck/scripts/pitfalls/p009.py
Lines changed: 15 additions & 10 deletions
diff --git a/src/metacheck/scripts/pitfalls/p010.py b/src/metacheck/scripts/pitfalls/p010.py
Lines changed: 12 additions & 13 deletions
@@ -2,7 +2,7 @@
 
 setup(
     name="metacheck",
-    version="0.1.0",
+    version="0.1.1",
     package_dir={"": "src"},
     packages=find_namespace_packages(where="src"),
     install_requires=[
 
@@ -1,7 +1,7 @@
 import argparse
 import os
 from pathlib import Path
-from metacheck.run_somef import run_somef_batch
+from metacheck.run_somef import run_somef_batch, run_somef_single
 from metacheck.run_analyzer import run_analysis
 
 
@@ -11,7 +11,7 @@ def cli():
         "--input",
         nargs="+",
         required=True,
-        help="One or more JSON files containing repositories (e.g., GitHub, GitLab) OR existing SoMEF output files when using --skip-somef."
+        help="One or more: GitHub/GitLab URLs, JSON files containing repositories, OR existing SoMEF output files when using --skip-somef."
     )
     parser.add_argument(
         "--skip-somef",
@@ -58,17 +58,21 @@ def cli():
         threshold = args.threshold
         somef_output_dir = os.path.join(os.getcwd(), "somef_outputs")
 
-        print(f"Detected {len(args.input)} input files:")
-        for json_path in args.input:
-            if not os.path.exists(json_path):
-                print(f"Skipping missing file: {json_path}")
-                continue
-            print(f"Processing repositories from {json_path}")
-            run_somef_batch(json_path, somef_output_dir, threshold)
+        print(f"Detected {len(args.input)} input(s):")
+
+        for input_item in args.input:
+            if input_item.startswith("http://") or input_item.startswith("https://"):
+                print(f"Processing repository URL: {input_item}")
+                run_somef_single(input_item, somef_output_dir, threshold)
+            elif os.path.exists(input_item):
+                print(f"Processing repositories from file: {input_item}")
+                run_somef_batch(input_item, somef_output_dir, threshold)
+            else:
+                print(f"Warning: Skipping invalid input (not a URL or existing file): {input_item}")
 
+        print(f"\nRunning analysis on outputs in {somef_output_dir}...")
         run_analysis(somef_output_dir, args.pitfalls_output, args.analysis_output)
 
 
 if __name__ == "__main__":
-    print("!!!THIS IS THE CORRECT TEST VERSION (MSR26) FIXING ONLY W003!!!")
     cli()
@@ -259,7 +259,7 @@ def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[s
                 "count": 0,
                 "percentage": 0.0,
                 "languages": {}
-            },
+            }
         ]
     }
 
 
@@ -1,7 +1,6 @@
 from typing import Dict
 import re
 
-
 def is_software_archive_url(url: str) -> bool:
     """
     Check if URL points to a software archive instead of a research paper.
 
@@ -1,6 +1,4 @@
-
 from typing import Dict
-import re
 from metacheck.utils.pitfall_utils import extract_metadata_source_filename
 
 def is_local_file_license(license_value: str) -> bool:
@@ -26,7 +24,7 @@ def is_local_file_license(license_value: str) -> bool:
         'copying', 'copying.md', 'copying.txt',
         'copyright', 'copyright.md', 'copyright.txt',
         'licence', 'licence.md', 'licence.txt',  # British spelling
-        'readme.md', 'doc.txt', 'file.rst'  # Other common file patterns
+        'readme.md', 'doc.txt', 'file.rst'
     ]
 
     if license_lower in license_file_names:
 
@@ -13,55 +13,38 @@ def detect_citation_missing_reference_publication_pitfall(somef_data: Dict, file
         "citation_cff_exists": False
     }
 
-    if "citation" not in somef_data:
-        return result
-
-    citation_entries = somef_data["citation"]
-    if not isinstance(citation_entries, list):
-        return result
-
-    codemeta_citation_value = None
-    citation_cff_citation_value = None
-    citation_cff_exists_in_somef = False
-
-    for entry in citation_entries:
-        source = entry.get("source", "")
-        technique = entry.get("technique", "")
-
-        if technique == "code_parser" and "codemeta.json" in source:
-            if "result" in entry and "value" in entry["result"]:
-                codemeta_citation_value = entry["result"]["value"]
-                result["codemeta_has_reference"] = True
-        elif "CITATION.cff" in source:
-            citation_cff_exists_in_somef = True
-            result["citation_cff_exists"] = True
-            if "result" in entry and "value" in entry["result"]:
-                citation_cff_citation_value = entry["result"]["value"]
-
-    if not citation_cff_exists_in_somef:
-        citation_cff_sources = ["authors", "title", "description", "version", "license"]
-        for category in citation_cff_sources:
-            if category in somef_data:
-                entries = somef_data[category]
-                if isinstance(entries, list):
-                    for entry in entries:
-                        source = entry.get("source", "")
-                        if "CITATION.cff" in source:
-                            citation_cff_exists_in_somef = True
-                            result["citation_cff_exists"] = True
-                            break
-
-    if (codemeta_citation_value and
-            citation_cff_exists_in_somef and
-            (not citation_cff_citation_value or citation_cff_citation_value != codemeta_citation_value)):
-
-        if citation_cff_citation_value:
-            if ("doi.org" in codemeta_citation_value or "http" in codemeta_citation_value):
-                if not ("doi.org" in citation_cff_citation_value or "http" in citation_cff_citation_value):
-                    result["has_pitfall"] = True
-                elif codemeta_citation_value not in citation_cff_citation_value and citation_cff_citation_value not in codemeta_citation_value:
-                    result["has_pitfall"] = True
-        else:
-            result["has_pitfall"] = True
+    if "reference_publication" in somef_data:
+        ref_pub_entries = somef_data["reference_publication"]
+        if isinstance(ref_pub_entries, list):
+            for entry in ref_pub_entries:
+                source = entry.get("source", "")
+                technique = entry.get("technique", "")
+
+                if technique == "code_parser" and "codemeta.json" in source:
+                    if "result" in entry and "value" in entry["result"]:
+                        result["codemeta_has_reference"] = True
+
+                elif "CITATION.cff" in source:
+                    if "result" in entry and "value" in entry["result"]:
+                        result["citation_cff_has_reference"] = True
+
+    citation_cff_sources = ["authors", "title", "description", "version", "license"]
+    for category in citation_cff_sources:
+        if category in somef_data:
+            entries = somef_data[category]
+            if isinstance(entries, list):
+                for entry in entries:
+                    source = entry.get("source", "")
+                    if "CITATION.cff" in source:
+                        result["citation_cff_exists"] = True
+                        break
+
+        if result["citation_cff_exists"]:
+            break
+
+    if (result["codemeta_has_reference"] and
+            result["citation_cff_exists"] and
+            not result["citation_cff_has_reference"]):
+        result["has_pitfall"] = True
 
     return result
@@ -1,3 +1,4 @@
+
 from typing import Dict
 from metacheck.utils.pitfall_utils import extract_metadata_source_filename
 
@@ -11,11 +12,16 @@ def is_repository_url(url: str) -> bool:
 
     url_lower = url.lower()
 
-    # Valid repository indicators
+    if 'github.io' in url_lower:
+        return False
+
     repo_indicators = [
         'github.com/',
+        'github.org/',
         'gitlab.com/',
+        'gitlab.org/',
         'bitbucket.org/',
+        'bitbucket.net/',
         'sourceforge.net/projects/',
         'git.',
         '.git'
@@ -37,7 +43,9 @@ def is_homepage_url_repo(url: str) -> bool:
 
     url_lower = url.lower()
 
-    # Homepage indicators
+    if is_repository_url(url):
+        return False
+
     homepage_indicators = [
         '.org/',
         '.com/',
@@ -50,11 +58,6 @@ def is_homepage_url_repo(url: str) -> bool:
         'github.io'
     ]
 
-    # If it's clearly a repository URL, it's not a homepage
-    if is_repository_url(url):
-        return False
-
-    # Check for homepage indicators
     for indicator in homepage_indicators:
         if indicator in url_lower:
             return True
@@ -82,15 +85,17 @@ def detect_coderepository_homepage_pitfall(somef_data: Dict, file_name: str) ->
     if not isinstance(repo_entries, list):
         return result
 
-    metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json", "pom.xml", "pyproject.toml", "requirements.txt", "setup.py"]
+    metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json",
+                        "pom.xml", "pyproject.toml", "requirements.txt", "setup.py"]
 
     for entry in repo_entries:
         technique = entry.get("technique", "")
         source = entry.get("source", "")
 
         is_metadata_source = (
-                technique in metadata_sources or
-                any(src in source.lower() for src in metadata_sources)
+            technique == "code_parser" or
+            technique in metadata_sources or
+            any(src in source.lower() for src in metadata_sources)
         )
 
         if is_metadata_source:
 
@@ -1,4 +1,3 @@
-
 import re
 from typing import Dict, Optional
 
@@ -18,7 +17,6 @@ def extract_license_from_file(somef_data: Dict) -> Optional[Dict[str, str]]:
     for entry in license_entries:
         if "source" in entry:
             source = entry["source"]
-            # Look for LICENSE files (LICENSE, LICENSE.md, etc.)
             if "LICENSE" in source.upper() and "result" in entry and "value" in entry["result"]:
                 return {
                     "source": source,
@@ -41,16 +39,14 @@ def check_copyright_only_license(license_content: str) -> bool:
     content_lower = license_content.lower().strip()
     content_lines = [line.strip() for line in license_content.strip().split('\n') if line.strip()]
 
-    # Patterns that indicate copyright-only content
     copyright_only_patterns = [
-        r'year\s*:\s*\d{4}',  # YEAR: 2017 (removed ^ and $ to match anywhere in text)
+        r'year\s*:\s*\d{4}',  # YEAR: 2017
         r'copyright\s+holder\s*:\s*[a-zA-Z]',  # COPYRIGHT HOLDER: Someone
         r'author\s*:\s*[a-zA-Z]',  # AUTHOR: Someone
         r'copyright\s*©?\s*\d{4}',  # Copyright 2017 or Copyright © 2017
         r'\(c\)\s*\d{4}',  # (C) 2017
     ]
 
-    # Patterns that indicate actual license terms
     license_term_patterns = [
         r'permission\s+is\s+hereby\s+granted',
         r'subject\s+to\s+the\s+following\s+conditions',
@@ -70,33 +66,36 @@ def check_copyright_only_license(license_content: str) -> bool:
     has_copyright_info = any(re.search(pattern, content_lower) for pattern in copyright_only_patterns)
     has_license_terms = any(re.search(pattern, content_lower) for pattern in license_term_patterns)
 
-    # If it has copyright info but no license terms and is short, it's likely copyright-only
+    if has_license_terms:
+        return False
+
+    # Short content that has copyright info but no license terms is likely copyright-only
     if has_copyright_info and not has_license_terms and len(content_lines) <= 10:
         return True
 
-    # Special case: check for the exact format "YEAR: xxxx" and "COPYRIGHT HOLDER: xxxx"
+    # Check for the exact format "YEAR: xxxx" and "COPYRIGHT HOLDER: xxxx"
     year_pattern_found = bool(re.search(r'year\s*:\s*\d{4}', content_lower))
     copyright_holder_pattern_found = bool(re.search(r'copyright\s+holder\s*:', content_lower))
 
     if year_pattern_found and copyright_holder_pattern_found:
+        if has_license_terms:
+            return False
         return True
 
-    # Additional check: if the content is very short and only contains basic copyright info
-    if len(content_lines) <= 5:  # Increased from 3 to 5 for more flexibility
-        # Check if all lines are just copyright/year information
+    if len(content_lines) <= 5:
         meaningful_lines = []
+
         for line in content_lines:
             line_lower = line.lower()
-            # Skip lines that are just copyright patterns
+
             if not any(re.search(pattern, line_lower) for pattern in copyright_only_patterns):
-                # This line doesn't match copyright patterns, check if it's meaningful
+
                 if (len(line.strip()) > 0 and
                     not line.strip().startswith('#') and
                     not line.strip().startswith('//') and
                     line.strip() not in ['', '-', '=', '*']):
                     meaningful_lines.append(line)
 
-        # If we have very few meaningful lines and some copyright info, it's probably copyright-only
         if len(meaningful_lines) <= 1 and has_copyright_info:
             return True
Original file line number	Diff line number	Diff line change
`@@ -259,7 +259,7 @@ def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[s`
`259`	`259`	`"count": 0,`
`260`	`260`	`"percentage": 0.0,`
`261`	`261`	`"languages": {}`
`262`		`- },`
	`262`	`+ }`
`263`	`263`	`]`
`264`	`264`	`}`
`265`	`265`