Skip to content

Commit 01eb813

Browse files
Merge pull request #25 from Anas-Elhounsri/dev
Tests added, minor patches for pitfalls and warnings and updated setup.py to match the tag
2 parents 5da09e7 + a9043c8 commit 01eb813

47 files changed

Lines changed: 12983 additions & 408 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.idea/.gitignore

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name="metacheck",
5-
version="0.1.0",
5+
version="0.1.1",
66
package_dir={"": "src"},
77
packages=find_namespace_packages(where="src"),
88
install_requires=[

src/metacheck/cli.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import argparse
22
import os
33
from pathlib import Path
4-
from metacheck.run_somef import run_somef_batch
4+
from metacheck.run_somef import run_somef_batch, run_somef_single
55
from metacheck.run_analyzer import run_analysis
66

77

@@ -11,7 +11,7 @@ def cli():
1111
"--input",
1212
nargs="+",
1313
required=True,
14-
help="One or more JSON files containing repositories (e.g., GitHub, GitLab) OR existing SoMEF output files when using --skip-somef."
14+
help="One or more: GitHub/GitLab URLs, JSON files containing repositories, OR existing SoMEF output files when using --skip-somef."
1515
)
1616
parser.add_argument(
1717
"--skip-somef",
@@ -58,17 +58,21 @@ def cli():
5858
threshold = args.threshold
5959
somef_output_dir = os.path.join(os.getcwd(), "somef_outputs")
6060

61-
print(f"Detected {len(args.input)} input files:")
62-
for json_path in args.input:
63-
if not os.path.exists(json_path):
64-
print(f"Skipping missing file: {json_path}")
65-
continue
66-
print(f"Processing repositories from {json_path}")
67-
run_somef_batch(json_path, somef_output_dir, threshold)
61+
print(f"Detected {len(args.input)} input(s):")
62+
63+
for input_item in args.input:
64+
if input_item.startswith("http://") or input_item.startswith("https://"):
65+
print(f"Processing repository URL: {input_item}")
66+
run_somef_single(input_item, somef_output_dir, threshold)
67+
elif os.path.exists(input_item):
68+
print(f"Processing repositories from file: {input_item}")
69+
run_somef_batch(input_item, somef_output_dir, threshold)
70+
else:
71+
print(f"Warning: Skipping invalid input (not a URL or existing file): {input_item}")
6872

73+
print(f"\nRunning analysis on outputs in {somef_output_dir}...")
6974
run_analysis(somef_output_dir, args.pitfalls_output, args.analysis_output)
7075

7176

7277
if __name__ == "__main__":
73-
print("!!!THIS IS THE CORRECT TEST VERSION (MSR26) FIXING ONLY W003!!!")
7478
cli()

src/metacheck/detect_pitfalls_main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ def detect_all_pitfalls(json_files: Iterable[Path], pitfalls_output_dir: Union[s
259259
"count": 0,
260260
"percentage": 0.0,
261261
"languages": {}
262-
},
262+
}
263263
]
264264
}
265265

src/metacheck/scripts/pitfalls/p005.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from typing import Dict
22
import re
33

4-
54
def is_software_archive_url(url: str) -> bool:
65
"""
76
Check if URL points to a software archive instead of a research paper.

src/metacheck/scripts/pitfalls/p006.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
21
from typing import Dict
3-
import re
42
from metacheck.utils.pitfall_utils import extract_metadata_source_filename
53

64
def is_local_file_license(license_value: str) -> bool:
@@ -26,7 +24,7 @@ def is_local_file_license(license_value: str) -> bool:
2624
'copying', 'copying.md', 'copying.txt',
2725
'copyright', 'copyright.md', 'copyright.txt',
2826
'licence', 'licence.md', 'licence.txt', # British spelling
29-
'readme.md', 'doc.txt', 'file.rst' # Other common file patterns
27+
'readme.md', 'doc.txt', 'file.rst'
3028
]
3129

3230
if license_lower in license_file_names:

src/metacheck/scripts/pitfalls/p007.py

Lines changed: 33 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -13,55 +13,38 @@ def detect_citation_missing_reference_publication_pitfall(somef_data: Dict, file
1313
"citation_cff_exists": False
1414
}
1515

16-
if "citation" not in somef_data:
17-
return result
18-
19-
citation_entries = somef_data["citation"]
20-
if not isinstance(citation_entries, list):
21-
return result
22-
23-
codemeta_citation_value = None
24-
citation_cff_citation_value = None
25-
citation_cff_exists_in_somef = False
26-
27-
for entry in citation_entries:
28-
source = entry.get("source", "")
29-
technique = entry.get("technique", "")
30-
31-
if technique == "code_parser" and "codemeta.json" in source:
32-
if "result" in entry and "value" in entry["result"]:
33-
codemeta_citation_value = entry["result"]["value"]
34-
result["codemeta_has_reference"] = True
35-
elif "CITATION.cff" in source:
36-
citation_cff_exists_in_somef = True
37-
result["citation_cff_exists"] = True
38-
if "result" in entry and "value" in entry["result"]:
39-
citation_cff_citation_value = entry["result"]["value"]
40-
41-
if not citation_cff_exists_in_somef:
42-
citation_cff_sources = ["authors", "title", "description", "version", "license"]
43-
for category in citation_cff_sources:
44-
if category in somef_data:
45-
entries = somef_data[category]
46-
if isinstance(entries, list):
47-
for entry in entries:
48-
source = entry.get("source", "")
49-
if "CITATION.cff" in source:
50-
citation_cff_exists_in_somef = True
51-
result["citation_cff_exists"] = True
52-
break
53-
54-
if (codemeta_citation_value and
55-
citation_cff_exists_in_somef and
56-
(not citation_cff_citation_value or citation_cff_citation_value != codemeta_citation_value)):
57-
58-
if citation_cff_citation_value:
59-
if ("doi.org" in codemeta_citation_value or "http" in codemeta_citation_value):
60-
if not ("doi.org" in citation_cff_citation_value or "http" in citation_cff_citation_value):
61-
result["has_pitfall"] = True
62-
elif codemeta_citation_value not in citation_cff_citation_value and citation_cff_citation_value not in codemeta_citation_value:
63-
result["has_pitfall"] = True
64-
else:
65-
result["has_pitfall"] = True
16+
if "reference_publication" in somef_data:
17+
ref_pub_entries = somef_data["reference_publication"]
18+
if isinstance(ref_pub_entries, list):
19+
for entry in ref_pub_entries:
20+
source = entry.get("source", "")
21+
technique = entry.get("technique", "")
22+
23+
if technique == "code_parser" and "codemeta.json" in source:
24+
if "result" in entry and "value" in entry["result"]:
25+
result["codemeta_has_reference"] = True
26+
27+
elif "CITATION.cff" in source:
28+
if "result" in entry and "value" in entry["result"]:
29+
result["citation_cff_has_reference"] = True
30+
31+
citation_cff_sources = ["authors", "title", "description", "version", "license"]
32+
for category in citation_cff_sources:
33+
if category in somef_data:
34+
entries = somef_data[category]
35+
if isinstance(entries, list):
36+
for entry in entries:
37+
source = entry.get("source", "")
38+
if "CITATION.cff" in source:
39+
result["citation_cff_exists"] = True
40+
break
41+
42+
if result["citation_cff_exists"]:
43+
break
44+
45+
if (result["codemeta_has_reference"] and
46+
result["citation_cff_exists"] and
47+
not result["citation_cff_has_reference"]):
48+
result["has_pitfall"] = True
6649

6750
return result

src/metacheck/scripts/pitfalls/p009.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
from typing import Dict
23
from metacheck.utils.pitfall_utils import extract_metadata_source_filename
34

@@ -11,11 +12,16 @@ def is_repository_url(url: str) -> bool:
1112

1213
url_lower = url.lower()
1314

14-
# Valid repository indicators
15+
if 'github.io' in url_lower:
16+
return False
17+
1518
repo_indicators = [
1619
'github.com/',
20+
'github.org/',
1721
'gitlab.com/',
22+
'gitlab.org/',
1823
'bitbucket.org/',
24+
'bitbucket.net/',
1925
'sourceforge.net/projects/',
2026
'git.',
2127
'.git'
@@ -37,7 +43,9 @@ def is_homepage_url_repo(url: str) -> bool:
3743

3844
url_lower = url.lower()
3945

40-
# Homepage indicators
46+
if is_repository_url(url):
47+
return False
48+
4149
homepage_indicators = [
4250
'.org/',
4351
'.com/',
@@ -50,11 +58,6 @@ def is_homepage_url_repo(url: str) -> bool:
5058
'github.io'
5159
]
5260

53-
# If it's clearly a repository URL, it's not a homepage
54-
if is_repository_url(url):
55-
return False
56-
57-
# Check for homepage indicators
5861
for indicator in homepage_indicators:
5962
if indicator in url_lower:
6063
return True
@@ -82,15 +85,17 @@ def detect_coderepository_homepage_pitfall(somef_data: Dict, file_name: str) ->
8285
if not isinstance(repo_entries, list):
8386
return result
8487

85-
metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json", "pom.xml", "pyproject.toml", "requirements.txt", "setup.py"]
88+
metadata_sources = ["codemeta.json", "DESCRIPTION", "composer.json", "package.json",
89+
"pom.xml", "pyproject.toml", "requirements.txt", "setup.py"]
8690

8791
for entry in repo_entries:
8892
technique = entry.get("technique", "")
8993
source = entry.get("source", "")
9094

9195
is_metadata_source = (
92-
technique in metadata_sources or
93-
any(src in source.lower() for src in metadata_sources)
96+
technique == "code_parser" or
97+
technique in metadata_sources or
98+
any(src in source.lower() for src in metadata_sources)
9499
)
95100

96101
if is_metadata_source:

src/metacheck/scripts/pitfalls/p010.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
import re
32
from typing import Dict, Optional
43

@@ -18,7 +17,6 @@ def extract_license_from_file(somef_data: Dict) -> Optional[Dict[str, str]]:
1817
for entry in license_entries:
1918
if "source" in entry:
2019
source = entry["source"]
21-
# Look for LICENSE files (LICENSE, LICENSE.md, etc.)
2220
if "LICENSE" in source.upper() and "result" in entry and "value" in entry["result"]:
2321
return {
2422
"source": source,
@@ -41,16 +39,14 @@ def check_copyright_only_license(license_content: str) -> bool:
4139
content_lower = license_content.lower().strip()
4240
content_lines = [line.strip() for line in license_content.strip().split('\n') if line.strip()]
4341

44-
# Patterns that indicate copyright-only content
4542
copyright_only_patterns = [
46-
r'year\s*:\s*\d{4}', # YEAR: 2017 (removed ^ and $ to match anywhere in text)
43+
r'year\s*:\s*\d{4}', # YEAR: 2017
4744
r'copyright\s+holder\s*:\s*[a-zA-Z]', # COPYRIGHT HOLDER: Someone
4845
r'author\s*:\s*[a-zA-Z]', # AUTHOR: Someone
4946
r'copyright\s*©?\s*\d{4}', # Copyright 2017 or Copyright © 2017
5047
r'\(c\)\s*\d{4}', # (C) 2017
5148
]
5249

53-
# Patterns that indicate actual license terms
5450
license_term_patterns = [
5551
r'permission\s+is\s+hereby\s+granted',
5652
r'subject\s+to\s+the\s+following\s+conditions',
@@ -70,33 +66,36 @@ def check_copyright_only_license(license_content: str) -> bool:
7066
has_copyright_info = any(re.search(pattern, content_lower) for pattern in copyright_only_patterns)
7167
has_license_terms = any(re.search(pattern, content_lower) for pattern in license_term_patterns)
7268

73-
# If it has copyright info but no license terms and is short, it's likely copyright-only
69+
if has_license_terms:
70+
return False
71+
72+
# If it has copyright info but no license terms and is short, it's likely copyright-only
7473
if has_copyright_info and not has_license_terms and len(content_lines) <= 10:
7574
return True
7675

77-
# Special case: check for the exact format "YEAR: xxxx" and "COPYRIGHT HOLDER: xxxx"
76+
# Check for the exact format "YEAR: xxxx" and "COPYRIGHT HOLDER: xxxx"
7877
year_pattern_found = bool(re.search(r'year\s*:\s*\d{4}', content_lower))
7978
copyright_holder_pattern_found = bool(re.search(r'copyright\s+holder\s*:', content_lower))
8079

8180
if year_pattern_found and copyright_holder_pattern_found:
81+
if has_license_terms:
82+
return False
8283
return True
8384

84-
# Additional check: if the content is very short and only contains basic copyright info
85-
if len(content_lines) <= 5: # Increased from 3 to 5 for more flexibility
86-
# Check if all lines are just copyright/year information
85+
if len(content_lines) <= 5:
8786
meaningful_lines = []
87+
8888
for line in content_lines:
8989
line_lower = line.lower()
90-
# Skip lines that are just copyright patterns
90+
9191
if not any(re.search(pattern, line_lower) for pattern in copyright_only_patterns):
92-
# This line doesn't match copyright patterns, check if it's meaningful
92+
9393
if (len(line.strip()) > 0 and
9494
not line.strip().startswith('#') and
9595
not line.strip().startswith('//') and
9696
line.strip() not in ['', '-', '=', '*']):
9797
meaningful_lines.append(line)
9898

99-
# If we have very few meaningful lines and some copyright info, it's probably copyright-only
10099
if len(meaningful_lines) <= 1 and has_copyright_info:
101100
return True
102101

0 commit comments

Comments
 (0)