Skip to content

Commit 1229323

Browse files
committed
packagedcode: fix publiccode license extraction
Signed-off-by: kumarasantosh <santosh.pulikond02@gmail.com>
1 parent 98a1fc0 commit 1229323

5 files changed

Lines changed: 427 additions & 108 deletions

File tree

src/packagedcode/publiccode.py

Lines changed: 96 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
import logging
11-
import os
10+
import io
1211

1312
import saneyaml
1413

@@ -20,96 +19,52 @@
2019
See https://github.com/publiccodeyml/publiccode.yml
2120
"""
2221

23-
TRACE = os.environ.get('SCANCODE_DEBUG_PACKAGE', False)
24-
25-
logger = logging.getLogger(__name__)
22+
EXTRA_DATA_KEYS = (
23+
'publiccodeYmlVersion',
24+
'platforms',
25+
'developmentStatus',
26+
'softwareType',
27+
)
2628

2729

2830
class PubliccodeYmlHandler(models.DatafileHandler):
2931
datasource_id = 'publiccode_yml'
30-
path_patterns = ('*/publiccode.yml', '*/publiccode.yaml')
32+
path_patterns = ('*publiccode.yml', '*publiccode.yaml')
3133
default_package_type = 'publiccode'
3234
default_primary_language = None
3335
description = 'publiccode.yml metadata file'
3436
documentation_url = 'https://github.com/publiccodeyml/publiccode.yml'
3537

3638
@classmethod
3739
def parse(cls, location, package_only=False):
38-
with open(location, 'rb') as f:
39-
data = saneyaml.load(f.read())
40-
41-
if not data or not isinstance(data, dict):
42-
return
40+
with io.open(location, encoding='utf-8') as loc:
41+
data = saneyaml.load(loc.read())
4342

44-
# Validate: a publiccode.yml must have 'publiccodeYmlVersion'
45-
if 'publiccodeYmlVersion' not in data:
43+
if not is_publiccode_yml_data(data):
4644
return
4745

48-
name = data.get('name')
49-
version = data.get('softwareVersion')
50-
vcs_url = data.get('url')
51-
homepage_url = data.get('landingURL') or vcs_url
52-
53-
# License is under legal.license (SPDX expression)
54-
legal = data.get('legal') or {}
55-
declared_license = legal.get('license')
56-
copyright_statement = legal.get('mainCopyrightOwner') or legal.get('repoOwner')
57-
58-
# Description: prefer English, fall back to first available language
59-
description = _get_description(data)
60-
61-
# Keywords from categories
62-
categories = data.get('categories') or []
63-
keywords = ', '.join(categories) if categories else None
64-
65-
# Parties from maintenance.contacts
66-
parties = []
67-
maintenance = data.get('maintenance') or {}
68-
for contact in maintenance.get('contacts') or []:
69-
contact_name = contact.get('name')
70-
contact_email = contact.get('email')
71-
if contact_name or contact_email:
72-
parties.append(
73-
models.Party(
74-
type=models.party_person,
75-
name=contact_name,
76-
email=contact_email,
77-
role='maintainer',
78-
)
79-
)
80-
81-
# Extra data
82-
extra_data = {}
83-
schema_version = data.get('publiccodeYmlVersion')
84-
if schema_version:
85-
extra_data['publiccodeYmlVersion'] = schema_version
86-
platforms = data.get('platforms')
87-
if platforms:
88-
extra_data['platforms'] = platforms
89-
development_status = data.get('developmentStatus')
90-
if development_status:
91-
extra_data['developmentStatus'] = development_status
92-
software_type = data.get('softwareType')
93-
if software_type:
94-
extra_data['softwareType'] = software_type
95-
96-
yield models.PackageData(
46+
package_data = dict(
9747
datasource_id=cls.datasource_id,
9848
type=cls.default_package_type,
99-
name=name,
100-
version=version,
101-
vcs_url=vcs_url,
102-
homepage_url=homepage_url,
103-
description=description,
104-
declared_license_expression=declared_license,
105-
copyright=copyright_statement,
106-
keywords=keywords,
107-
parties=parties,
108-
extra_data=extra_data or None,
49+
name=data.get('name'),
50+
version=data.get('softwareVersion'),
51+
vcs_url=data.get('url'),
52+
homepage_url=data.get('landingURL') or data.get('url'),
53+
description=get_description(data),
54+
extracted_license_statement=get_extracted_license_statement(data),
55+
copyright=get_copyright_statement(data),
56+
keywords=get_categories(data),
57+
parties=get_parties(data),
58+
extra_data=get_extra_data(data) or None,
10959
)
60+
yield models.PackageData.from_data(package_data, package_only)
61+
11062

63+
def is_publiccode_yml_data(data):
    """
    Return True if ``data`` is a mapping that has a top-level
    'publiccodeYmlVersion' key, and False otherwise.
    """
    if not isinstance(data, dict):
        return False
    return 'publiccodeYmlVersion' in data
11165

112-
def _get_description(data):
66+
67+
def get_description(data):
11368
"""
11469
Extract the best available description from publiccode.yml's
11570
multilingual 'description' block. Prefer English, fall back to
@@ -119,15 +74,78 @@ def _get_description(data):
11974
if not description_block:
12075
return
12176

122-
lang_data = (
123-
description_block.get('en')
124-
or description_block.get('eng')
125-
or next(iter(description_block.values()), None)
126-
)
77+
lang_data = None
78+
for language, localized_description in description_block.items():
79+
primary_language = language.lower().split('-')[0]
80+
if primary_language == 'en':
81+
lang_data = localized_description
82+
break
83+
84+
if not lang_data:
85+
lang_data = next(iter(description_block.values()), None)
86+
12787
if not lang_data:
12888
return
12989

13090
long_desc = lang_data.get('longDescription', '').strip()
13191
short_desc = lang_data.get('shortDescription', '').strip()
13292

13393
return long_desc or short_desc or None
94+
95+
96+
def get_extracted_license_statement(data):
    """
    Return the value of the 'license' key found under the top-level
    'legal' mapping of ``data``, or None when it is not available.

    A 'legal' entry that is missing, empty or not a mapping (such as a
    bare string in a malformed file) yields None instead of raising.
    """
    legal = data.get('legal')
    if not isinstance(legal, dict):
        return None
    return legal.get('license')
99+
100+
101+
def get_copyright_statement(data):
    """
    Return a copyright statement built from the 'mainCopyrightOwner' and
    'repoOwner' values of the top-level 'legal' mapping of ``data``, or
    None when neither is usable.

    Distinct owners are joined with a newline, in that order; an owner
    listed under both keys is reported once.
    """
    legal = data.get('legal')
    if not isinstance(legal, dict):
        # A missing or malformed 'legal' entry carries no owner.
        return None

    copyright_holders = []
    for key in ('mainCopyrightOwner', 'repoOwner'):
        value = legal.get(key)
        # Only text can be joined below: skip empty and non-string values.
        if not isinstance(value, str) or not value:
            continue
        if value not in copyright_holders:
            copyright_holders.append(value)

    return '\n'.join(copyright_holders) or None
111+
112+
113+
def get_categories(data):
    """
    Return a list of category strings from the top-level 'categories'
    entry of ``data``, or an empty list when there is none.

    A single string is wrapped in a list. Any other non-sequence value
    is ignored, and empty or non-string entries are dropped. A new list
    is always returned so that the caller never shares state with
    ``data``.
    """
    categories = data.get('categories')

    if isinstance(categories, str):
        return [categories] if categories else []

    if not isinstance(categories, (list, tuple)):
        # Missing, null or malformed (for instance a mapping): no keywords.
        return []

    return [
        category for category in categories
        if isinstance(category, str) and category
    ]
118+
119+
120+
def get_parties(data):
    """
    Return a list of ``models.Party`` built from the entries of the
    'contacts' list found under the top-level 'maintenance' mapping of
    ``data``. Each party is a person with the 'maintainer' role.

    Contacts that have neither a 'name' nor an 'email' are skipped.
    Malformed content (a 'maintenance' that is not a mapping, 'contacts'
    that is not a list, or a contact that is not a mapping) is ignored
    rather than raising.
    """
    parties = []

    maintenance = data.get('maintenance')
    if not isinstance(maintenance, dict):
        return parties

    contacts = maintenance.get('contacts')
    if not isinstance(contacts, (list, tuple)):
        return parties

    for contact in contacts:
        if not isinstance(contact, dict):
            continue

        contact_name = contact.get('name')
        contact_email = contact.get('email')

        if not (contact_name or contact_email):
            continue

        parties.append(
            models.Party(
                type=models.party_person,
                name=contact_name,
                email=contact_email,
                role='maintainer',
            )
        )

    return parties
141+
142+
143+
def get_extra_data(data):
    """
    Return a mapping of the top-level ``data`` entries whose key is
    listed in EXTRA_DATA_KEYS and whose value is not empty.
    """
    return {
        key: data[key]
        for key in EXTRA_DATA_KEYS
        if data.get(key)
    }

tests/packagedcode/data/publiccode/publiccode.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# Hand-crafted publiccode.yml test fixture based on examples from:
2+
# https://github.com/publiccodeyml/publiccode.yml/blob/main/docs/standard/schema.core.rst
13
publiccodeYmlVersion: "0.4"
24

35
name: Medusa
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
[
2+
{
3+
"type": "publiccode",
4+
"namespace": null,
5+
"name": "Medusa",
6+
"version": "1.0.3",
7+
"qualifiers": {},
8+
"subpath": null,
9+
"primary_language": null,
10+
"description": "A very long description of this software. It explains what it does, who it is for, and why you might want to use it in a public administration context.",
11+
"release_date": null,
12+
"parties": [
13+
{
14+
"type": "person",
15+
"role": "maintainer",
16+
"name": "Francesco Rossi",
17+
"email": "f.rossi@example.com",
18+
"url": null
19+
}
20+
],
21+
"keywords": [
22+
"financial-reporting",
23+
"accounting"
24+
],
25+
"homepage_url": "https://example.com/medusa",
26+
"download_url": null,
27+
"size": null,
28+
"sha1": null,
29+
"md5": null,
30+
"sha256": null,
31+
"sha512": null,
32+
"bug_tracking_url": null,
33+
"code_view_url": null,
34+
"vcs_url": "https://example.com/italia/medusa.git",
35+
"copyright": "City of Example",
36+
"holder": "City of Example",
37+
"declared_license_expression": "agpl-3.0-plus",
38+
"declared_license_expression_spdx": "AGPL-3.0-or-later",
39+
"license_detections": [
40+
{
41+
"license_expression": "agpl-3.0-plus",
42+
"license_expression_spdx": "AGPL-3.0-or-later",
43+
"matches": [
44+
{
45+
"license_expression": "agpl-3.0-plus",
46+
"license_expression_spdx": "AGPL-3.0-or-later",
47+
"from_file": null,
48+
"start_line": 1,
49+
"end_line": 1,
50+
"matcher": "1-hash",
51+
"score": 100.0,
52+
"matched_length": 5,
53+
"match_coverage": 100.0,
54+
"rule_relevance": 100,
55+
"rule_identifier": "spdx_license_id_agpl-3.0-or-later_for_agpl-3.0-plus.RULE",
56+
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_agpl-3.0-or-later_for_agpl-3.0-plus.RULE",
57+
"matched_text": "AGPL-3.0-or-later"
58+
}
59+
],
60+
"identifier": "agpl_3_0_plus-a0f62d44-7e99-852b-0b1c-0bc5e1c9f6d0"
61+
}
62+
],
63+
"other_license_expression": null,
64+
"other_license_expression_spdx": null,
65+
"other_license_detections": [],
66+
"extracted_license_statement": "AGPL-3.0-or-later",
67+
"notice_text": null,
68+
"source_packages": [],
69+
"file_references": [],
70+
"is_private": false,
71+
"is_virtual": false,
72+
"extra_data": {
73+
"publiccodeYmlVersion": "0.4",
74+
"platforms": [
75+
"web",
76+
"linux"
77+
],
78+
"developmentStatus": "stable",
79+
"softwareType": "standalone/desktop"
80+
},
81+
"dependencies": [],
82+
"repository_homepage_url": null,
83+
"repository_download_url": null,
84+
"api_data_url": null,
85+
"datasource_id": "publiccode_yml",
86+
"purl": "pkg:publiccode/Medusa@1.0.3"
87+
}
88+
]

0 commit comments

Comments
 (0)