@@ -766,13 +766,14 @@ def _get_comments(self, xml=None):
def _find_licenses(self):
    """
    Yield one license mapping per ``licenses/license`` element of the POM.

    Each yielded item is a single-key mapping of this shape::

        {"license": {"name": ..., "url": ..., "comments": ..., "distribution": ...}}

    where each value is whatever ``self._get_attribute`` returns for the
    corresponding child of the license element.
    """
    for lic in self.pom_data.findall('licenses/license'):
        # Wrap the attributes under a "license" key so that the data keeps the
        # same nesting as the original <licenses><license> XML structure.
        yield {
            "license": {
                'name': self._get_attribute('name', lic),
                'url': self._get_attribute('url', lic),
                'comments': self._get_attribute('comments', lic),
                # arcane and seldom used
                'distribution': self._get_attribute('distribution', lic),
            }
        }
776777
777778 def _find_parties (self , key = 'developers/developer' ):
778779 """Return an iterable of party mappings for a given xpath."""
@@ -1254,7 +1255,7 @@ def _parse(
12541255 # complex definition in Maven
12551256 qualifiers ['type' ] = extension
12561257
1257- extracted_license_statement = pom .licenses
1258+ extracted_license_statement = clean_licenses ( pom .licenses ) or None
12581259
12591260 group_id = pom .group_id
12601261 artifact_id = pom .artifact_id
@@ -1325,52 +1326,121 @@ def get_license_detections_for_extracted_license_statement(
13251326 approximate = True ,
13261327 expression_symbols = None ,
13271328 ):
1329+ """
1330+ Return license detections from a Maven POM license data structure.
1331+ This looks like this in XML, and some attributes are more important than others.
1332+ Which one exists and whether we can detect a proper license in each also determines which
1333+ attribute we need to consider.
1334+ The original XML has this shape:
1335+ <licenses>
1336+ <license>
1337+ <name>Apache-2.0</name>
1338+ <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
1339+ <distribution>repo</distribution>
1340+ <comments> notes... </comments>
1341+ </license>
1342+ </licenses>
1343+ The data structure we keep has this shape:
1344+ [{"license":
1345+ {
1346+ "name": "Apache-2.0",
1347+ "url": "https://www.apache.org/licenses/LICENSE-2.0.txt",
1348+ "comments": "notes...",
1349+ }
1350+ },
1351+ .... other license]
1352+ """
1353+
13281354 from packagedcode .licensing import get_normalized_license_detections
13291355 from packagedcode .licensing import get_license_detections_for_extracted_license_statement
13301356
1331- if not cls .check_extracted_license_statement_structure (extracted_license ):
1357+ if not is_standard_maven_license_data_structure (licenses = extracted_license ):
1358+ # use the generic detection
13321359 return get_normalized_license_detections (
13331360 extracted_license = extracted_license ,
13341361 try_as_expression = try_as_expression ,
13351362 approximate = approximate ,
13361363 expression_symbols = expression_symbols ,
13371364 )
1365+ extracted_license = clean_licenses (extracted_license )
1366+ extracted_license_statement = saneyaml .dump (extracted_license )
13381367
1339- new_extracted_license = extracted_license .copy ()
1340-
1341- for license_entry in new_extracted_license :
1342- license_entry .pop ("distribution" )
1343- if not license_entry .get ("name" ):
1344- license_entry .pop ("name" )
1345- if not license_entry .get ("url" ):
1346- license_entry .pop ("url" )
1347- if not license_entry .get ("comments" ):
1348- license_entry .pop ("comments" )
1349-
1350- extracted_license_statement = saneyaml .dump (new_extracted_license )
1351-
1352- return get_license_detections_for_extracted_license_statement (
1368+ detections = get_license_detections_for_extracted_license_statement (
13531369 extracted_license_statement = extracted_license_statement ,
13541370 try_as_expression = try_as_expression ,
13551371 approximate = approximate ,
13561372 expression_symbols = expression_symbols ,
13571373 )
1374+ # TODO: if we have any unknown license, we need to try harder
1375+ # We can detect each license item individually and check if the unknown was detected
1376+ # in the name, URL or comment field.
1377+ # name, URL, comments
1378+ # name unknown: keep that unknown in all cases
1379+ # URL or comments with unknown, but name not unknown: we want to combine the unknown
1380+ # matches with the correct name match
13581381
1359- @classmethod
1360- def check_extracted_license_statement_structure (cls , extracted_license ):
1382+ return detections
13611383
1362- is_list_of_mappings = False
1363- if not isinstance (extracted_license , list ):
1364- return is_list_of_mappings
1365- else :
1366- is_list_of_mappings = True
13671384
1368- for extracted_license_item in extracted_license :
1369- if not isinstance (extracted_license_item , dict ):
1370- is_list_of_mappings = False
1371- break
def clean_licenses(licenses):
    """
    Return the ``licenses`` list of POM license data after removing unwanted
    data from its items.

    The cleaning is done in place and the same ``licenses`` object is returned.
    For each well-formed item, the "distribution" field is dropped, and the
    "name", "url" and "comments" fields are dropped when they are empty.
    Items are never removed from the list: an item that is not a single-key
    ``{"license": {non-empty mapping}}`` dict is left untouched.

    A well-formed list has this shape:
    [
        {"license": {"name": "Apache-2.0", "url": "https://www... ", "comments": "..."} },
        {"license": {other fields} },
    ]
    """
    for licitem in (licenses or []):
        if not isinstance(licitem, dict) or len(licitem) != 1:
            continue

        license_attributes = licitem.get("license")
        # Legacy or malformed data may carry a plain string (or any other
        # non-mapping value) under "license": there is nothing to clean there,
        # and calling pop() on such a value would fail.
        if not license_attributes or not isinstance(license_attributes, dict):
            continue

        # "distribution" is always discarded, whether it has a value or not.
        license_attributes.pop("distribution", None)
        for field in ("name", "url", "comments"):
            if not license_attributes.get(field):
                license_attributes.pop(field, None)

    return licenses
13721412
1373- return is_list_of_mappings
1413+
def is_standard_maven_license_data_structure(licenses):
    """
    Return True if ``licenses`` has the structure expected from a Maven POM license data.

    The data is a list of dicts of dicts: each top-level dict has a single item
    shaped as {"license": {mapping of attributes}}, and the mapping of attributes
    must contain at least one of the "name", "url" or "comments" keys.
    Return False for anything else, such as a plain string, a flat mapping of
    attributes, or a "license" value that is not a mapping.
    An empty list has no offending item and is therefore reported as standard.

    A standard list has this shape:
    [
        {"license": {"name": "Apache-2.0", "url": "https://www... ", "comments": "..."} },
        {"license": {other fields} },
    ]
    """
    if not isinstance(licenses, list):
        return False

    fields = ("name", "url", "comments")

    for item in licenses:
        if not isinstance(item, dict) or len(item) != 1:
            return False

        # A single key other than "license" yields an empty mapping here, which
        # is then rejected by the field check below.
        license_attributes = item.get("license") or {}
        if not isinstance(license_attributes, dict):
            return False

        # The known fields live in the nested attributes mapping, not in the
        # single-key wrapper dict.
        if not any(field in license_attributes for field in fields):
            return False

    return True
13741444
13751445
13761446def build_vcs_and_code_view_urls (scm ):
0 commit comments