@@ -61,14 +61,22 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
6161 try :
6262 parsed_build_files = set ()
6363
64- for dir_path , dir_names , filenames in os .walk (repo_dir ):
65-
66- dir_names [:] = [d for d in dir_names if d .lower () not in constants .IGNORED_DIRS ]
67- if is_local_repo :
68- dir_names [:] = [d for d in dir_names if d .lower () != "lib" ]
69-
64+ for dir_path , dir_names , filenames in sorted (os .walk (repo_dir ),key = lambda x : x [0 ].count (os .sep )):
65+ dir_names .sort ()
66+ filenames .sort ()
67+ # dir_names[:] = [d for d in dir_names if d.lower() not in constants.IGNORED_DIRS]
68+ # if is_local_repo:
69+ # dir_names[:] = [d for d in dir_names if d.lower() != "lib"]
70+
71+ # repo_relative_path = os.path.relpath(dir_path, repo_dir)
72+ # current_dir = os.path.basename(repo_relative_path).lower()
7073 repo_relative_path = os .path .relpath (dir_path , repo_dir )
74+ path_parts = repo_relative_path .split (os .sep )
7175 current_dir = os .path .basename (repo_relative_path ).lower ()
76+ is_in_ignored = any (part .lower () in constants .IGNORED_DIRS for part in path_parts )
77+ is_lib = is_local_repo and "lib" in path_parts
78+ if is_in_ignored or is_lib :
79+ continue
7280 # if this is a test folder, we ignore it (except for the root repo)
7381 # if ignore_test_folder and repo_relative_path != "." and "test" in repo_relative_path.lower():
7482 if ignore_test_folder and repo_relative_path != "." and current_dir in constants .IGNORED_DIRS :
@@ -170,12 +178,12 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
170178 logging .error (f"{ type (err ).__name__ } was raised: { err } " )
171179 if ("LICENCE" == filename .upper () or "LICENSE" == filename .upper () or "LICENSE.MD" == filename .upper ()
172180 or "LICENSE.RST" == filename .upper ()):
181+
173182 metadata_result = get_file_content_or_link (repo_type , file_path , owner , repo_name ,
174- repo_default_branch ,
175- repo_dir , repo_relative_path , filename , dir_path ,
176- metadata_result , constants .CAT_LICENSE )
183+ repo_default_branch ,
184+ repo_dir , repo_relative_path , filename , dir_path ,
185+ metadata_result , constants .CAT_LICENSE )
177186
178-
179187 if "CODE_OF_CONDUCT" == filename .upper () or "CODE_OF_CONDUCT.MD" == filename .upper ():
180188 metadata_result = get_file_content_or_link (repo_type , file_path , owner , repo_name ,
181189 repo_default_branch ,
@@ -483,13 +491,26 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
483491 entry [constants .PROP_TECHNIQUE ] is constants .TECHNIQUE_FILE_EXPLORATION ):
484492 new_file_path = extract_directory_path (url )
485493 existing_path = extract_directory_path (entry [constants .PROP_SOURCE ])
486- if new_file_path . startswith ( existing_path ):
487- # the existing file is higher, ignore this one
494+
495+ if new_file_path != existing_path and new_file_path . startswith ( existing_path ):
488496 return metadata_result
489- else :
490- # replace result in hierarchy (below)
491- replace = True
492- break
497+ # if new_file_path.startswith(existing_path):
498+ # # the existing file is higher, ignore this one
499+ # return metadata_result
500+ # if existing_path == new_file_path:
501+ # return metadata_result
502+
503+ # if (category == constants.CAT_CITATION and existing_path == "." and new_file_path != "."):
504+ # return metadata_result
505+
506+ # replace = True
507+ # break
508+ # if repo_relative_path != "." and category == constants.CAT_LICENSE:
509+ # return metadata_result
510+ # else:
511+ # # replace result in hierarchy (below)
512+ # replace = True
513+ # break
493514 except Exception as e :
494515 logging .warning ("Error when trying to determine if redundant files exist " + str (e ))
495516 try :
@@ -500,9 +521,11 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
500521 constants .PROP_VALUE : file_text ,
501522 constants .PROP_TYPE : constants .FILE_DUMP
502523 }
524+
503525 if category is constants .CAT_LICENSE :
504526 license_text = file_text
505527 license_info = detect_license_spdx (license_text , 'JSON' )
528+
506529 if license_info :
507530 result [constants .PROP_NAME ] = license_info ['name' ]
508531 result [constants .PROP_SPDX_ID ] = license_info ['spdx_id' ]
@@ -514,6 +537,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
514537 matches_copyright = re .findall (constants .REGEXP_COPYRIGHT , license_text , flags = re .IGNORECASE )
515538
516539 for year , holder in matches_copyright :
540+
517541 holder = holder .strip () if holder else None
518542 year = year .strip () if year else None
519543
@@ -604,7 +628,7 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
604628 )
605629
606630 pref = yaml_content .get ("preferred-citation" )
607- if pref :
631+ if pref and not should_skip_citation ( metadata_result , url ) :
608632 pref_result = parse_cff_preferred (pref )
609633 pref_result [constants .PROP_VALUE ] = yaml .dump ({"preferred-citation" : pref }, default_flow_style = False )
610634 # pref_result[constants.PROP_TYPE] = constants.FILE_DUMP
@@ -769,6 +793,23 @@ def parse_cff_preferred(pref):
769793
770794 return clean_nulls (result )
771795
def should_skip_citation(metadata_result, new_path):
    """Decide whether a citation found at ``new_path`` should be ignored.

    Only citation results already stored in ``metadata_result`` whose
    technique is file exploration are considered. The new citation is
    skipped when one of those results:

    * resolves (via ``extract_directory_path``) to the same value as
      ``new_path``, i.e. the same file was already recorded, or
    * resolves to ``"."``, i.e. a citation already exists at the repository
      root, which takes precedence over any file in a sub-folder.

    Args:
        metadata_result: Result holder exposing a ``results`` mapping from
            category to a list of result entries.
        new_path: Location of the citation file being evaluated.

    Returns:
        True if the new citation must be skipped, False otherwise.
    """
    # NOTE(review): the stored source is normalised with
    # extract_directory_path() while new_path is compared as received; the
    # visible caller passes a URL here -- confirm both sides are comparable.
    explored_locations = (
        extract_directory_path(item[constants.PROP_SOURCE])
        for item in metadata_result.results.get(constants.CAT_CITATION, [])
        if item.get(constants.PROP_TECHNIQUE) == constants.TECHNIQUE_FILE_EXPLORATION
    )
    # Lazy evaluation keeps the original short-circuit: stop at the first
    # entry that is either the same file or a root-level citation.
    return any(
        location == new_path or location == "."
        for location in explored_locations
    )
812+
def clean_nulls(d: dict) -> dict:
    """Return a copy of ``d`` without entries whose value is ``None`` or ``""``.

    Other falsy values (``0``, ``False``, empty containers) are kept, and the
    insertion order of the surviving keys is preserved. ``d`` itself is not
    modified.
    """
    kept = {}
    for key, value in d.items():
        # Drop only the two "missing value" markers.
        if value in (None, ""):
            continue
        kept[key] = value
    return kept
774815
@@ -790,14 +831,14 @@ def parse_license_cff(license_value, metadata_result, url):
790831 else :
791832 license_result [constants .PROP_NAME ] = license_value
792833
793-
794834 metadata_result .add_result (
795835 constants .CAT_LICENSE ,
796836 license_result ,
797837 1 ,
798838 constants .TECHNIQUE_FILE_EXPLORATION ,
799839 url
800840 )
841+
801842 except Exception as e :
802843 logging .error (f"Error parsing license from CFF: { str (e )} " )
803844
0 commit comments