44import os
55from typing import Any , Dict , List
66
# Payload keys that are disregarded when deciding whether two entries are
# duplicates; `main` also lists them (sorted) in its report header.
IGNORE_KEYS = {'retrieved_timestamp', 'evaluation_id'}
88
99
def expand_paths(paths: List[str]) -> List[str]:
    """Expand a mix of file and folder paths into a flat list of JSON files.

    Args:
        paths: Paths to individual ``.json`` files and/or directories.
            Directories are searched recursively.

    Returns:
        The JSON file paths, in the order the inputs were given. Files
        found under a directory are listed in sorted walk order so the
        result is deterministic across runs and platforms.

    Raises:
        FileNotFoundError: If a path is neither an existing file nor an
            existing directory.
        ValueError: If a path is an existing file that does not end in
            ``.json``.
    """
    file_paths: List[str] = []
    for path in paths:
        if os.path.isdir(path):
            for root, dir_names, file_names in os.walk(path):
                # Sorting in place makes os.walk descend in a stable order.
                dir_names.sort()
                file_paths.extend(
                    os.path.join(root, file_name)
                    for file_name in sorted(file_names)
                    if file_name.endswith('.json')
                )
        elif not os.path.isfile(path):
            raise FileNotFoundError(
                f'Could not find file or directory at path: {path}'
            )
        elif not path.endswith('.json'):
            # The path exists, so "could not find" would be misleading here.
            raise ValueError(f'Not a JSON file: {path}')
        else:
            file_paths.append(path)
    return file_paths
2424
2525
def _escape_command_data(value: Any) -> str:
    """Escape a workflow-command message so it stays on a single line."""
    return (
        str(value)
        .replace('%', '%25')
        .replace('\r', '%0D')
        .replace('\n', '%0A')
    )


def _escape_command_property(value: Any) -> str:
    """Escape a workflow-command property value (also `:` and `,`)."""
    return (
        _escape_command_data(value)
        .replace(':', '%3A')
        .replace(',', '%2C')
    )


def annotate_error(file_path: str, message: str, **kwargs) -> None:
    """If run in GitHub Actions, annotate errors.

    Prints an ``::error`` workflow command to stdout; does nothing when
    neither ``GITHUB_ACTIONS`` nor ``GITHUB_ACTION`` is set to a non-empty
    value in the environment.

    Args:
        file_path: Path the annotation is attached to (``file`` property).
        message: Human-readable error text.
        **kwargs: Extra annotation properties (e.g. ``title``, ``line``,
            ``col``), emitted in the order given.
    """
    # GITHUB_ACTIONS is the documented "running in Actions" flag; the
    # original GITHUB_ACTION check is kept for backward compatibility.
    if not (
        os.environ.get('GITHUB_ACTIONS') or os.environ.get('GITHUB_ACTION')
    ):
        return

    # Unescaped `,`, `:`, `%` or newlines would corrupt the command syntax.
    joined_kwargs = ''.join(
        f',{key}={_escape_command_property(value)}'
        for key, value in kwargs.items()
    )
    print(
        f'::error file={_escape_command_property(file_path)}'
        f'{joined_kwargs}::{_escape_command_data(message)}'
    )
3133
3234
3335def normalize_list (items : List [Any ]) -> List [Any ]:
@@ -36,7 +38,7 @@ def normalize_list(items: List[Any]) -> List[Any]:
3638 return sorted (
3739 normalized_items ,
3840 key = lambda item : json .dumps (
39- item , sort_keys = True , separators = ("," , ":" ), ensure_ascii = True
41+ item , sort_keys = True , separators = (',' , ':' ), ensure_ascii = True
4042 ),
4143 )
4244
@@ -58,80 +60,84 @@ def normalized_hash(payload: Dict[str, Any]) -> str:
5860 encoded = json .dumps (
5961 normalized ,
6062 sort_keys = True ,
61- separators = ("," , ":" ),
63+ separators = (',' , ':' ),
6264 ensure_ascii = True ,
6365 )
64- return hashlib .sha256 (encoded .encode (" utf-8" )).hexdigest ()
66+ return hashlib .sha256 (encoded .encode (' utf-8' )).hexdigest ()
6567
6668
def main(argv: List[str] | None = None) -> int:
    """Scan JSON files for duplicate evaluation entries and report them.

    Every JSON file reachable from the given paths is loaded and hashed
    with `normalized_hash`; files sharing a hash form a duplicate group.
    Each duplicate is printed and, via `annotate_error`, annotated.

    Args:
        argv: Command-line arguments. ``None`` is passed straight to
            argparse, which then falls back to ``sys.argv``.

    Returns:
        0 when no duplicates are found, 1 otherwise.

    Raises:
        json.JSONDecodeError: Re-raised, after being reported, when a file
            does not contain valid JSON.
    """
    parser = argparse.ArgumentParser(
        prog='check_duplicate_entries',
        description='Detects duplicate evaluation entries ignoring scrape timestamp fields.',
    )
    parser.add_argument(
        'paths', nargs='+', type=str, help='File or folder paths to JSON data'
    )
    args = parser.parse_args(argv)

    file_paths = expand_paths(args.paths)
    print()
    print(f'Checking {len(file_paths)} JSON files for duplicates...')
    print()

    # Maps normalized hash -> metadata of every file producing that hash.
    groups: Dict[str, List[Dict[str, Any]]] = {}
    for file_path in file_paths:
        try:
            # JSON is UTF-8 by specification; do not rely on the locale.
            with open(file_path, 'r', encoding='utf-8') as f:
                payload = json.load(f)
        except json.JSONDecodeError as e:
            message = f'JSONDecodeError: {e}'
            annotate_error(
                file_path,
                message,
                title='JSONDecodeError',
                col=e.colno,
                line=e.lineno,
            )
            print(file_path)
            print('  ' + message)
            print()
            # Abort the run: a file that cannot be parsed cannot be checked.
            raise

        entry_hash = normalized_hash(payload)
        groups.setdefault(entry_hash, []).append(
            {
                'path': file_path,
                'evaluation_id': payload.get('evaluation_id'),
                'retrieved_timestamp': payload.get('retrieved_timestamp'),
            }
        )

    duplicate_groups = [
        entries for entries in groups.values() if len(entries) > 1
    ]
    if not duplicate_groups:
        print('No duplicates found.')
        print()
        return 0

    ignore_label = ', '.join(f'`{key}`' for key in sorted(IGNORE_KEYS))
    print(f'Found duplicate entries (ignoring keys: {ignore_label}).')
    print()

    for index, entries in enumerate(duplicate_groups, start=1):
        print(f'Duplicate group {index} ({len(entries)} files):')
        for entry in entries:
            print(f'  - {entry["path"]}')
            print(f'    evaluation_id: {entry.get("evaluation_id")}')
            print(
                f'    retrieved_timestamp: {entry.get("retrieved_timestamp")}'
            )
            annotate_error(
                entry['path'],
                'Duplicate entry detected (ignoring `evaluation_id` and `retrieved_timestamp`).',
                title='DuplicateEntry',
            )
        print()

    return 1
135141
# Script entry point: exit with main()'s return code (0 = clean, 1 = duplicates).
if __name__ == '__main__':
    raise SystemExit(main())
0 commit comments