@@ -19,6 +19,7 @@ def __init__(
1919 ):
2020 self .file_path = file_path
2121 self .file_name = file_name
22+ self .dataset = Path (file_name ).stem
2223 self .encoding = encoding
2324 self .variables_csv_path = (
2425 Path (variables_csv_path )
@@ -32,59 +33,87 @@ def __init__(
3233 )
3334
def read(self) -> dict:
    """Build the complete metadata dictionary for the dataset file.

    The result is composed from four sections, merged in this order:
    dataset-level metadata, the file modification timestamp together
    with the empty ``adam_info`` placeholders, variable-level metadata,
    and data-level metadata. When two sections provide the same key the
    later section's value wins.

    Returns:
        dict: The merged metadata.
    """
    # A dict display evaluates its items left to right, so the helper
    # calls run in the same order as sequential ``update`` calls would.
    return {
        **self.__dataset_metadata(),
        "dataset_modification_date": datetime.fromtimestamp(
            Path(self.file_path).stat().st_mtime
        ).isoformat(),
        "adam_info": {
            "categorization_scheme": {},
            "w_indexes": {},
            "period": {},
            "selection_algorithm": {},
        },
        **self.__variable_metadata(),
        **self.__data_metadata(),
    }
3654
37- if not self .variables_csv_path .exists ():
38- logger = logging .getLogger ("validator" )
39- logger .info ("No variables file found for %s" , dataset_name )
40- variables_meta = {}
41- else :
42- variables_meta = self .__get_variable_metadata (
43- dataset_name , self .variables_csv_path
def __dataset_metadata(self) -> dict:
    """Look up dataset-level metadata in the datasets CSV file.

    The datasets CSV is searched for the single row whose ``Filename``
    value equals ``self.dataset``.

    Returns:
        dict: Always contains ``dataset_name``. It is taken from the
        ``Dataset Name`` column when that column exists, otherwise it is
        the upper-cased ``Filename`` value of the matched row. When the
        datasets file is missing, cannot be read, has no ``Filename``
        column, or does not contain exactly one matching row, the name
        falls back to ``self.dataset``. ``dataset_label`` is included
        only when a unique row was matched and a ``Label`` column exists.
    """
    logger = logging.getLogger("validator")
    # Every failure path returns the same minimal result, so the merged
    # metadata never loses its "dataset_name" key.
    fallback = {"dataset_name": self.dataset}

    if not self.datasets_csv_path.exists():
        logger.info("No datasets file found for %s", self.dataset)
        return fallback

    try:
        datasets_df = pd.read_csv(self.datasets_csv_path, encoding=self.encoding)
    except UnicodeError as e:  # UnicodeDecodeError is a subclass
        # Report the datasets CSV, which is the file that failed to
        # decode, rather than the dataset file itself.
        logger.error(
            f"\n Error reading CSV from: {self.datasets_csv_path}"
            f"\n Failed to decode with {self.encoding} encoding: {e}"
            f"\n Please specify the correct encoding using the -e flag."
        )
        return fallback
    except Exception as e:
        logger.error(
            "Error reading CSV file %s. %s", self.datasets_csv_path, e
        )
        return fallback

    if "Filename" not in datasets_df.columns:
        return fallback

    match = datasets_df[datasets_df["Filename"] == self.dataset]

    # Zero rows means the dataset is not described; several rows are
    # ambiguous. Neither case yields usable metadata.
    if len(match) != 1:
        logger.info("No unique datasets entry found for %s", self.dataset)
        return fallback

    single_match = match.iloc[0]

    result = {
        "dataset_name": (
            single_match["Dataset Name"]
            if "Dataset Name" in datasets_df.columns
            else str(single_match["Filename"]).upper()
        )
    }
    # "Label" is optional: omit the key instead of raising KeyError.
    if "Label" in datasets_df.columns:
        result["dataset_label"] = str(single_match["Label"])
    return result
58- metadata .update (variables_meta )
59- metadata .update (self .__data_meta ())
60- metadata .update (self .__dataset_label ())
61- return metadata
6293
63- def __get_variable_metadata (
64- self , dataset_name : str , variables_file_path : Path
94+ def __variable_metadata (
95+ self ,
6596 ) -> dict :
6697 logger = logging .getLogger ("validator" )
98+ if not self .variables_csv_path .exists ():
99+ logger .info ("No variables file found for %s" , self .dataset )
100+ return {}
67101 try :
68- meta_df = pd .read_csv (variables_file_path , encoding = self .encoding )
102+ meta_df = pd .read_csv (self . variables_csv_path , encoding = self .encoding )
69103 except (UnicodeDecodeError , UnicodeError ) as e :
70104 logger .error (
71- f"Could not decode CSV file { variables_file_path } with { self .encoding } encoding: { e } . "
105+ f"Could not decode CSV file { self . variables_csv_path } with { self .encoding } encoding: { e } . "
72106 f"Please specify the correct encoding using the -e flag."
73107 )
74108 return {}
75109 except Exception as e :
76110 logger .error ("Error reading CSV file %s. %s" , self .file_path , e )
77111 return {}
78112
79- meta_df ["dataset" ] = meta_df ["dataset" ].apply (
80- lambda x : Path (str (x )).stem .lower ()
81- )
82-
83- dataset_meta_df = meta_df [meta_df ["dataset" ] == dataset_name ]
113+ dataset_meta_df = meta_df [meta_df ["dataset" ] == self .dataset ]
84114
85115 if dataset_meta_df .empty :
86- logger = logging .getLogger ("validator" )
87- logger .info ("No dataset metadata found for %s" , dataset_name )
116+ logger .info ("No dataset metadata found for %s" , self .dataset )
88117 return {}
89118
90119 variable_names = dataset_meta_df ["variable" ].tolist ()
@@ -95,7 +124,16 @@ def __get_variable_metadata(
95124 zip (variable_names , dataset_meta_df ["type" ])
96125 )
97126 variable_name_to_size_map = {
98- var : (int (length ) if pd .notna (length ) else None )
127+ var : (
128+ int (length )
129+ if pd .notna (length )
130+ and (
131+ # Because NaN is a float, pandas forces an array of integers with any missing values to become floating point
132+ isinstance (length , int | float )
133+ or (isinstance (length , str ) and length .isdigit ())
134+ )
135+ else None
136+ )
99137 for var , length in zip (variable_names , dataset_meta_df ["length" ])
100138 }
101139 return {
@@ -108,41 +146,7 @@ def __get_variable_metadata(
108146 "number_of_variables" : len (variable_names ),
109147 }
110148
111- def __dataset_label (self ) -> dict :
112- logger = logging .getLogger ("validator" )
113-
114- if not self .datasets_csv_path .exists ():
115- return {}
116-
117- try :
118- datasets_df = pd .read_csv (self .datasets_csv_path , encoding = self .encoding )
119- except (UnicodeDecodeError , UnicodeError ) as e :
120- logger .error (
121- f"\n Error reading CSV from: { self .file_path } "
122- f"\n Failed to decode with { self .encoding } encoding: { e } "
123- f"\n Please specify the correct encoding using the -e flag."
124- )
125- return {}
126- except Exception as e :
127- logger .error ("Error reading CSV file %s. %s" , self .file_path , e )
128- return {}
129-
130- if "Filename" not in datasets_df .columns or "Label" not in datasets_df .columns :
131- return {}
132-
133- datasets_df ["dataset" ] = datasets_df ["Filename" ].apply (
134- lambda x : Path (str (x )).stem .lower ()
135- )
136-
137- current_dataset = Path (self .file_name ).stem .lower ()
138- match = datasets_df [datasets_df ["dataset" ] == current_dataset ]
139-
140- if match .empty :
141- return {}
142-
143- return {"dataset_label" : str (match .iloc [0 ]["Label" ])}
144-
145- def __data_meta (self ):
149+ def __data_metadata (self ):
146150 logger = logging .getLogger ("validator" )
147151 result = {
148152 "dataset_length" : 0 ,
0 commit comments