@@ -91,6 +91,8 @@ def read_segs(self, show_pbar=True):
9191 readers .append (read_wmt21_xml (p ))
9292 elif HF_EXT in self .ext :
9393 readers .append (self .read_hfds (p ))
94+ elif 'xlsx' in self .ext :
95+ readers .append (self .read_xlsx (p ))
9496 else :
9597 raise Exception (f'Not supported { self .ext } : { p } ' )
9698
@@ -152,6 +154,26 @@ def read_tsv(self, path, delim='\t', cols=None, skipheader=False, meta_fields=No
152154 out_row .append (metadata )
153155 yield out_row
154156
def read_xlsx(self, path, cols=None):
    """Read rows from the active sheet of an Excel .xlsx file.

    The first row is treated as a header and skipped. Each remaining row is
    reduced to the requested columns; every value is converted to ``str`` and
    stripped, and empty cells become ``''``. Rows in which all requested
    columns are empty are dropped.

    :param path: path to .xlsx file
    :param cols: 0-based column indices to extract; default uses ent.cols or (0, 1)
    :return: generator yielding one list of strings per non-empty row
    :raises ImportError: if openpyxl is not installed
    """
    try:
        from openpyxl import load_workbook
    except ImportError as e:
        raise ImportError("openpyxl is required to read .xlsx files. Run: pip install openpyxl") from e
    if cols is None:
        cols = self.ent.cols if (self.ent and self.ent.cols) else (0, 1)
    wb = load_workbook(path, read_only=True, data_only=True)
    # A read-only workbook must be closed explicitly. Because this is a
    # generator, the consumer may stop early or an error may occur mid-way,
    # so the close has to live in a ``finally`` to run in every case.
    try:
        ws = wb.active
        for row in ws.iter_rows(min_row=2, values_only=True):  # skip header
            out = []
            for c in cols:
                # A column beyond the row's width is just an empty cell in
                # spreadsheet terms; treat it like a None value instead of
                # failing with IndexError.
                value = row[c] if -len(row) <= c < len(row) else None
                out.append('' if value is None else str(value).strip())
            if not any(out):  # every requested cell is empty
                continue
            yield out
    finally:
        wb.close()
176+
155177 @staticmethod
156178 def _nested_get (row , field ):
157179 """Get a value from a dict using dot-separated path for nested access.
@@ -176,11 +198,24 @@ def read_hfds(self, ds):
176198 # in the current version, I am going to retain all fields to see what all fields exist,
177199 # and map the subset of fields as per the dict; so, created rev_map.get(orig,orig)
178200 for row in ds :
179- out_row = [self ._nested_get (row , src_field )]
180- if tgt_field is not None :
181- out_row .append (self ._nested_get (row , tgt_field ))
182- # remap meta fields if necessary
201+ src_val = self ._nested_get (row , src_field )
202+ tgt_val = self ._nested_get (row , tgt_field ) if tgt_field else None
183203 top_keys = {f .split ('.' )[0 ] for f in [src_field ] + ([tgt_field ] if tgt_field else [])}
184204 metadata = {rev_map .get (k , k ): v for k , v in row .items () if k not in top_keys }
185- out_row .append (metadata )
186- yield out_row
205+
206+ src_is_list = isinstance (src_val , list )
207+ tgt_is_list = isinstance (tgt_val , list )
208+ if src_is_list and tgt_is_list :
209+ # Both lists (e.g. SmolDoc srcs/trgs): zip and yield each pair
210+ for s , t in zip (src_val , tgt_val ):
211+ yield [s , t , metadata ]
212+ elif not src_is_list and tgt_is_list :
213+ # Source is scalar, target is list (e.g. GATITOS src/trgs): expand
214+ for t in tgt_val :
215+ yield [src_val , t , metadata ]
216+ else :
217+ out_row = [src_val ]
218+ if tgt_val is not None :
219+ out_row .append (tgt_val )
220+ out_row .append (metadata )
221+ yield out_row
0 commit comments