@@ -107,7 +107,7 @@ def valid_extensions(formats):
107107
108108def ratio (
109109 input , output = "output" , seed = 1337 , ratio = (0.8 , 0.1 , 0.1 ),
110- group_prefix = None , group = None , move = False , formats = None ,
110+ group_prefix = None , group = None , move = False , formats = None , shuffle = True ,
111111):
112112 if not round (sum (ratio ), 5 ) == 1 : # round for floating imprecision
113113 raise ValueError ("The sums of `ratio` is over 1." )
@@ -121,11 +121,14 @@ def ratio(
121121 prog_bar = tqdm (desc = "Copying files" , unit = " files" )
122122
123123 if group == "sibling" :
124- split_sibling_dirs_ratio (input , output , ratio , seed , prog_bar if use_tqdm else None , move , formats )
124+ split_sibling_dirs_ratio (
125+ input , output , ratio , seed , prog_bar if use_tqdm else None , move , formats , shuffle ,
126+ )
125127 else :
126128 for class_dir in list_dirs (input ):
127129 split_class_dir_ratio (
128- class_dir , output , ratio , seed , prog_bar if use_tqdm else None , group_prefix , group , move , formats
130+ class_dir , output , ratio , seed , prog_bar if use_tqdm else None ,
131+ group_prefix , group , move , formats , shuffle ,
129132 )
130133
131134 if use_tqdm :
@@ -134,7 +137,7 @@ def ratio(
134137
135138def fixed (
136139 input , output = "output" , seed = 1337 , fixed = (100 , 100 ), oversample = False ,
137- group_prefix = None , group = None , move = False , formats = None ,
140+ group_prefix = None , group = None , move = False , formats = None , shuffle = True ,
138141):
139142 check_input_format (input )
140143 valid_extensions (formats )
@@ -170,7 +173,9 @@ def fixed(
170173 prog_bar = tqdm (desc = "Copying files" , unit = " files" )
171174
172175 if group == "sibling" :
173- split_sibling_dirs_fixed (input , output , fixed , seed , prog_bar if use_tqdm else None , move , formats )
176+ split_sibling_dirs_fixed (
177+ input , output , fixed , seed , prog_bar if use_tqdm else None , move , formats , shuffle ,
178+ )
174179 if use_tqdm :
175180 prog_bar .close ()
176181 return
@@ -180,7 +185,8 @@ def fixed(
180185 for class_dir in classes_dirs :
181186 num_items .append (
182187 split_class_dir_fixed (
183- class_dir , output , fixed , seed , prog_bar if use_tqdm else None , group_prefix , group , move , formats
188+ class_dir , output , fixed , seed , prog_bar if use_tqdm else None ,
189+ group_prefix , group , move , formats , shuffle ,
184190 )
185191 )
186192
@@ -222,7 +228,10 @@ def fixed(
222228 shutil .copy2 (str (f_orig ), str (f_dest ))
223229
224230
225- def kfold (input , output = "output" , seed = 1337 , k = 5 , group_prefix = None , group = None , move = "symlink" , formats = None ):
231+ def kfold (
232+ input , output = "output" , seed = 1337 , k = 5 , group_prefix = None , group = None ,
233+ move = "symlink" , formats = None , shuffle = True ,
234+ ):
226235 if k < 2 :
227236 raise ValueError ("`k` must be 2 or greater." )
228237
@@ -233,24 +242,26 @@ def kfold(input, output="output", seed=1337, k=5, group_prefix=None, group=None,
233242 prog_bar = tqdm (desc = "Copying files" , unit = " files" )
234243
235244 if group == "sibling" :
236- split_sibling_dirs_kfold (input , output , k , seed , prog_bar if use_tqdm else None , move , formats )
245+ split_sibling_dirs_kfold (
246+ input , output , k , seed , prog_bar if use_tqdm else None , move , formats , shuffle ,
247+ )
237248 else :
238249 for class_dir in list_dirs (input ):
239250 split_class_dir_kfold (
240251 class_dir , output , k , seed , prog_bar if use_tqdm else None ,
241- group_prefix , group , move , formats ,
252+ group_prefix , group , move , formats , shuffle ,
242253 )
243254
244255 if use_tqdm :
245256 prog_bar .close ()
246257
247258
248- def split_class_dir_kfold (class_dir , output , k , seed , prog_bar , group_prefix , group , move , formats ):
259+ def split_class_dir_kfold (class_dir , output , k , seed , prog_bar , group_prefix , group , move , formats , shuffle = True ):
249260 """
250261 Splits a class folder into k folds for cross-validation.
251262 Each fold directory gets train/ and val/ subdirectories.
252263 """
253- files = setup_files (class_dir , seed , group_prefix , group , formats )
264+ files = setup_files (class_dir , seed , group_prefix , group , formats , shuffle )
254265
255266 # Partition files into k roughly equal chunks
256267 fold_size = len (files ) // k
@@ -271,25 +282,26 @@ def split_class_dir_kfold(class_dir, output, k, seed, prog_bar, group_prefix, gr
271282 copy_files (li , class_dir , fold_output , prog_bar , move )
272283
273284
def setup_files(class_dir, seed, group_prefix=None, group=None, formats=None, shuffle=True):
    """
    Collects the files of a class folder in a reproducible order.

    The module-level `random` generator is seeded with `seed` on every call,
    whether or not shuffling is requested. The entries returned by
    `list_files(class_dir, formats)` are passed through `resolve_grouping`,
    sorted in place, and then shuffled unless `shuffle` is falsy.

    Returns the sorted (and optionally shuffled) list.
    """
    # Seed before anything else so the shuffle below is repeatable per seed.
    random.seed(seed)

    grouped = resolve_grouping(list_files(class_dir, formats), group_prefix, group)

    # Sorting gives the shuffle a fixed starting order for a given set of entries.
    grouped.sort()

    if not shuffle:
        return grouped

    random.shuffle(grouped)
    return grouped
286298
287299
288- def split_class_dir_ratio (class_dir , output , ratio , seed , prog_bar , group_prefix , group , move , formats ):
300+ def split_class_dir_ratio (class_dir , output , ratio , seed , prog_bar , group_prefix , group , move , formats , shuffle = True ):
289301 """
290302 Splits a class folder
291303 """
292- files = setup_files (class_dir , seed , group_prefix , group , formats )
304+ files = setup_files (class_dir , seed , group_prefix , group , formats , shuffle )
293305
294306 # the data was shuffled already
295307 split_train_idx = int (ratio [0 ] * len (files ))
@@ -299,11 +311,11 @@ def split_class_dir_ratio(class_dir, output, ratio, seed, prog_bar, group_prefix
299311 copy_files (li , class_dir , output , prog_bar , move )
300312
301313
302- def split_class_dir_fixed (class_dir , output , fixed , seed , prog_bar , group_prefix , group , move , formats ):
314+ def split_class_dir_fixed (class_dir , output , fixed , seed , prog_bar , group_prefix , group , move , formats , shuffle = True ):
303315 """
304316 Splits a class folder and returns the total number of files
305317 """
306- files = setup_files (class_dir , seed , group_prefix , group , formats )
318+ files = setup_files (class_dir , seed , group_prefix , group , formats , shuffle )
307319
308320 if not len (files ) >= sum (fixed ):
309321 raise ValueError (
@@ -389,8 +401,8 @@ def copy_sibling_files(files_type, type_dir_names, output, prog_bar, move):
389401 copy_fn (f , full_path )
390402
391403
392- def split_sibling_dirs_ratio (input_dir , output , ratio , seed , prog_bar , move , formats ):
393- type_dir_names , groups = setup_sibling_files (input_dir , seed , formats )
404+ def split_sibling_dirs_ratio (input_dir , output , ratio , seed , prog_bar , move , formats , shuffle = True ):
405+ type_dir_names , groups = setup_sibling_files (input_dir , seed , formats , shuffle )
394406
395407 split_train_idx = int (ratio [0 ] * len (groups ))
396408 split_val_idx = split_train_idx + int (ratio [1 ] * len (groups ))
@@ -399,8 +411,8 @@ def split_sibling_dirs_ratio(input_dir, output, ratio, seed, prog_bar, move, for
399411 copy_sibling_files (li , type_dir_names , output , prog_bar , move )
400412
401413
402- def split_sibling_dirs_fixed (input_dir , output , fixed , seed , prog_bar , move , formats ):
403- type_dir_names , groups = setup_sibling_files (input_dir , seed , formats )
414+ def split_sibling_dirs_fixed (input_dir , output , fixed , seed , prog_bar , move , formats , shuffle = True ):
415+ type_dir_names , groups = setup_sibling_files (input_dir , seed , formats , shuffle )
404416
405417 if not len (groups ) >= sum (fixed ):
406418 raise ValueError (
@@ -425,8 +437,8 @@ def split_sibling_dirs_fixed(input_dir, output, fixed, seed, prog_bar, move, for
425437 copy_sibling_files (li , type_dir_names , output , prog_bar , move )
426438
427439
428- def split_sibling_dirs_kfold (input_dir , output , k , seed , prog_bar , move , formats ):
429- type_dir_names , groups = setup_sibling_files (input_dir , seed , formats )
440+ def split_sibling_dirs_kfold (input_dir , output , k , seed , prog_bar , move , formats , shuffle = True ):
441+ type_dir_names , groups = setup_sibling_files (input_dir , seed , formats , shuffle )
430442
431443 fold_size = len (groups ) // k
432444 remainder = len (groups ) % k
0 commit comments