@@ -89,7 +89,7 @@ def introduce_null_values_movielens(extract_dir):
8989 # Seed for reproducibility - same NULLs every time
9090 random .seed (42 )
9191
92- print ("\n 🔧 Systematically introducing NULL values in MovieLens CSV files..." )
92+ print ("\n [WORK] Systematically introducing NULL values in MovieLens CSV files..." )
9393 overall_start = time .time ()
9494
9595 # Configuration: file -> {nullable_field: percentage}
@@ -155,10 +155,10 @@ def introduce_null_values_movielens(extract_dir):
155155 null_summary = ", " .join (
156156 f"{ count } NULL { field } " for field , count in null_counts .items ()
157157 )
158- print (f" ✅ { filename } : { null_summary } ({ elapsed :.2f} s)" )
158+ print (f" [OK] { filename } : { null_summary } ({ elapsed :.2f} s)" )
159159
160160 overall_elapsed = time .time () - overall_start
161- print (f"\n ⏱️ Total CSV NULL injection time: { overall_elapsed :.2f} s" )
161+ print (f"\n [TIME] Total CSV NULL injection time: { overall_elapsed :.2f} s" )
162162
163163
164164def download_movielens (size = "large" , inject_nulls = True ):
@@ -200,7 +200,7 @@ def download_movielens(size="large", inject_nulls=True):
200200
201201 # Check if already downloaded
202202 if extract_dir .exists ():
203- print (f"✅ Dataset already exists at: { extract_dir } " )
203+ print (f"[OK] Dataset already exists at: { extract_dir } " )
204204 print (f" Size: { config ['description' ]} ({ config ['size_mb' ]} )" )
205205 print ()
206206 for csv_file in ["movies.csv" , "ratings.csv" , "tags.csv" , "links.csv" ]:
@@ -210,10 +210,12 @@ def download_movielens(size="large", inject_nulls=True):
210210 print (f" - { csv_file } : { size_mb :.1f} MB" )
211211
212212 # Ask if user wants to re-introduce NULL values
213- print ("\n 💡 To re-introduce NULL values, delete the data directory and re-run." )
213+ print (
214+ "\n [INFO] To re-introduce NULL values, delete the data directory and re-run."
215+ )
214216 return extract_dir
215217
216- print (f"📥 Downloading MovieLens { size } dataset" )
218+ print (f"[DOWNLOAD] Downloading MovieLens { size } dataset" )
217219 print (f" Description: { config ['description' ]} ({ config ['size_mb' ]} )" )
218220 print (f" URL: { url } " )
219221 print (" This may take a few minutes..." )
@@ -237,11 +239,11 @@ def report_progress(block_num, block_size, total_size):
237239 urllib .request .urlretrieve (url , zip_path , reporthook = report_progress )
238240 print () # New line after progress
239241 download_elapsed = time .time () - download_start
240- print (f"✅ Downloaded to: { zip_path } " f"({ download_elapsed :.2f} s)" )
242+ print (f"[OK] Downloaded to: { zip_path } " f"({ download_elapsed :.2f} s)" )
241243
242244 # Extract
243245 extract_start = time .time ()
244- print ("📦 Extracting..." )
246+ print ("[EXTRACT] Extracting..." )
245247 with zipfile .ZipFile (zip_path , "r" ) as zip_ref :
246248 # Extract to temp dir (zip contains ml-latest/ml-latest-small)
247249 temp_extract = data_dir / "temp_extract"
@@ -261,28 +263,28 @@ def report_progress(block_num, block_size, total_size):
261263 raise Exception (f"Unexpected zip structure in { temp_extract } " )
262264
263265 extract_elapsed = time .time () - extract_start
264- print (f"✅ Extracted to: { extract_dir } ({ extract_elapsed :.2f} s)" )
266+ print (f"[OK] Extracted to: { extract_dir } ({ extract_elapsed :.2f} s)" )
265267
266268 # Introduce NULL values for testing
267269 if inject_nulls :
268270 introduce_null_values_movielens (extract_dir )
269271 else :
270- print ("\n ⏭️ Skipping NULL value injection (--no-nulls flag)" )
272+ print ("\n [SKIP] Skipping NULL value injection (--no-nulls flag)" )
271273
272274 # Show file sizes
273- print ("\n 📊 Dataset contents:" )
275+ print ("\n [STATS] Dataset contents:" )
274276 for csv_file in extract_dir .glob ("*.csv" ):
275277 size_mb = csv_file .stat ().st_size / (1024 * 1024 )
276278 print (f" - { csv_file .name } : { size_mb :.1f} MB" )
277279
278280 # Clean up zip file
279281 zip_path .unlink ()
280- print ("\n 🧹 Cleaned up zip file" )
282+ print ("\n [CLEAN] Cleaned up zip file" )
281283
282284 return extract_dir
283285
284286 except Exception as e :
285- print (f"❌ Error downloading dataset: { e } " )
287+ print (f"[ERROR] Error downloading dataset: { e } " )
286288 print (f" You can manually download from: { url } " )
287289 raise
288290
@@ -296,7 +298,7 @@ def download_stackoverflow(size="small"):
296298 try :
297299 import py7zr
298300 except ImportError :
299- print ("❌ Missing dependency: py7zr" )
301+ print ("[ERROR] Missing dependency: py7zr" )
300302 print (" Install with: pip install py7zr" )
301303 raise
302304
@@ -366,7 +368,7 @@ def download_stackoverflow(size="small"):
366368
367369 # Check if already downloaded
368370 if extract_dir .exists ():
369- print (f"✅ Dataset already exists at: { extract_dir } " )
371+ print (f"[OK] Dataset already exists at: { extract_dir } " )
370372 print (f" Site: { config ['site' ]} " )
371373 print (f" Size: { config ['description' ]} ({ config ['size_mb' ]} )" )
372374 print (f" Date: { config ['date' ]} " )
@@ -380,10 +382,10 @@ def download_stackoverflow(size="small"):
380382 size_mb = xml_file .stat ().st_size / (1024 * 1024 )
381383 print (f" - { xml_file .name } : { size_mb :.1f} MB" )
382384
383- print ("\n 💡 To re-download, delete the data directory and re-run." )
385+ print ("\n [INFO] To re-download, delete the data directory and re-run." )
384386 return extract_dir
385387
386- print (f"📥 Downloading Stack Exchange { size } dataset" )
388+ print (f"[DOWNLOAD] Downloading Stack Exchange { size } dataset" )
387389 print (f" Site: { config ['site' ]} " )
388390 print (f" Description: { config ['description' ]} ({ config ['size_mb' ]} )" )
389391 print (f" Date: { config ['date' ]} (pinned for reproducibility)" )
@@ -411,44 +413,44 @@ def report_progress(block_num, block_size, total_size):
411413 for url in config ["urls" ]:
412414 filename = url .split ("/" )[- 1 ]
413415 archive_path = data_dir / filename
414- print (f"\n 📥 Downloading { filename } " )
416+ print (f"\n [DOWNLOAD] Downloading { filename } " )
415417 file_start = time .time ()
416418 urllib .request .urlretrieve (url , archive_path , reporthook = report_progress )
417419 print () # New line after progress
418420 file_elapsed = time .time () - file_start
419- print (f"✅ Downloaded to: { archive_path } ({ file_elapsed :.2f} s)" )
421+ print (f"[OK] Downloaded to: { archive_path } ({ file_elapsed :.2f} s)" )
420422
421423 # Extract 7z file
422424 extract_start = time .time ()
423- print (f"📦 Extracting { filename } ..." )
425+ print (f"[EXTRACT] Extracting { filename } ..." )
424426 with py7zr .SevenZipFile (archive_path , mode = "r" ) as archive :
425427 archive .extractall (path = extract_dir )
426428
427429 extract_elapsed = time .time () - extract_start
428- print (f"✅ Extracted ({ extract_elapsed :.2f} s)" )
430+ print (f"[OK] Extracted ({ extract_elapsed :.2f} s)" )
429431
430432 # Clean up archive file
431433 archive_path .unlink ()
432434
433435 download_elapsed = time .time () - download_start
434- print (f"\n ⏱️ Total download time: { download_elapsed :.2f} s" )
436+ print (f"\n [TIME] Total download time: { download_elapsed :.2f} s" )
435437
436- print (f"\n ✅ Extracted to: { extract_dir } " )
438+ print (f"\n [OK] Extracted to: { extract_dir } " )
437439
438440 # Show file sizes
439- print ("\n 📊 Dataset contents:" )
441+ print ("\n [STATS] Dataset contents:" )
440442 xml_files = list (extract_dir .glob ("*.xml" ))
441443 if xml_files :
442444 for xml_file in sorted (xml_files ):
443445 size_mb = xml_file .stat ().st_size / (1024 * 1024 )
444446 print (f" - { xml_file .name } : { size_mb :.1f} MB" )
445447 else :
446- print (" ⚠️ No XML files found" )
448+ print (" [WARNING] No XML files found" )
447449
448450 return extract_dir
449451
450452 except Exception as e :
451- print (f"❌ Error downloading dataset: { e } " )
453+ print (f"[ERROR] Error downloading dataset: { e } " )
452454 print (f" You can manually download from: { config ['urls' ][0 ]} " )
453455 raise
454456
@@ -531,7 +533,7 @@ def verify_csv_nulls(extract_dir, dataset_type="movielens", sample_size=None):
531533 results [filename ] = file_results
532534
533535 verification_elapsed = time .time () - verification_start
534- print (f"\n ⏱️ CSV verification time: { verification_elapsed :.2f} s" )
536+ print (f"\n [TIME] CSV verification time: { verification_elapsed :.2f} s" )
535537 return results
536538
537539
@@ -638,44 +640,44 @@ def verify_xml_nulls(extract_dir, sample_size=None):
638640 }
639641
640642 verification_elapsed = time .time () - verification_start
641- print (f"\n ⏱️ XML verification time: { verification_elapsed :.2f} s" )
643+ print (f"\n [TIME] XML verification time: { verification_elapsed :.2f} s" )
642644 return results
643645
644646
645647def print_verification_report (csv_results , xml_results , inject_nulls ):
646648 """Print verification report."""
647649 print ()
648650 print ("=" * 70 )
649- print ("📊 Dataset Verification Report" )
651+ print ("[STATS] Dataset Verification Report" )
650652 print ("=" * 70 )
651653 print ()
652654
653655 # CSV verification (MovieLens)
654656 if csv_results :
655657 if inject_nulls :
656- print ("✅ NULL injection was ENABLED" )
658+ print ("[OK] NULL injection was ENABLED" )
657659 print ()
658660
659661 print ("CSV Files:" )
660662 for filename , data in csv_results .items ():
661663 sampled_note = " (sampled)" if data .get ("sampled" ) else ""
662- print (f" 📄 { filename } { sampled_note } :" )
664+ print (f" [FILE] { filename } { sampled_note } :" )
663665 print (f" Total rows: { data ['total_rows' ]} " )
664666 for field , count in data ["null_counts" ].items ():
665667 pct = (count / data ["total_rows" ]) * 100
666- status = "✅ " if count > 0 else "❌ "
668+ status = "[OK] " if count > 0 else "[ERROR] "
667669 print (f" { status } NULL { field } : { count } ({ pct :.1f} %)" )
668670 print ()
669671
670672 # XML verification (Stack Exchange)
671673 if xml_results :
672674 print ("XML Files:" )
673675 print ()
674- print (" 📊 Stack Exchange data (original, unmodified)" )
676+ print (" [STATS] Stack Exchange data (original, unmodified)" )
675677 print ()
676678 for filename , data in xml_results .items ():
677679 sampled_note = " (sampled)" if data .get ("sampled" ) else ""
678- print (f" 📄 { filename } { sampled_note } :" )
680+ print (f" [FILE] { filename } { sampled_note } :" )
679681 print (f" Total rows: { data ['total_rows' ]} " )
680682 print (f" Unique attributes: { data ['total_attributes' ]} " )
681683 if data ["null_counts" ]:
@@ -690,7 +692,7 @@ def print_verification_report(csv_results, xml_results, inject_nulls):
690692 print ()
691693
692694 print ("=" * 70 )
693- print ("✅ Verification Complete" )
695+ print ("[OK] Verification Complete" )
694696 print ("=" * 70 )
695697
696698
@@ -761,7 +763,11 @@ def main():
761763 args = parser .parse_args ()
762764
763765 print ("=" * 70 )
764- print ("📥 Dataset Download" if not args .verify_only else "📊 Dataset Verification" )
766+ print (
767+ "[DOWNLOAD] Dataset Download"
768+ if not args .verify_only
769+ else "[STATS] Dataset Verification"
770+ )
765771 print ("=" * 70 )
766772 print ()
767773
@@ -779,12 +785,12 @@ def main():
779785 # Verify-only mode
780786 if args .verify_only :
781787 if not extract_dir .exists ():
782- print (f"❌ Dataset not found: { extract_dir } " )
788+ print (f"[ERROR] Dataset not found: { extract_dir } " )
783789 print (" Run without --verify-only to download first." )
784790 return
785791
786- print (f"📂 Verifying existing dataset: { extract_dir } " )
787- print ("⚡ Using smart sampling (100K rows) for fast verification" )
792+ print (f"[DIR] Verifying existing dataset: { extract_dir } " )
793+ print ("[FAST] Using smart sampling (100K rows) for fast verification" )
788794 print ()
789795
790796 sample_size = 100000
@@ -810,15 +816,15 @@ def main():
810816
811817 print ()
812818 print ("=" * 70 )
813- print ("✅ MovieLens Dataset Ready!" )
819+ print ("[OK] MovieLens Dataset Ready!" )
814820 print ("=" * 70 )
815821 print ()
816- print ("💡 Use this dataset in examples:" )
822+ print ("[INFO] Use this dataset in examples:" )
817823 print (f" data_dir = Path('{ extract_dir } ')" )
818824 print (" movies_csv = data_dir / 'movies.csv'" )
819825 print (" ratings_csv = data_dir / 'ratings.csv'" )
820826 print ()
821- print ("📚 Dataset info:" )
827+ print ("[INFO] Dataset info:" )
822828 if size == "large" :
823829 print (" - ~86,000 movies" )
824830 print (" - ~33,000,000 ratings" )
@@ -843,15 +849,15 @@ def main():
843849
844850 print ()
845851 print ("=" * 70 )
846- print ("✅ Stack Exchange Dataset Ready!" )
852+ print ("[OK] Stack Exchange Dataset Ready!" )
847853 print ("=" * 70 )
848854 print ()
849- print ("💡 Use this dataset in examples:" )
855+ print ("[INFO] Use this dataset in examples:" )
850856 print (f" data_dir = Path('{ extract_dir } ')" )
851857 print (" posts_xml = data_dir / 'Posts.xml'" )
852858 print (" users_xml = data_dir / 'Users.xml'" )
853859 print ()
854- print ("📚 Dataset info:" )
860+ print ("[INFO] Dataset info:" )
855861 if size == "small" :
856862 print (" - Site: cs.stackexchange.com" )
857863 print (" - ~80,000 posts (questions + answers)" )
0 commit comments