@@ -724,19 +724,43 @@ def download_stackoverflow(size="small"):
724724 'large' (~10 GB subset), 'xlarge' (~50 GB subset), or
725725 'full' (~323 GB)
726726 """
# XML files that must all be present before a source dataset directory is
# treated as usable; if any is missing the caller re-downloads the dataset.
required_xml_files = (
    "Posts.xml",
    "Users.xml",
    "Comments.xml",
    "Tags.xml",
    "Badges.xml",
    "PostLinks.xml",
    "PostHistory.xml",
    "Votes.xml",
)


def has_required_xml_files(dataset_dir: Path) -> bool:
    """Return True when *dataset_dir* contains every required XML file.

    Args:
        dataset_dir: Directory expected to hold the source dataset.

    Returns:
        True only if ``dataset_dir`` is a directory and each name listed in
        ``required_xml_files`` exists inside it as a regular file (symlinks
        to files count). False otherwise, including when ``dataset_dir``
        does not exist.
    """
    if not dataset_dir.is_dir():
        return False
    # is_file() rather than exists(): a directory that merely carries one of
    # the required names must not be mistaken for the data file itself.
    return all(
        (dataset_dir / filename).is_file() for filename in required_xml_files
    )
742+
727743 # Create data directory
728744 data_dir = Path (__file__ ).parent / "data"
729745 data_dir .mkdir (exist_ok = True )
730746
731747 if size == "tiny" :
732748 source_dir = data_dir / "stackoverflow-small"
733- if not source_dir .exists ():
749+ if not has_required_xml_files (source_dir ):
750+ print (
751+ "[INFO] stackoverflow-small is missing required XML files; "
752+ "downloading a fresh source dataset"
753+ )
734754 download_stackoverflow (size = "small" )
735755 return create_stackoverflow_tiny (source_dir = source_dir )
736756
737757 if size == "large" :
738758 source_dir = data_dir / "stackoverflow-full"
739- if not source_dir .exists ():
759+ if not has_required_xml_files (source_dir ):
760+ print (
761+ "[INFO] stackoverflow-full is missing required XML files; "
762+ "downloading a fresh source dataset"
763+ )
740764 download_stackoverflow (size = "full" )
741765 return create_stackoverflow_large (
742766 source_dir = source_dir ,
@@ -747,7 +771,11 @@ def download_stackoverflow(size="small"):
747771
748772 if size == "xlarge" :
749773 source_dir = data_dir / "stackoverflow-full"
750- if not source_dir .exists ():
774+ if not has_required_xml_files (source_dir ):
775+ print (
776+ "[INFO] stackoverflow-full is missing required XML files; "
777+ "downloading a fresh source dataset"
778+ )
751779 download_stackoverflow (size = "full" )
752780 return create_stackoverflow_large (
753781 source_dir = source_dir ,
0 commit comments