huggingface · DaoyuanLi2816 · Jun 16, 2026
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -4085,7 +4085,7 @@ def batch(
                 Successive examples with the same value for that column are in grouped the same batch.
                 This can also be a list of columns if you want to batch by multiple columns.
                 If batching by column, the batch_size is only used to control the size of the batches
-                to group together or slice during acculumation.
+                to group together or slice during accumulation.
 
                 <Added version="4.9.0"/>
             drop_last_batch (`bool`, defaults to `False`):
@@ -7176,7 +7176,7 @@ def _interleave_map_style_datasets(
 
         # Reasoning behind the following operation: keeping the first indices of each dataset
         # while offsetting in order to correspond to the right indices of the concatenated dataset
-        # and flattening to effectively interleave the datasets. Then we remove the exausted datasets
+        # and flattening to effectively interleave the datasets. Then we remove the exhausted datasets
         # and we continue with the following indices, until all datasets are exhausted
         chunks_boundaries = [0] + sorted(set(lengths))
         chunks = zip(chunks_boundaries[:-1], chunks_boundaries[1:])

diff --git a/src/datasets/arrow_reader.py b/src/datasets/arrow_reader.py
@@ -194,7 +194,7 @@ def _read_files(self, files, in_memory=False) -> Table:
             in_memory (bool, default False): Whether to copy the data in-memory.
         """
         if len(files) == 0 or not all(isinstance(f, dict) for f in files):
-            raise ValueError("please provide valid file informations")
+            raise ValueError("please provide valid file information")
         files = copy.deepcopy(files)
         for f in files:
             f["filename"] = os.path.join(self._path, f["filename"])
@@ -499,7 +499,7 @@ def _init(self, relative_instructions):
     @classmethod
     def _read_instruction_from_relative_instructions(cls, relative_instructions):
         """Returns ReadInstruction obj initialized with relative_instructions."""
-        # Use __new__ to bypass __init__ used by public API and not conveniant here.
+        # Use __new__ to bypass __init__ used by public API and not convenient here.
         result = cls.__new__(cls)
         result._init(relative_instructions)  # pylint: disable=protected-access
         return result

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
@@ -550,7 +550,7 @@ def __init__(
         self.hkey_record = []
 
     def __len__(self):
-        """Return the number of writed and staged examples"""
+        """Return the number of written and staged examples"""
         return self._num_examples + len(self.current_examples) + len(self.current_rows)
 
     def __enter__(self):

diff --git a/src/datasets/builder.py b/src/datasets/builder.py
@@ -719,7 +719,7 @@ def download_and_prepare(
 
                 <Added version="2.9.1"/>
             dl_manager (`DownloadManager`, *optional*):
-                Specific `DownloadManger` to use.
+                Specific `DownloadManager` to use.
             base_path (`str`, *optional*):
                 Base path for relative paths that are used to download files. This can be a remote url.
                 If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
@@ -742,7 +742,7 @@ class DataFilesPatternsList(list[str]):
     """
     List of data files patterns (absolute local paths or URLs).
     For each pattern there should also be a list of allowed extensions
-    to keep, or a None ot keep all the files for the pattern.
+    to keep, or a None to keep all the files for the pattern.
     """
 
     def __init__(

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
@@ -1841,7 +1841,7 @@ def __repr__(self):
     @property
     def num_columns(self) -> dict[str, Optional[int]]:
         """Number of columns in each split of the dataset.
-        This can contain None valies if some splits have unknown features (e.g. after a map() operation).
+        This can contain None values if some splits have unknown features (e.g. after a map() operation).
 
         Example:
 
@@ -1858,7 +1858,7 @@ def num_columns(self) -> dict[str, Optional[int]]:
     @property
     def column_names(self) -> dict[str, Optional[list[str]]]:
         """Names of the columns in each split of the dataset.
-        This can contain None valies if some splits have unknown features (e.g. after a map() operation).
+        This can contain None values if some splits have unknown features (e.g. after a map() operation).
 
         Example:
 
@@ -2431,7 +2431,7 @@ def push_to_hub(
         >>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr")
         ```
         """
-        # check to make sure that the user doesnt specify the # of shards and max shard sdize at same time, since these are 2 different ways to specify the same thing
+        # check to make sure that the user doesn't specify the # of shards and max shard size at the same time, since these are 2 different ways to specify the same thing
         if max_shard_size is not None and num_shards is not None:
             raise ValueError(
                 "Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both."

diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
@@ -1875,7 +1875,7 @@ class Features(dict):
           or a dictionary with the relative path to a NIfTI file ("path" key) and its bytes content ("bytes" key).
           This feature loads the NIfTI file lazily with nibabel.
         - [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation.
-        - [`Json`] feature to store unstructred data, e.g. containing mixed/abritrary types. Under the hood
+        - [`Json`] feature to store unstructured data, e.g. containing mixed/arbitrary types. Under the hood
     """
 
     def __init__(*args, **kwargs):

diff --git a/src/datasets/filesystems/compression.py b/src/datasets/filesystems/compression.py
@@ -23,7 +23,7 @@ def __init__(
         The compressed file system can be instantiated from any compressed file.
         It reads the contents of compressed file as a filesystem with one file inside, as if it was an archive.
 
-        The single file inside the filesystem is named after the compresssed file,
+        The single file inside the filesystem is named after the compressed file,
         without the compression extension at the end of the filename.
 
         Args:

diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py
@@ -315,7 +315,7 @@ def validate_fingerprint(fingerprint: str, max_length=64):
             )
     if len(fingerprint) > max_length:
         raise ValueError(
-            f"Invalid fingerprint. Maximum lenth is {max_length} but '{fingerprint}' has length {len(fingerprint)}."
+            f"Invalid fingerprint. Maximum length is {max_length} but '{fingerprint}' has length {len(fingerprint)}."
             "It could create issues when creating cache files."
         )
 

diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
@@ -1385,7 +1385,7 @@ def __init__(
             elif not ex_iterable.iter_arrow:
                 raise ValueError(
                     f"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has underlying iterable "
-                    f"that is a {type(ex_iterable).__name__} but doesnt' implement iter_arrow(), a possible fix could be "
+                    f"that is a {type(ex_iterable).__name__} but doesn't implement iter_arrow(), a possible fix could be "
                     "to use RebatchedArrowExamplesIterable(..., force_convert_to_arrow=True)."
                 )
             elif ex_iterable.batch_size != (batch_size if batched else 1):
@@ -3466,7 +3466,7 @@ def map(
           Note that the last batch may have less than `n` examples.
           A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.
 
-        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simulatenous calls.
+        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.
         It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.
 
         Args:
@@ -3633,7 +3633,7 @@ def filter(
         """Apply a filter function to all the elements so that the dataset only includes examples according to the filter function.
         The filtering is done on-the-fly when iterating over the dataset.
 
-        If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simulatenous calls (configurable).
+        If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable).
         It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.
 
         Args:
@@ -4405,7 +4405,7 @@ def batch(
                 Successive examples with the same value for that column are in grouped the same batch.
                 This can also be a list of columns if you want to batch by multiple columns.
                 If batching by column, the batch_size is only used to control the size of the batches
-                to group together or slice during acculumation.
+                to group together or slice during accumulation.
 
                 <Added version="4.9.0"/>
             drop_last_batch (`bool`, defaults to `False`):

diff --git a/src/datasets/parallel/parallel.py b/src/datasets/parallel/parallel.py
@@ -44,7 +44,7 @@ def _map_with_multiprocessing_pool(
     function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func
 ):
     num_proc = num_proc if num_proc <= len(iterable) else len(iterable)
-    split_kwds = []  # We organize the splits ourselve (contiguous splits)
+    split_kwds = []  # We organize the splits ourselves (contiguous splits)
     for index in range(num_proc):
         div = len(iterable) // num_proc
         mod = len(iterable) % num_proc

diff --git a/src/datasets/utils/sharding.py b/src/datasets/utils/sharding.py
@@ -3,7 +3,7 @@
 
 def _number_of_shards_in_gen_kwargs(gen_kwargs: dict) -> int:
     """Return the number of possible shards according to the input gen_kwargs"""
-    # Having lists of different sizes makes sharding ambigious, raise an error in this case
+    # Having lists of different sizes makes sharding ambiguous, raise an error in this case
     # until we decide how to define sharding without ambiguity for users
     lists_lengths = {key: len(value) for key, value in gen_kwargs.items() if isinstance(value, list)}
     if len(set(lists_lengths.values())) > 1:
@@ -47,7 +47,7 @@ def _distribute_shards(num_shards: int, max_num_jobs: int) -> list[range]:
 
 def _split_gen_kwargs(gen_kwargs: dict, max_num_jobs: int) -> list[dict]:
     """Split the gen_kwargs into `max_num_job` gen_kwargs"""
-    # Having lists of different sizes makes sharding ambigious, raise an error in this case
+    # Having lists of different sizes makes sharding ambiguous, raise an error in this case
     num_shards = _number_of_shards_in_gen_kwargs(gen_kwargs)
     if num_shards == 1:
         return [dict(gen_kwargs)]