From 7cef1bac905dc477429c74f4748386a5a0f950db Mon Sep 17 00:00:00 2001
From: Daoyuan Li <94409450+DaoyuanLi2816@users.noreply.github.com>
Date: Tue, 16 Jun 2026 12:05:19 -0700
Subject: [PATCH] Fix typos in docstrings and comments
---
src/datasets/arrow_dataset.py | 4 ++--
src/datasets/arrow_reader.py | 4 ++--
src/datasets/arrow_writer.py | 2 +-
src/datasets/builder.py | 2 +-
src/datasets/data_files.py | 2 +-
src/datasets/dataset_dict.py | 6 +++---
src/datasets/features/features.py | 2 +-
src/datasets/filesystems/compression.py | 2 +-
src/datasets/fingerprint.py | 2 +-
src/datasets/iterable_dataset.py | 8 ++++----
src/datasets/parallel/parallel.py | 2 +-
src/datasets/utils/sharding.py | 4 ++--
12 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 59451a640e6..8b967b795b4 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -4085,7 +4085,7 @@ def batch(
Successive examples with the same value for that column are in grouped the same batch.
This can also be a list of columns if you want to batch by multiple columns.
If batching by column, the batch_size is only used to control the size of the batches
- to group together or slice during acculumation.
+ to group together or slice during accumulation.
drop_last_batch (`bool`, defaults to `False`):
@@ -7176,7 +7176,7 @@ def _interleave_map_style_datasets(
# Reasoning behind the following operation: keeping the first indices of each dataset
# while offsetting in order to correspond to the right indices of the concatenated dataset
- # and flattening to effectively interleave the datasets. Then we remove the exausted datasets
+ # and flattening to effectively interleave the datasets. Then we remove the exhausted datasets
# and we continue with the following indices, until all datasets are exhausted
chunks_boundaries = [0] + sorted(set(lengths))
chunks = zip(chunks_boundaries[:-1], chunks_boundaries[1:])
diff --git a/src/datasets/arrow_reader.py b/src/datasets/arrow_reader.py
index d9cf2cf0f4b..e4946e67517 100644
--- a/src/datasets/arrow_reader.py
+++ b/src/datasets/arrow_reader.py
@@ -194,7 +194,7 @@ def _read_files(self, files, in_memory=False) -> Table:
in_memory (bool, default False): Whether to copy the data in-memory.
"""
if len(files) == 0 or not all(isinstance(f, dict) for f in files):
- raise ValueError("please provide valid file informations")
+ raise ValueError("please provide valid file information")
files = copy.deepcopy(files)
for f in files:
f["filename"] = os.path.join(self._path, f["filename"])
@@ -499,7 +499,7 @@ def _init(self, relative_instructions):
@classmethod
def _read_instruction_from_relative_instructions(cls, relative_instructions):
"""Returns ReadInstruction obj initialized with relative_instructions."""
- # Use __new__ to bypass __init__ used by public API and not conveniant here.
+ # Use __new__ to bypass __init__ used by public API and not convenient here.
result = cls.__new__(cls)
result._init(relative_instructions) # pylint: disable=protected-access
return result
diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 2bd6f9fc613..bdfa39c7ec5 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -550,7 +550,7 @@ def __init__(
self.hkey_record = []
def __len__(self):
- """Return the number of writed and staged examples"""
+ """Return the number of written and staged examples"""
return self._num_examples + len(self.current_examples) + len(self.current_rows)
def __enter__(self):
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index ab677af89f0..65d72435a8f 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -719,7 +719,7 @@ def download_and_prepare(
dl_manager (`DownloadManager`, *optional*):
- Specific `DownloadManger` to use.
+ Specific `DownloadManager` to use.
base_path (`str`, *optional*):
Base path for relative paths that are used to download files. This can be a remote url.
If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 3e63c3ac65f..630e3f95d9f 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -742,7 +742,7 @@ class DataFilesPatternsList(list[str]):
"""
List of data files patterns (absolute local paths or URLs).
For each pattern there should also be a list of allowed extensions
- to keep, or a None ot keep all the files for the pattern.
+ to keep, or a None to keep all the files for the pattern.
"""
def __init__(
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 4abea0a381a..ed5c34f026b 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -1841,7 +1841,7 @@ def __repr__(self):
@property
def num_columns(self) -> dict[str, Optional[int]]:
"""Number of columns in each split of the dataset.
- This can contain None valies if some splits have unknown features (e.g. after a map() operation).
+ This can contain None values if some splits have unknown features (e.g. after a map() operation).
Example:
@@ -1858,7 +1858,7 @@ def num_columns(self) -> dict[str, Optional[int]]:
@property
def column_names(self) -> dict[str, Optional[list[str]]]:
"""Names of the columns in each split of the dataset.
- This can contain None valies if some splits have unknown features (e.g. after a map() operation).
+ This can contain None values if some splits have unknown features (e.g. after a map() operation).
Example:
@@ -2431,7 +2431,7 @@ def push_to_hub(
>>> french_dataset = load_dataset("/", "fr")
```
"""
- # check to make sure that the user doesnt specify the # of shards and max shard sdize at same time, since these are 2 different ways to specify the same thing
+ # check to make sure that the user doesn't specify the # of shards and max shard size at same time, since these are 2 different ways to specify the same thing
if max_shard_size is not None and num_shards is not None:
raise ValueError(
"Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both."
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index 17b3d441960..9aacc774cfb 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -1875,7 +1875,7 @@ class Features(dict):
or a dictionary with the relative path to a NIfTI file ("path" key) and its bytes content ("bytes" key).
This feature loads the NIfTI file lazily with nibabel.
- [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation.
- - [`Json`] feature to store unstructred data, e.g. containing mixed/abritrary types. Under the hood
+ - [`Json`] feature to store unstructured data, e.g. containing mixed/arbitrary types. Under the hood
"""
def __init__(*args, **kwargs):
diff --git a/src/datasets/filesystems/compression.py b/src/datasets/filesystems/compression.py
index 237b06d7500..92ababc9a3a 100644
--- a/src/datasets/filesystems/compression.py
+++ b/src/datasets/filesystems/compression.py
@@ -23,7 +23,7 @@ def __init__(
The compressed file system can be instantiated from any compressed file.
It reads the contents of compressed file as a filesystem with one file inside, as if it was an archive.
- The single file inside the filesystem is named after the compresssed file,
+ The single file inside the filesystem is named after the compressed file,
without the compression extension at the end of the filename.
Args:
diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py
index 13b801621bb..6a18c7c3497 100644
--- a/src/datasets/fingerprint.py
+++ b/src/datasets/fingerprint.py
@@ -315,7 +315,7 @@ def validate_fingerprint(fingerprint: str, max_length=64):
)
if len(fingerprint) > max_length:
raise ValueError(
- f"Invalid fingerprint. Maximum lenth is {max_length} but '{fingerprint}' has length {len(fingerprint)}."
+ f"Invalid fingerprint. Maximum length is {max_length} but '{fingerprint}' has length {len(fingerprint)}."
"It could create issues when creating cache files."
)
diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
index 92fdea2ad4d..22cec4b0b86 100644
--- a/src/datasets/iterable_dataset.py
+++ b/src/datasets/iterable_dataset.py
@@ -1385,7 +1385,7 @@ def __init__(
elif not ex_iterable.iter_arrow:
raise ValueError(
f"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has underlying iterable "
- f"that is a {type(ex_iterable).__name__} but doesnt' implement iter_arrow(), a possible fix could be "
+ f"that is a {type(ex_iterable).__name__} but doesn't implement iter_arrow(), a possible fix could be "
"to use RebatchedArrowExamplesIterable(..., force_convert_to_arrow=True)."
)
elif ex_iterable.batch_size != (batch_size if batched else 1):
@@ -3466,7 +3466,7 @@ def map(
Note that the last batch may have less than `n` examples.
A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.
- If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simulatenous calls.
+ If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.
It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.
Args:
@@ -3633,7 +3633,7 @@ def filter(
"""Apply a filter function to all the elements so that the dataset only includes examples according to the filter function.
The filtering is done on-the-fly when iterating over the dataset.
- If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simulatenous calls (configurable).
+ If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable).
It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.
Args:
@@ -4405,7 +4405,7 @@ def batch(
Successive examples with the same value for that column are in grouped the same batch.
This can also be a list of columns if you want to batch by multiple columns.
If batching by column, the batch_size is only used to control the size of the batches
- to group together or slice during acculumation.
+ to group together or slice during accumulation.
drop_last_batch (`bool`, defaults to `False`):
diff --git a/src/datasets/parallel/parallel.py b/src/datasets/parallel/parallel.py
index 5cad2c48ba2..0606b97b5ff 100644
--- a/src/datasets/parallel/parallel.py
+++ b/src/datasets/parallel/parallel.py
@@ -44,7 +44,7 @@ def _map_with_multiprocessing_pool(
function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func
):
num_proc = num_proc if num_proc <= len(iterable) else len(iterable)
- split_kwds = [] # We organize the splits ourselve (contiguous splits)
+ split_kwds = [] # We organize the splits ourselves (contiguous splits)
for index in range(num_proc):
div = len(iterable) // num_proc
mod = len(iterable) % num_proc
diff --git a/src/datasets/utils/sharding.py b/src/datasets/utils/sharding.py
index 753bdf60c2a..c292f2906cb 100644
--- a/src/datasets/utils/sharding.py
+++ b/src/datasets/utils/sharding.py
@@ -3,7 +3,7 @@
def _number_of_shards_in_gen_kwargs(gen_kwargs: dict) -> int:
"""Return the number of possible shards according to the input gen_kwargs"""
- # Having lists of different sizes makes sharding ambigious, raise an error in this case
+ # Having lists of different sizes makes sharding ambiguous, raise an error in this case
# until we decide how to define sharding without ambiguity for users
lists_lengths = {key: len(value) for key, value in gen_kwargs.items() if isinstance(value, list)}
if len(set(lists_lengths.values())) > 1:
@@ -47,7 +47,7 @@ def _distribute_shards(num_shards: int, max_num_jobs: int) -> list[range]:
def _split_gen_kwargs(gen_kwargs: dict, max_num_jobs: int) -> list[dict]:
"""Split the gen_kwargs into `max_num_job` gen_kwargs"""
- # Having lists of different sizes makes sharding ambigious, raise an error in this case
+ # Having lists of different sizes makes sharding ambiguous, raise an error in this case
num_shards = _number_of_shards_in_gen_kwargs(gen_kwargs)
if num_shards == 1:
return [dict(gen_kwargs)]