Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4085,7 +4085,7 @@ def batch(
Successive examples with the same value for that column are grouped in the same batch.
This can also be a list of columns if you want to batch by multiple columns.
If batching by column, the batch_size is only used to control the size of the batches
to group together or slice during acculumation.
to group together or slice during accumulation.

<Added version="4.9.0"/>
drop_last_batch (`bool`, defaults to `False`):
Expand Down Expand Up @@ -7176,7 +7176,7 @@ def _interleave_map_style_datasets(

# Reasoning behind the following operation: keeping the first indices of each dataset
# while offsetting in order to correspond to the right indices of the concatenated dataset
# and flattening to effectively interleave the datasets. Then we remove the exausted datasets
# and flattening to effectively interleave the datasets. Then we remove the exhausted datasets
# and we continue with the following indices, until all datasets are exhausted
chunks_boundaries = [0] + sorted(set(lengths))
chunks = zip(chunks_boundaries[:-1], chunks_boundaries[1:])
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/arrow_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def _read_files(self, files, in_memory=False) -> Table:
in_memory (bool, default False): Whether to copy the data in-memory.
"""
if len(files) == 0 or not all(isinstance(f, dict) for f in files):
raise ValueError("please provide valid file informations")
raise ValueError("please provide valid file information")
files = copy.deepcopy(files)
for f in files:
f["filename"] = os.path.join(self._path, f["filename"])
Expand Down Expand Up @@ -499,7 +499,7 @@ def _init(self, relative_instructions):
@classmethod
def _read_instruction_from_relative_instructions(cls, relative_instructions):
"""Returns ReadInstruction obj initialized with relative_instructions."""
# Use __new__ to bypass __init__ used by public API and not conveniant here.
# Use __new__ to bypass __init__ used by public API and not convenient here.
result = cls.__new__(cls)
result._init(relative_instructions) # pylint: disable=protected-access
return result
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/arrow_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ def __init__(
self.hkey_record = []

def __len__(self):
"""Return the number of writed and staged examples"""
"""Return the number of written and staged examples"""
return self._num_examples + len(self.current_examples) + len(self.current_rows)

def __enter__(self):
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,7 @@ def download_and_prepare(

<Added version="2.9.1"/>
dl_manager (`DownloadManager`, *optional*):
Specific `DownloadManger` to use.
Specific `DownloadManager` to use.
base_path (`str`, *optional*):
Base path for relative paths that are used to download files. This can be a remote url.
If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,7 @@ class DataFilesPatternsList(list[str]):
"""
List of data files patterns (absolute local paths or URLs).
For each pattern there should also be a list of allowed extensions
to keep, or a None ot keep all the files for the pattern.
to keep, or a None to keep all the files for the pattern.
"""

def __init__(
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -1841,7 +1841,7 @@ def __repr__(self):
@property
def num_columns(self) -> dict[str, Optional[int]]:
"""Number of columns in each split of the dataset.
This can contain None valies if some splits have unknown features (e.g. after a map() operation).
This can contain None values if some splits have unknown features (e.g. after a map() operation).

Example:

Expand All @@ -1858,7 +1858,7 @@ def num_columns(self) -> dict[str, Optional[int]]:
@property
def column_names(self) -> dict[str, Optional[list[str]]]:
"""Names of the columns in each split of the dataset.
This can contain None valies if some splits have unknown features (e.g. after a map() operation).
This can contain None values if some splits have unknown features (e.g. after a map() operation).

Example:

Expand Down Expand Up @@ -2431,7 +2431,7 @@ def push_to_hub(
>>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr")
```
"""
# check to make sure that the user doesnt specify the # of shards and max shard sdize at same time, since these are 2 different ways to specify the same thing
# check to make sure that the user doesn't specify the # of shards and max shard size at the same time, since these are 2 different ways to specify the same thing
if max_shard_size is not None and num_shards is not None:
raise ValueError(
"Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both."
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/features/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -1875,7 +1875,7 @@ class Features(dict):
or a dictionary with the relative path to a NIfTI file ("path" key) and its bytes content ("bytes" key).
This feature loads the NIfTI file lazily with nibabel.
- [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation.
- [`Json`] feature to store unstructred data, e.g. containing mixed/abritrary types. Under the hood
- [`Json`] feature to store unstructured data, e.g. containing mixed/arbitrary types. Under the hood
"""

def __init__(*args, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/filesystems/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(
The compressed file system can be instantiated from any compressed file.
It reads the contents of compressed file as a filesystem with one file inside, as if it was an archive.

The single file inside the filesystem is named after the compresssed file,
The single file inside the filesystem is named after the compressed file,
without the compression extension at the end of the filename.

Args:
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def validate_fingerprint(fingerprint: str, max_length=64):
)
if len(fingerprint) > max_length:
raise ValueError(
f"Invalid fingerprint. Maximum lenth is {max_length} but '{fingerprint}' has length {len(fingerprint)}."
f"Invalid fingerprint. Maximum length is {max_length} but '{fingerprint}' has length {len(fingerprint)}."
"It could create issues when creating cache files."
)

Expand Down
8 changes: 4 additions & 4 deletions src/datasets/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1385,7 +1385,7 @@ def __init__(
elif not ex_iterable.iter_arrow:
raise ValueError(
f"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has underlying iterable "
f"that is a {type(ex_iterable).__name__} but doesnt' implement iter_arrow(), a possible fix could be "
f"that is a {type(ex_iterable).__name__} but doesn't implement iter_arrow(), a possible fix could be "
"to use RebatchedArrowExamplesIterable(..., force_convert_to_arrow=True)."
)
elif ex_iterable.batch_size != (batch_size if batched else 1):
Expand Down Expand Up @@ -3466,7 +3466,7 @@ def map(
Note that the last batch may have less than `n` examples.
A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.

If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simulatenous calls.
If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.
It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

Args:
Expand Down Expand Up @@ -3633,7 +3633,7 @@ def filter(
"""Apply a filter function to all the elements so that the dataset only includes examples according to the filter function.
The filtering is done on-the-fly when iterating over the dataset.

If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simulatenous calls (configurable).
If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable).
It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

Args:
Expand Down Expand Up @@ -4405,7 +4405,7 @@ def batch(
Successive examples with the same value for that column are grouped in the same batch.
This can also be a list of columns if you want to batch by multiple columns.
If batching by column, the batch_size is only used to control the size of the batches
to group together or slice during acculumation.
to group together or slice during accumulation.

<Added version="4.9.0"/>
drop_last_batch (`bool`, defaults to `False`):
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/parallel/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _map_with_multiprocessing_pool(
function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func
):
num_proc = num_proc if num_proc <= len(iterable) else len(iterable)
split_kwds = [] # We organize the splits ourselve (contiguous splits)
split_kwds = [] # We organize the splits ourselves (contiguous splits)
for index in range(num_proc):
div = len(iterable) // num_proc
mod = len(iterable) % num_proc
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/utils/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

def _number_of_shards_in_gen_kwargs(gen_kwargs: dict) -> int:
"""Return the number of possible shards according to the input gen_kwargs"""
# Having lists of different sizes makes sharding ambigious, raise an error in this case
# Having lists of different sizes makes sharding ambiguous, raise an error in this case
# until we decide how to define sharding without ambiguity for users
lists_lengths = {key: len(value) for key, value in gen_kwargs.items() if isinstance(value, list)}
if len(set(lists_lengths.values())) > 1:
Expand Down Expand Up @@ -47,7 +47,7 @@ def _distribute_shards(num_shards: int, max_num_jobs: int) -> list[range]:

def _split_gen_kwargs(gen_kwargs: dict, max_num_jobs: int) -> list[dict]:
"""Split the gen_kwargs into at most `max_num_jobs` gen_kwargs"""
# Having lists of different sizes makes sharding ambigious, raise an error in this case
# Having lists of different sizes makes sharding ambiguous, raise an error in this case
num_shards = _number_of_shards_in_gen_kwargs(gen_kwargs)
if num_shards == 1:
return [dict(gen_kwargs)]
Expand Down