diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 59451a640e6..8b967b795b4 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -4085,7 +4085,7 @@ def batch( Successive examples with the same value for that column are in grouped the same batch. This can also be a list of columns if you want to batch by multiple columns. If batching by column, the batch_size is only used to control the size of the batches - to group together or slice during acculumation. + to group together or slice during accumulation. drop_last_batch (`bool`, defaults to `False`): @@ -7176,7 +7176,7 @@ def _interleave_map_style_datasets( # Reasoning behind the following operation: keeping the first indices of each dataset # while offsetting in order to correspond to the right indices of the concatenated dataset - # and flattening to effectively interleave the datasets. Then we remove the exausted datasets + # and flattening to effectively interleave the datasets. Then we remove the exhausted datasets # and we continue with the following indices, until all datasets are exhausted chunks_boundaries = [0] + sorted(set(lengths)) chunks = zip(chunks_boundaries[:-1], chunks_boundaries[1:]) diff --git a/src/datasets/arrow_reader.py b/src/datasets/arrow_reader.py index d9cf2cf0f4b..e4946e67517 100644 --- a/src/datasets/arrow_reader.py +++ b/src/datasets/arrow_reader.py @@ -194,7 +194,7 @@ def _read_files(self, files, in_memory=False) -> Table: in_memory (bool, default False): Whether to copy the data in-memory. 
""" if len(files) == 0 or not all(isinstance(f, dict) for f in files): - raise ValueError("please provide valid file informations") + raise ValueError("please provide valid file information") files = copy.deepcopy(files) for f in files: f["filename"] = os.path.join(self._path, f["filename"]) @@ -499,7 +499,7 @@ def _init(self, relative_instructions): @classmethod def _read_instruction_from_relative_instructions(cls, relative_instructions): """Returns ReadInstruction obj initialized with relative_instructions.""" - # Use __new__ to bypass __init__ used by public API and not conveniant here. + # Use __new__ to bypass __init__ used by public API and not convenient here. result = cls.__new__(cls) result._init(relative_instructions) # pylint: disable=protected-access return result diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index 2bd6f9fc613..bdfa39c7ec5 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -550,7 +550,7 @@ def __init__( self.hkey_record = [] def __len__(self): - """Return the number of writed and staged examples""" + """Return the number of written and staged examples""" return self._num_examples + len(self.current_examples) + len(self.current_rows) def __enter__(self): diff --git a/src/datasets/builder.py b/src/datasets/builder.py index ab677af89f0..65d72435a8f 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -719,7 +719,7 @@ def download_and_prepare( dl_manager (`DownloadManager`, *optional*): - Specific `DownloadManger` to use. + Specific `DownloadManager` to use. base_path (`str`, *optional*): Base path for relative paths that are used to download files. This can be a remote url. If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead. 
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 3e63c3ac65f..630e3f95d9f 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -742,7 +742,7 @@ class DataFilesPatternsList(list[str]): """ List of data files patterns (absolute local paths or URLs). For each pattern there should also be a list of allowed extensions - to keep, or a None ot keep all the files for the pattern. + to keep, or a None to keep all the files for the pattern. """ def __init__( diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 4abea0a381a..ed5c34f026b 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -1841,7 +1841,7 @@ def __repr__(self): @property def num_columns(self) -> dict[str, Optional[int]]: """Number of columns in each split of the dataset. - This can contain None valies if some splits have unknown features (e.g. after a map() operation). + This can contain None values if some splits have unknown features (e.g. after a map() operation). Example: @@ -1858,7 +1858,7 @@ def num_columns(self) -> dict[str, Optional[int]]: @property def column_names(self) -> dict[str, Optional[list[str]]]: """Names of the columns in each split of the dataset. - This can contain None valies if some splits have unknown features (e.g. after a map() operation). + This can contain None values if some splits have unknown features (e.g. after a map() operation). 
Example: @@ -2431,7 +2431,7 @@ def push_to_hub( >>> french_dataset = load_dataset("/", "fr") ``` """ - # check to make sure that the user doesnt specify the # of shards and max shard sdize at same time, since these are 2 different ways to specify the same thing + # check to make sure that the user doesn't specify the # of shards and max shard size at the same time, since these are 2 different ways to specify the same thing if max_shard_size is not None and num_shards is not None: raise ValueError( "Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both." diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 17b3d441960..9aacc774cfb 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -1875,7 +1875,7 @@ class Features(dict): or a dictionary with the relative path to a NIfTI file ("path" key) and its bytes content ("bytes" key). This feature loads the NIfTI file lazily with nibabel. - [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation. - - [`Json`] feature to store unstructred data, e.g. containing mixed/abritrary types. Under the hood + - [`Json`] feature to store unstructured data, e.g. containing mixed/arbitrary types. Under the hood """ def __init__(*args, **kwargs): diff --git a/src/datasets/filesystems/compression.py b/src/datasets/filesystems/compression.py index 237b06d7500..92ababc9a3a 100644 --- a/src/datasets/filesystems/compression.py +++ b/src/datasets/filesystems/compression.py @@ -23,7 +23,7 @@ def __init__( The compressed file system can be instantiated from any compressed file. It reads the contents of compressed file as a filesystem with one file inside, as if it was an archive. - The single file inside the filesystem is named after the compresssed file, + The single file inside the filesystem is named after the compressed file, without the compression extension at the end of the filename. 
Args: diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py index 13b801621bb..6a18c7c3497 100644 --- a/src/datasets/fingerprint.py +++ b/src/datasets/fingerprint.py @@ -315,7 +315,7 @@ def validate_fingerprint(fingerprint: str, max_length=64): ) if len(fingerprint) > max_length: raise ValueError( - f"Invalid fingerprint. Maximum lenth is {max_length} but '{fingerprint}' has length {len(fingerprint)}." + f"Invalid fingerprint. Maximum length is {max_length} but '{fingerprint}' has length {len(fingerprint)}." "It could create issues when creating cache files." ) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 92fdea2ad4d..22cec4b0b86 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -1385,7 +1385,7 @@ def __init__( elif not ex_iterable.iter_arrow: raise ValueError( f"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has underlying iterable " - f"that is a {type(ex_iterable).__name__} but doesnt' implement iter_arrow(), a possible fix could be " + f"that is a {type(ex_iterable).__name__} but doesn't implement iter_arrow(), a possible fix could be " "to use RebatchedArrowExamplesIterable(..., force_convert_to_arrow=True)." ) elif ex_iterable.batch_size != (batch_size if batched else 1): @@ -3466,7 +3466,7 @@ def map( Note that the last batch may have less than `n` examples. A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`. - If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simulatenous calls. + If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls. It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time. 
Args: @@ -3633,7 +3633,7 @@ def filter( """Apply a filter function to all the elements so that the dataset only includes examples according to the filter function. The filtering is done on-the-fly when iterating over the dataset. - If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simulatenous calls (configurable). + If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable). It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time. Args: @@ -4405,7 +4405,7 @@ def batch( Successive examples with the same value for that column are in grouped the same batch. This can also be a list of columns if you want to batch by multiple columns. If batching by column, the batch_size is only used to control the size of the batches - to group together or slice during acculumation. + to group together or slice during accumulation. 
drop_last_batch (`bool`, defaults to `False`): diff --git a/src/datasets/parallel/parallel.py b/src/datasets/parallel/parallel.py index 5cad2c48ba2..0606b97b5ff 100644 --- a/src/datasets/parallel/parallel.py +++ b/src/datasets/parallel/parallel.py @@ -44,7 +44,7 @@ def _map_with_multiprocessing_pool( function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func ): num_proc = num_proc if num_proc <= len(iterable) else len(iterable) - split_kwds = [] # We organize the splits ourselve (contiguous splits) + split_kwds = [] # We organize the splits ourselves (contiguous splits) for index in range(num_proc): div = len(iterable) // num_proc mod = len(iterable) % num_proc diff --git a/src/datasets/utils/sharding.py b/src/datasets/utils/sharding.py index 753bdf60c2a..c292f2906cb 100644 --- a/src/datasets/utils/sharding.py +++ b/src/datasets/utils/sharding.py @@ -3,7 +3,7 @@ def _number_of_shards_in_gen_kwargs(gen_kwargs: dict) -> int: """Return the number of possible shards according to the input gen_kwargs""" - # Having lists of different sizes makes sharding ambigious, raise an error in this case + # Having lists of different sizes makes sharding ambiguous, raise an error in this case # until we decide how to define sharding without ambiguity for users lists_lengths = {key: len(value) for key, value in gen_kwargs.items() if isinstance(value, list)} if len(set(lists_lengths.values())) > 1: @@ -47,7 +47,7 @@ def _distribute_shards(num_shards: int, max_num_jobs: int) -> list[range]: def _split_gen_kwargs(gen_kwargs: dict, max_num_jobs: int) -> list[dict]: """Split the gen_kwargs into `max_num_job` gen_kwargs""" - # Having lists of different sizes makes sharding ambigious, raise an error in this case + # Having lists of different sizes makes sharding ambiguous, raise an error in this case num_shards = _number_of_shards_in_gen_kwargs(gen_kwargs) if num_shards == 1: return [dict(gen_kwargs)]