diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py
index 6f3c16fcf..7d7e7611f 100644
--- a/src/llmcompressor/args/dataset_arguments.py
+++ b/src/llmcompressor/args/dataset_arguments.py
@@ -247,6 +247,14 @@ class DatasetArguments(CustomDatasetArguments):
             "Default is set to True."
         },
     )
+    dataloader_num_workers: int = field(
+        default=0,
+        metadata={
+            "help": "Number of worker processes for data loading. Set to 0 to disable "
+            "multiprocessing. Note: Custom data collators may not work with "
+            "multiprocessing. Default is 0."
+        },
+    )
 
     def is_dataset_provided(self) -> bool:
         return self.dataset is not None or self.dataset_path is not None
diff --git a/src/llmcompressor/datasets/utils.py b/src/llmcompressor/datasets/utils.py
index 0d5fceca8..269c45ca7 100644
--- a/src/llmcompressor/datasets/utils.py
+++ b/src/llmcompressor/datasets/utils.py
@@ -131,6 +131,7 @@ def format_calibration_data(
         sampler=_make_sampler(args, tokenized_dataset),
         collate_fn=_make_collate_fn(args, processor),
         pin_memory=False,
+        num_workers=args.dataloader_num_workers,
     )
diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py
index cd0cf6628..ec5ada3ca 100644
--- a/src/llmcompressor/entrypoints/oneshot.py
+++ b/src/llmcompressor/entrypoints/oneshot.py
@@ -261,6 +261,7 @@ def oneshot(
     streaming: bool = False,
     overwrite_cache: bool = False,
     preprocessing_num_workers: int | None = None,
+    dataloader_num_workers: int = 0,
     min_tokens_per_module: float | None = None,
     moe_calibrate_all_experts: bool = True,
     quantization_aware_calibration: bool = True,
@@ -329,6 +330,9 @@
     :param streaming: True to stream data from a cloud dataset.
     :param overwrite_cache: Whether to overwrite the cached preprocessed datasets.
     :param preprocessing_num_workers: Number of processes for dataset preprocessing.
+    :param dataloader_num_workers: Number of worker processes for data loading. Set to 0
+        to disable multiprocessing. Note: Custom data collators may not work with
+        multiprocessing. Default is 0.
     :param min_tokens_per_module: Minimum percentage of tokens per module, relevant
         for MoE models.
     :param moe_calibrate_all_experts: Whether to calibrate all experts during MoE
diff --git a/src/llmcompressor/pipelines/cache.py b/src/llmcompressor/pipelines/cache.py
index b647c6824..d6789f254 100644
--- a/src/llmcompressor/pipelines/cache.py
+++ b/src/llmcompressor/pipelines/cache.py
@@ -69,6 +69,12 @@ def from_dataloader(
         """
         Initialize a cache with data from the provided dataloader
 
+        This method iterates through all batches in the dataloader and offloads
+        them to the specified device. For faster cache preparation, consider:
+        - Increasing batch_size to reduce the number of iterations
+        - Using num_workers > 0 in the DataLoader for parallel loading
+        - Ensuring data preprocessing is done before creating the dataloader
+
         :param dataloader: dataloader which generates values to be cached
         :param model_device: device which values will be onloaded to when fetched
         :param offload_device: device to offload values to
@@ -234,7 +240,7 @@ def _offload_value(
         match value:
            case torch.Tensor():
                return IntermediateValue(
-                   value=value.to(device=offload_device),
+                   value=value.to(device=offload_device) if offload_device is not None else value,
                    device=(onload_device if onload_device else value.device),
                )
            case list():
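
For reference, a minimal usage sketch of the new argument. This is an assumption-laden example: the model name, dataset, and GPTQModifier recipe mirror the project's existing oneshot quantization examples and are illustrative only; the dataloader_num_workers keyword is the part introduced by this diff.

# Minimal usage sketch (assumption: model, dataset, and recipe are illustrative,
# taken from the style of existing oneshot examples; not part of this PR).
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    max_seq_length=2048,
    num_calibration_samples=512,
    # New in this diff: spawn 4 DataLoader worker processes for calibration data
    # loading. Leave at the default (0) if a custom data collator cannot be pickled.
    dataloader_num_workers=4,
)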