diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py
index 6f3c16fcf..7d7e7611f 100644
--- a/src/llmcompressor/args/dataset_arguments.py
+++ b/src/llmcompressor/args/dataset_arguments.py
@@ -247,6 +247,14 @@ class DatasetArguments(CustomDatasetArguments):
             "Default is set to True."
         },
     )
+    dataloader_num_workers: int = field(
+        default=0,
+        metadata={
+            "help": "Number of worker processes for data loading. Set to 0 to disable "
+            "multiprocessing. Note: Custom data collators may not work with "
+            "multiprocessing. Default is 0."
+        },
+    )
 
     def is_dataset_provided(self) -> bool:
         return self.dataset is not None or self.dataset_path is not None
diff --git a/src/llmcompressor/datasets/utils.py b/src/llmcompressor/datasets/utils.py
index 0d5fceca8..269c45ca7 100644
--- a/src/llmcompressor/datasets/utils.py
+++ b/src/llmcompressor/datasets/utils.py
@@ -131,6 +131,7 @@ def format_calibration_data(
         sampler=_make_sampler(args, tokenized_dataset),
         collate_fn=_make_collate_fn(args, processor),
         pin_memory=False,
+        num_workers=args.dataloader_num_workers,
     )
diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py
index cd0cf6628..ec5ada3ca 100644
--- a/src/llmcompressor/entrypoints/oneshot.py
+++ b/src/llmcompressor/entrypoints/oneshot.py
@@ -261,6 +261,7 @@ def oneshot(
     streaming: bool = False,
     overwrite_cache: bool = False,
     preprocessing_num_workers: int | None = None,
+    dataloader_num_workers: int = 0,
     min_tokens_per_module: float | None = None,
     moe_calibrate_all_experts: bool = True,
     quantization_aware_calibration: bool = True,
@@ -329,6 +330,9 @@
     :param streaming: True to stream data from a cloud dataset.
     :param overwrite_cache: Whether to overwrite the cached preprocessed datasets.
     :param preprocessing_num_workers: Number of processes for dataset preprocessing.
+    :param dataloader_num_workers: Number of worker processes for data loading. Set to 0
+        to disable multiprocessing. Note: Custom data collators may not work with
+        multiprocessing. Default is 0.
     :param min_tokens_per_module: Minimum percentage of tokens per module, relevant
         for MoE models.
     :param moe_calibrate_all_experts: Whether to calibrate all experts during MoE
diff --git a/src/llmcompressor/pipelines/cache.py b/src/llmcompressor/pipelines/cache.py
index b647c6824..d6789f254 100644
--- a/src/llmcompressor/pipelines/cache.py
+++ b/src/llmcompressor/pipelines/cache.py
@@ -69,6 +69,12 @@ def from_dataloader(
         """
         Initialize a cache with data from the provided dataloader
 
+        This method iterates through all batches in the dataloader and offloads
+        them to the specified device. For faster cache preparation, consider:
+        - Increasing batch_size to reduce the number of iterations
+        - Using num_workers > 0 in the DataLoader for parallel loading
+        - Ensuring data preprocessing is done before creating the dataloader
+
         :param dataloader: dataloader which generates values to be cached
         :param model_device: device which values will be onloaded to when fetched
         :param offload_device: device to offload values to
@@ -234,7 +240,7 @@ def _offload_value(
         match value:
            case torch.Tensor():
                return IntermediateValue(
-                   value=value.to(device=offload_device),
+                   value=value.to(device=offload_device) if offload_device is not None else value,
                    device=(onload_device if onload_device else value.device),
                )
            case list():
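
For reference, a minimal usage sketch of the new argument. This is an assumption-laden example: the model name, dataset, and GPTQModifier recipe mirror the project's existing oneshot quantization examples and are illustrative only; the dataloader_num_workers keyword is the part introduced by this diff.

# Minimal usage sketch (assumption: model, dataset, and recipe are illustrative,
# taken from the style of existing oneshot examples; not part of this PR).
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    max_seq_length=2048,
    num_calibration_samples=512,
    # New in this diff: spawn 4 DataLoader worker processes for calibration data
    # loading. Leave at the default (0) if a custom data collator cannot be pickled.
    dataloader_num_workers=4,
)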