Support Early Exit Loss and/or Layer Dropout #1076
Merged · +2,395 −3
Commits (90 total, all by mostafaelhoushi; the diff below shows changes from 18 commits):
97cb9a8  start of layer dropout implementation
4a25c5b  have different dropouts at different layers
ac8ad0b  add option to specify which layers to apply dropout
ae61c85  start early exit loss
735d2a8  parallelize processing of early exit losses
be912a6  use absolute imports
0686dd2  remove unnecessary sync
4e4783f  move early exit loss to separate file and add layers as arg
268813e  perform loss scaling every iteration
ccb4a50  return hidden states as an output rather than storing
ff7d157  ensure last layer is always included
5a23811  return either last logits or hidden states
e11aeba  fix scaling layers
f9e164f  rotational early exit curriculum
069b661  set early exit params from cli
954d097  ensure last layer loss is always calculated
5789745  implement gradual early exit
c3534e6  get streaming to work
d1c6963  Merge branch 'main' into layerskip
7849130  add separate recipe for early exit
df89c4f  port early exit loss code from PR
6cedb19  convert boolean array to indices
a83da5a  decide on hidden outputs by member variable not forward pass
2a8791d  add early exit recipe config
a326937  refactor unembedding
8ba6ab4  got early exit loss to work
681e7ca  add TopV2 instruction set
119ac7d  ensure all early exit loss params from cfg file are passed to code
3ec9d23  fix gradual early exit
04a590f  add test cases for early exit loss
9b5c96a  add more assertions for rotational early exit
3319ab0  test to follow training code
619b3eb  fix curriculum update
d376ddd  update recipe
ff3977b  reset changes to data loading
75b2e01  code cleanup
33a95f5  rename early_exit to early_exit_loss
5d7e903  address some early exit TODOs
87f2ee0  get layer dropout to work
1de0c2a  clean up early exit curriculum
2b0cdd1  enable grad curriculum for subset of layers + clear hidden_states at …
7973459  add docstring for slice_str_to_array
baed8a9  support commas and add assertion statements
27f6b56  add test cases for slice_to_str_array
63e7c5b  add copyright header
638056b  support single index
a20b07c  add new line at end of file
64210e6  Merge branch 'main' into layerskip
98897a8  add layer dropout test cases
2cc94cc  rename apply_layer_dropout to prepare_layer_dropout
f4f8e02  add test cases for get_scale
fed955e  cleanup get_scale + re-write mathematically equivalent + ensure max s…
ca7d8da  test layer_dropout
0146764  start adding early exit loss and layer dropout to docstring
f599eca  fix and update code and test cases to handle updating last layer sepa…
2437092  change match to if-else for CI
ad090af  add assertion on type of loss fn for early exit loss
cec8cd4  add docstring and slightly change attribute of layer_dropout and earl…
b69f2f3  refactor layer_dropout and add test cases on wrapper
a21cbd3  add TODO comment
eb37cb6  fix error in checking if early exit loss is enabled
2e3f502  change recipe defaults of dataset and layer_drop probability
66a41b2  add detailed docstring to training script
345a0a3  ensure we set last layer early exit enable correctly
20c618c  ensure uniform early exit loss works
f0e8d7f  add documentation to .yaml file and update doc in .py
b03cb57  remove commented lines
199b8dd  remove check on PyTorch version since we assume latest stable PyTorch
6a2d79b  load curriculum step when resuming
e5534ea  repeat arguments in derived classes
d270d1f  rename percent_scale to fraction_scale and change its implementation
e51419c  fixes to docstrings and config examples
40b7987  check if cfg_early_exit_loss has curriculum
0c18595  add comment to explain when has no effect
3e68696  organize early exit loss tests into classes
418951b  fix typo
e5a53f9  test all loss scale types
3567a24  use variable number of subset layers
ae2108d  ensure get_scale returns values between 0 and 1
71707de  add test cases for sigmoid
78aff5a  make prepare_layer_dropout apply on a list of layers rather than a model
0fb373b  Only add `optional` in docstring when argument is optional
b66e23b  add Dropout class and prepare_layer_dropout APIs to docs
cd8be64  add empty line between function description and Args
2675b4c  remove assert statement as we added the check in testing
00d8efa  change loss scale from enum to function
78b8996  change curriculum from enum to function
ed33ba9  rename scale_type to scale_fn
c7f02de  change default
69f840c  update docstring
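The file rendered below covers the layer dropout half of the PR; the early exit loss code itself is not shown in this view. As context only, here is a rough sketch of the idea (the helper names are illustrative placeholders, not the PR's API): hidden states from selected intermediate layers are passed through the model's shared norm and unembedding to produce early-exit logits, and their cross-entropy losses are combined with the final-layer loss using per-layer scales.

import torch
import torch.nn.functional as F

def early_exit_loss_sketch(hidden_states, labels, unembed, norm, layer_scales):
    """Illustrative only: a weighted sum of cross-entropy losses computed from the
    hidden states of selected layers. hidden_states maps layer_id -> [batch, seq, dim];
    unembed/norm stand in for the model's output projection and final norm."""
    total, total_scale = 0.0, 0.0
    for layer_id, h in hidden_states.items():
        logits = unembed(norm(h))  # shared unembedding for every exit
        loss = F.cross_entropy(logits.transpose(1, 2), labels)  # [batch, vocab, seq] vs [batch, seq]
        total = total + layer_scales[layer_id] * loss
        total_scale = total_scale + layer_scales[layer_id]
    return total / total_scale  # normalize the per-layer scales

The commit messages above (loss scaling every iteration, rotational/gradual curricula, ensuring the last layer's loss is always calculated) suggest the PR's real implementation adds curricula and scale schedules on top of this basic weighted sum.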
New file added by the diff (85 lines), implementing the layer dropout utilities:

@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import math
from enum import Enum
from typing import Callable, Optional

import torch

from torchtune.modules.common_utils import slice_str_to_array


class LayerDropout(torch.nn.Module):
    """Wraps the call to a layer so that, during training, a random subset of the
    samples in the batch skips the layer entirely (stochastic depth / layer dropout)."""

    def __init__(self, prob=0.0, dim=0, disable_on_eval=True, seed=None):
        super().__init__()
        self.prob: float = prob
        self.dim = dim
        self.disable_on_eval: bool = disable_on_eval
        self.generator = torch.Generator(device="cpu")
        # Fraction of samples that actually went through the layer in the last forward call.
        self.inferred: Optional[float] = None

        if seed is not None:
            self.generator.manual_seed(seed)

    def forward(self, function: Callable, input: torch.Tensor, *args, **kwargs):
        n = input.shape[self.dim]

        if self.prob == 0 or (self.disable_on_eval and self.training is False):
            self.inferred = 1.0
            return function(input, *args, **kwargs)

        # Sample, per element along `dim`, whether to skip the wrapped function.
        skip = (
            torch.bernoulli(torch.Tensor(n * [self.prob]), generator=self.generator)
            .to(input.device)
            .to(input.dtype)
        )
        self.inferred = 1 - torch.mean(skip)
        # squeeze(-1) (rather than squeeze()) keeps the index 1-D even when only one
        # element is selected, which index_select requires.
        ind_selected = (skip == 0).nonzero().squeeze(-1)

        if ind_selected.numel() > 0:
            x_selected = torch.index_select(input, self.dim, ind_selected)
            out_selected = function(x_selected, *args, **kwargs)

        out = input.clone()
        assert self.dim == 0, "Currently only supporting dropping elements along the 0th dimension"
        if ind_selected.numel() > 0:
            out[ind_selected] = out_selected
        return out


class ScaleType(str, Enum):
    UNIFORM = "uniform"
    EXP = "exp"
    LINEAR = "linear"
    LOG = "log"
    SIN = "sin"
    SIGMOID = "sigmoid"
    STEP = "step"


def get_scale(scale_type: ScaleType, scale_period: int, val: int):
    if scale_period == 0:
        return 1

    # All the equations below aim to make scale = 0 when val = 0, and scale = 1 when
    # val = scale_period. Note that ScaleType.STEP is not mapped here and would raise
    # a KeyError if requested.
    return {
        ScaleType.UNIFORM: 1,
        ScaleType.EXP: math.exp(val * math.log(2) / scale_period) - 1,
        ScaleType.LINEAR: val / scale_period,
        ScaleType.LOG: math.log(val + 1) / math.log(scale_period + 1),
        ScaleType.SIN: math.sin(0.5 * math.pi * val / scale_period),
        ScaleType.SIGMOID: 1 / (1 + math.exp(-10 * (val / scale_period - 0.5))),
    }[scale_type]

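# Illustrative aside, not part of the PR's file: how get_scale spreads dropout
# probabilities across layers. With prob_max = 0.2, EXP scaling, and a hypothetical
# 32-layer model (scale_period = 31):
#   layer 0:  0.2 * (exp( 0 * ln(2) / 31) - 1) = 0.0
#   layer 15: 0.2 * (exp(15 * ln(2) / 31) - 1) ≈ 0.080
#   layer 31: 0.2 * (exp(31 * ln(2) / 31) - 1) = 0.2
# i.e. early layers are rarely dropped and the deepest layer is dropped with prob_max.
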
def create_layer_dropout_modules(
    num_layers: int,
    prob_max: float = 0.0,
    prob_layer_scale: ScaleType = ScaleType.EXP,
    layers_str: Optional[str] = None,
    disable_on_eval: bool = True,
):
    """Create one LayerDropout per layer, with probabilities scaled from 0 up to
    prob_max according to prob_layer_scale; layers_str optionally restricts dropout
    to a subset of layers."""
    layer_dropouts = torch.nn.ModuleList()
    has_dropout = slice_str_to_array(layers_str, num_layers) if layers_str else [True] * num_layers

    for layer_id in range(num_layers):
        prob = (
            prob_max
            * get_scale(scale_type=prob_layer_scale, scale_period=num_layers - 1, val=layer_id)
            if has_dropout[layer_id]
            else 0.0
        )
        assert 0.0 <= prob <= prob_max, f"prob={prob} should be between 0 and {prob_max}"
        # We would like each layer to have a different seed, so that we don't have the
        # same samples skipped across layers. Hence, we use the layer_id as a seed for
        # each layer's dropout.
        layer_dropout = LayerDropout(prob, disable_on_eval=disable_on_eval, seed=layer_id)
        layer_dropouts.append(layer_dropout)

    return layer_dropouts
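As a quick orientation for reviewers, a minimal, hypothetical sketch of how these utilities could be wired up. The toy Linear "layers" and the inline loop are placeholders, not the PR's actual recipe integration (later commits rename apply_layer_dropout to prepare_layer_dropout and refactor the wrapper); only the calling convention dropout(layer, x) follows LayerDropout.forward above.

import torch
from torch import nn

# Toy stand-in for a decoder: four Linear "layers" (placeholder, not a real TransformerDecoder).
layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(4)])
dropouts = create_layer_dropout_modules(
    num_layers=len(layers),
    prob_max=0.2,                    # deepest layer is skipped for ~20% of samples
    prob_layer_scale=ScaleType.EXP,
)

x = torch.randn(8, 16)               # batch of 8 samples; dropout acts along dim 0
for layer, dropout in zip(layers, dropouts):
    # LayerDropout.forward(function, input, ...): during training a random subset
    # of the batch bypasses `layer` and keeps its input unchanged.
    x = dropout(layer, x)
print([d.prob for d in dropouts])    # per-layer probabilities ramp from 0.0 up to 0.2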