Float8 autoquant weight only #866
@@ -69,7 +69,10 @@ def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
         input = (
             input.contiguous()
         )  # (it seems the transpose makes cublas check the above j constraint on i)
-        return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
+        try:
+            return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
+        except:
+            return torch.matmul(input.to(torch.float32), mat2.to(torch.float32)).to(torch.int32)
 else:
     def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
         """

Review comment (on the added try/except): maybe adding a comment to this would be helpful, how these two branches are handled?

Reply: The except is executed if it's a float8 dtype on H100, as there's no implementation for addmm_cuda for float8 dtypes. Added as a comment.
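For reference, here is a minimal self-contained sketch of the behaviour the reply describes. It assumes `out_dtype` is imported from `torch._higher_order_ops.out_dtype`, as in the surrounding file; the name `safe_int_mm_sketch`, the tensor shapes, and the float8 usage lines are illustrative, not part of the PR.

import torch
from torch._higher_order_ops.out_dtype import out_dtype  # assumed import path

def safe_int_mm_sketch(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
    try:
        # Fast path: integer matmul with int32 accumulation via the out_dtype op.
        return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
    except Exception:
        # Per the reply above, float8 inputs on H100 land here because there is
        # no addmm_cuda implementation for float8 dtypes; emulate in float32.
        return torch.matmul(input.to(torch.float32), mat2.to(torch.float32)).to(torch.int32)

# Illustrative usage (requires a CUDA device with float8 support):
if torch.cuda.is_available():
    x = torch.randn(64, 64, device="cuda").to(torch.float8_e4m3fn)
    w = torch.randn(64, 64, device="cuda").to(torch.float8_e4m3fn)
    out = safe_int_mm_sketch(x, w)  # float8 inputs take the fallback branch

Note that the sketch catches `Exception` rather than using the bare `except:` shown in the diff, which would also swallow interrupts; the diff itself is reproduced unchanged above.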