Skip to content

Commit ee1ef41

Browse files
gonnet authored and copybara-github committed
Compute the Hadamard transform via an int4 matrix multiplication.
PiperOrigin-RevId: 868190167
1 parent 96dcc54 commit ee1ef41

File tree

2 files changed

+28
-12
lines changed

2 files changed

+28
-12
lines changed

ai_edge_quantizer/transformations/insert_decomposed_hadamard_rotation.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def _update_fully_connected_consumers(
8888

8989

9090
def _make_hadamard_matrix(size: int):
91-
"""Generates a Hadamard matrix of the given size.
91+
"""Generates an unnormalized integer Hadamard matrix of the given size.
9292
9393
Args:
9494
size: The size of the Hadamard matrix. Must be a power of 2. This represents
@@ -103,12 +103,12 @@ def _make_hadamard_matrix(size: int):
103103
"""
104104
if size <= 0 or (size & (size - 1)) != 0:
105105
raise ValueError('Hadamard matrix size must be a power of 2. ')
106-
h = h2 = np.array([[1, 1], [1, -1]])
106+
h = h2 = np.array([[1, 1], [1, -1]], dtype=np.int8)
107107
current_size = 2
108108
while current_size < size:
109109
h = np.kron(h, h2)
110110
current_size *= 2
111-
return h / np.sqrt(size)
111+
return h
112112

113113

114114
def insert_decomposed_hadamard_rotation(
@@ -191,16 +191,23 @@ def insert_decomposed_hadamard_rotation(
191191
prerorate_reshape_op.outputs = [prerotate_reshape_output_tensor_id]
192192

193193
# Generate hadamard_matrix(hadamard_size).
194-
# We could quantize this to INT4 for better memory efficiency, but for large
195-
# models the memory overhead is not significant, and floating point
194+
# We quantize the Hadamard matrix to INT4 for better memory efficiency; the
195+
# 1/sqrt(size) normalization is applied through the tensor's quantization
196+
# scale rather than baked into a float32 matrix.
197197
hadamard_matrix = _make_hadamard_matrix(hadamard_size)
198198
hadamard_matrix_tensor_id = transformation_utils.add_new_constant_tensor(
199-
tensor.name + b'_hadamard_matrix',
200-
hadamard_matrix.astype(np.float32),
201-
schema_py_generated.TensorType.FLOAT32,
202-
transformation_input.subgraph,
203-
transformation_input.buffers,
199+
tensor_name=tensor.name + b'_hadamard_matrix',
200+
data=transformation_utils.pack_data(
201+
bitwidth=4, flattened_data=hadamard_matrix.flatten()
202+
),
203+
tensor_type=schema_py_generated.TensorType.INT4,
204+
subgraph=transformation_input.subgraph,
205+
buffers=transformation_input.buffers,
206+
tensor_shape=hadamard_matrix.shape,
207+
quantization=schema_py_generated.QuantizationParametersT(
208+
scale=np.array([1.0 / np.sqrt(hadamard_size)], dtype=np.float32),
209+
zeroPoint=[0],
210+
),
204211
)
205212

206213
# Insert x' = tfl.fully_connected(x', hadamard_matrix)

ai_edge_quantizer/transformations/transformation_utils.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ def add_new_constant_tensor(
140140
buffers: list[schema_py_generated.BufferT],
141141
tensor_shape: Optional[list[int]] = None,
142142
force_duplicate_buffer: bool = False,
143+
quantization: schema_py_generated.QuantizationParametersT | None = None,
143144
) -> int:
144145
"""Add a new constant tensor to the model.
145146
@@ -153,6 +154,8 @@ def add_new_constant_tensor(
153154
data will be used.
154155
force_duplicate_buffer: Whether to add a new buffer even if the same buffer
155156
already exists.
157+
quantization: Optional `QuantizationParametersT` describing the quantization
158+
of this tensor.
156159
157160
Returns:
158161
The index of the new tensor in the subgraph.
@@ -166,6 +169,7 @@ def add_new_constant_tensor(
166169
new_tensor.buffer = new_buffer_id
167170
new_tensor.type = tensor_type
168171
new_tensor.name = tensor_name
172+
new_tensor.quantization = quantization
169173
new_tensor_id = len(subgraph.tensors)
170174
subgraph.tensors.append(new_tensor)
171175
return new_tensor_id
@@ -176,6 +180,7 @@ def add_new_activation_tensor(
176180
shape: list[int],
177181
tensor_type: schema_py_generated.TensorType,
178182
subgraph: schema_py_generated.SubGraphT,
183+
quantization: schema_py_generated.QuantizationParametersT | None = None,
179184
) -> int:
180185
"""Add a new activation tensor to the model.
181186
@@ -184,6 +189,8 @@ def add_new_activation_tensor(
184189
shape: The shape of the new tensor.
185190
tensor_type: The type of the new tensor.
186191
subgraph: The subgraph where the new tensor is added.
192+
quantization: Optional `QuantizationParametersT` describing the quantization
193+
of this tensor.
187194
188195
Returns:
189196
The index of the new tensor in the subgraph.
@@ -199,6 +206,7 @@ def add_new_activation_tensor(
199206
new_tensor.shape = shape
200207
new_tensor.type = tensor_type
201208
new_tensor.name = tensor_name
209+
new_tensor.quantization = quantization
202210
new_tensor.buffer = 0
203211
new_tensor_id = len(subgraph.tensors)
204212
subgraph.tensors.append(new_tensor)
@@ -226,8 +234,9 @@ def pack_data(bitwidth: int, flattened_data: np.ndarray) -> np.ndarray:
226234
Packed data.
227235
"""
228236
if bitwidth == 4:
229-
even_data = flattened_data[::2] & 0x0F
230-
odd_data = np.left_shift(flattened_data[1::2], 4).astype(np.uint8)
237+
flattened_data = np.bitwise_and(flattened_data.astype(np.uint8), 0x0F)
238+
even_data = flattened_data[::2]
239+
odd_data = np.left_shift(flattened_data[1::2], 4)
231240
if odd_data.shape[0] == even_data.shape[0] - 1:
232241
odd_data = np.pad(odd_data, (0, 1), constant_values=0)
233242
return np.bitwise_or(even_data, odd_data)

0 commit comments

Comments (0)