
Commit 2c467dd

larryliu0820 authored and facebook-github-bot committed
Fix quantized embedding export logic (#3095)
Summary: Add patches to make 4-bit quantized embedding work for export. Fixed:

* Schema mismatch between the functional embedding_4bit and its out variant
* Set `packed=True` for 4-bit quantization

Pull Request resolved: #3095
Reviewed By: mikekgfb
Differential Revision: D56340670
Pulled By: larryliu0820
fbshipit-source-id: c98623a9b7633fc5a6c390be1557213c719fa95a
1 parent: cf78107

File tree

2 files changed (+5, −2 lines)


examples/models/llama2/export_llama_lib.py (+4, −1)

@@ -614,7 +614,10 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
         bitwidth = int(bitwidth)
         transforms.append(
             lambda model: EmbeddingQuantHandler(
-                model, bitwidth=bitwidth, group_size=group_size
+                model,
+                bitwidth=bitwidth,
+                group_size=group_size,
+                packed=(bitwidth == 4),
             ).quantized_model()
         )
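Passing `packed=True` for 4-bit weights means two quantized values share one byte, halving the embedding table's storage. Below is a minimal, self-contained sketch of that nibble-packing idea; the helpers `pack_4bit` and `unpack_4bit` are hypothetical names for illustration, not ExecuTorch APIs.

import torch

def pack_4bit(q: torch.Tensor) -> torch.Tensor:
    # q holds 4-bit values in [0, 15]; the last dimension must be even.
    q = q.to(torch.uint8)
    # Even-indexed values go into the high nibble, odd-indexed into the low.
    return (q[..., ::2] << 4) | q[..., 1::2]

def unpack_4bit(p: torch.Tensor) -> torch.Tensor:
    hi = (p >> 4) & 0xF
    lo = p & 0xF
    # Interleave the nibbles back into their original order.
    return torch.stack([hi, lo], dim=-1).flatten(-2)

q = torch.randint(0, 16, (2, 8))
assert torch.equal(unpack_4bit(pack_4bit(q)).to(q.dtype), q)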

exir/passes/_quant_patterns_and_replacements.py (+1, −1)

@@ -189,7 +189,7 @@ def embedding_byte_dtype_out_meta(
 
 quantized_decomposed_lib.define(
     "embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
-    "int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)",
+    "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
 )
 
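In op schemas, arguments after `*` are keyword-only, so this change moves `dtype` behind the `*` to make the out variant's signature line up with the functional embedding_4bit schema; a mismatch between the two breaks pattern replacement during export. A minimal sketch of defining such a schema with `torch.library` follows; the library name `example_q` is hypothetical and stands in for the real `quantized_decomposed_lib` inside ExecuTorch.

from torch.library import Library

# Hypothetical library namespace, for illustration only.
example_q = Library("example_q", "DEF")

# Everything after `*` (dtype, out) must be passed by keyword,
# mirroring the corrected out-variant schema above.
example_q.define(
    "embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, "
    "Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, "
    "Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)"
)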
