Qualcomm AI Engine Direct - [DO NOT MERGE] PTE size and Inference Speed Verification #7569

Closed
1 change: 1 addition & 0 deletions backends/qualcomm/utils/utils.py
@@ -823,6 +823,7 @@ def generate_multi_graph_program(
)
assert qnn_mgr.Init().value == 0, "failed to load processed bytes"
binary_info = bytes(qnn_mgr.Compile())
print("Checking the size of QNN binary info: ", len(binary_info))
assert len(binary_info) != 0, "failed to generate QNN context binary"
graph_names = qnn_mgr.GetGraphNames()
for graph_name in graph_names:
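For context, the print added above reports the raw QNN context-binary size, which can be cross-checked against the exported .pte artifact. A minimal sketch of that cross-check, assuming a pte_path produced by the export step (the helper name and path are illustrative, not part of this patch):

import os

def report_artifact_sizes(binary_info: bytes, pte_path: str) -> None:
    # The .pte embeds the QNN context binary plus program metadata,
    # so it should be at least as large as the binary it wraps.
    qnn_mib = len(binary_info) / 2**20
    pte_mib = os.path.getsize(pte_path) / 2**20
    print(f"QNN context binary: {qnn_mib:.2f} MiB")
    print(f"PTE file: {pte_mib:.2f} MiB")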
28 changes: 27 additions & 1 deletion examples/qualcomm/oss_scripts/llama/llama.py
@@ -494,7 +494,8 @@ def compile(args, pte_filename, tokenizer):
annotate_linear_16a8w_in_affine_layer,
)
if args.ptq is not None:
kv_quant_attrs = {}
import hashlib
kv_quant_attrs, parameter_hash = {}, []
for i, llama_instance in enumerate(llama_instance_list):
llama_instance.quantize(
quant_dtype=quant_dtype,
@@ -517,6 +518,31 @@
kv_quant_attrs=kv_quant_attrs,
),
)

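# Fingerprint every named buffer of this instance so the prefill and
# decode models can be compared tensor-by-tensor below.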
tensor_to_md5 = {}
for name, buffer in llama_instance.llama_model.named_buffers():
md5_buffer = hashlib.md5(buffer.numpy().tobytes()).hexdigest()
if md5_buffer in tensor_to_md5:
tensor_to_md5[md5_buffer].append(name)
else:
tensor_to_md5[md5_buffer] = [name]
parameter_hash.append(tensor_to_md5)

# check tensors in prefill & decode are exactly the same
assert len(parameter_hash[0]) == len(parameter_hash[1])
num_keys = len(parameter_hash[0])
# Remove common keys from both dictionaries
for key in set(parameter_hash[0]).intersection(set(parameter_hash[1])):
del parameter_hash[0][key]
del parameter_hash[1][key]
print(f"{num_keys - len(parameter_hash[0])} / {num_keys} tensors are matched")

for names in parameter_hash[0].values(): # kv
print(f"KV buffers {names} cannot find a match")
for names in parameter_hash[1].values(): # prefill
print(f"Prefill buffers {names} cannot find a match")


end_quantize_ts = time.time()
logging.info(f"Time for quantizing: {end_quantize_ts - start_quantize_ts}")

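The hashing check above generalizes to any pair of modules. A self-contained sketch of the same technique, comparing the named buffers of two torch.nn.Module instances by MD5 digest (the helper names are illustrative, not part of this patch):

import hashlib
from collections import defaultdict
from torch import nn

def buffer_digests(model: nn.Module) -> dict:
    # Map md5(tensor bytes) -> list of buffer names sharing that content.
    digests = defaultdict(list)
    for name, buf in model.named_buffers():
        data = buf.detach().cpu().numpy().tobytes()
        digests[hashlib.md5(data).hexdigest()].append(name)
    return digests

def compare_buffers(a: nn.Module, b: nn.Module) -> None:
    da, db = buffer_digests(a), buffer_digests(b)
    shared = set(da) & set(db)
    print(f"{len(shared)} / {len(da)} digests matched")
    for digest in set(da) - shared:
        print(f"only in first model: {da[digest]}")
    for digest in set(db) - shared:
        print(f"only in second model: {db[digest]}")

Called on the two llama_model instances (e.g. the prefill and decode entries of llama_instance_list), this reproduces the match report printed above.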
8 changes: 4 additions & 4 deletions examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -404,10 +404,10 @@ Error Runner::generate(
token_callback(piece_res.get().c_str());
}

if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) {
ET_LOG(Info, "\nReached to the end of generation");
break;
}
// if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) {
// ET_LOG(Info, "\nReached to the end of generation");
// break;
// }
}
};

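Commenting out the EOS early-exit means every run decodes the full sequence length, so throughput numbers are measured over an identical workload regardless of where the model would naturally stop. A rough sketch of the measurement logic, in Python for brevity (decode_step is an assumed stand-in for one runner iteration):

import time

def measure_throughput(decode_step, num_tokens: int) -> float:
    # With the EOS break disabled, num_tokens is fixed (e.g. seq_len),
    # so elapsed time maps to a comparable tokens-per-second figure.
    start = time.perf_counter()
    for _ in range(num_tokens):
        decode_step()
    return num_tokens / (time.perf_counter() - start)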
2 changes: 2 additions & 0 deletions examples/qualcomm/utils.py
@@ -167,6 +167,8 @@ def execute(self, custom_runner_cmd=None, method_index=0):
)
else:
qnn_executor_runner_cmds = custom_runner_cmd

print("Execution command is: ", qnn_executor_runner_cmds)

self._adb(["shell", f"{qnn_executor_runner_cmds}"])

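Echoing the full runner command makes a failed run easy to replay by hand. A hypothetical replay from a separate shell session, assuming adb is on PATH and the printed command was copied verbatim (the command string below is a placeholder, not output from this patch):

import subprocess

# `printed_cmd` stands for whatever "Execution command is: ..." printed.
printed_cmd = "cd /data/local/tmp && ./qnn_executor_runner --model_path model.pte"  # placeholder
subprocess.run(["adb", "shell", printed_cmd], check=True)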