
Qualcomm AI Engine Direct - Meta CI for Mobilebert, W2L, and Llama #8616


Merged
9 changes: 8 additions & 1 deletion .ci/scripts/test_model.sh
@@ -164,6 +164,7 @@ test_model_with_qnn() {
export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
export PYTHONPATH=$EXECUTORCH_ROOT/..

+EXTRA_FLAGS=""
if [[ "${MODEL_NAME}" == "dl3" ]]; then
EXPORT_SCRIPT=deeplab_v3
elif [[ "${MODEL_NAME}" == "mv3" ]]; then
@@ -176,6 +177,12 @@ test_model_with_qnn() {
EXPORT_SCRIPT=inception_v3
elif [[ "${MODEL_NAME}" == "vit" ]]; then
EXPORT_SCRIPT=torchvision_vit
+elif [[ "${MODEL_NAME}" == "mb" ]]; then
+EXPORT_SCRIPT=mobilebert_fine_tune
+EXTRA_FLAGS="--num_epochs 1"
+pip install scikit-learn
+elif [[ "${MODEL_NAME}" == "w2l" ]]; then
+EXPORT_SCRIPT=wav2letter
elif [[ "${MODEL_NAME}" == "edsr" ]]; then
EXPORT_SCRIPT=edsr
# Additional deps for edsr
@@ -189,7 +196,7 @@ test_model_with_qnn() {
# TODO(guangyang): Make QNN chipset match the target device
QNN_CHIPSET=SM8450

"${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only
"${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only $EXTRA_FLAGS
EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
}

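For reference, CI exercises the new branches through test_model.sh. A rough local equivalent is sketched below; the positional arguments (model name, build tool, backend) and the SDK path are assumptions based on how the script is invoked elsewhere, not confirmed by this diff.

# Hedged sketch: argument order and paths are assumptions.
export QNN_SDK_ROOT=/path/to/qnn-sdk          # illustrative SDK location
bash .ci/scripts/test_model.sh mb cmake qnn   # MobileBert; EXTRA_FLAGS adds --num_epochs 1
bash .ci/scripts/test_model.sh w2l cmake qnn  # Wav2Letter; no extra flags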
2 changes: 1 addition & 1 deletion .github/workflows/trunk.yml
@@ -311,7 +311,7 @@ jobs:
strategy:
matrix:
dtype: [fp32]
-model: [dl3, mv3, mv2, ic4, ic3, vit]
+model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l]
fail-fast: false
with:
runner: linux.2xlarge
22 changes: 10 additions & 12 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -73,7 +73,7 @@
from executorch.examples.models.mobilenet_v3 import MV3Model
from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel

-# from executorch.examples.models.wav2letter import Wav2LetterModel
+from executorch.examples.models.wav2letter import Wav2LetterModel
from executorch.exir import to_edge
from executorch.exir.backend.backend_api import disable_validation
from executorch.exir.passes import PassManager
@@ -907,8 +907,7 @@ def test_qnn_backend_example_models(self):
# Fails during lowering. Reopen once resolved.
# MobileBertModelExample(),
# TorchVisionViTModel(),
-# Encountered undefined symbol in mainline. Reopen once resolved.
-# Wav2LetterModel(),
+Wav2LetterModel(),
]
expected_partitions = [
1,
@@ -917,8 +916,8 @@
1,
1,
1,
-1,
-1,
+# 1,
+# 1,
1,
]
# TODO: Triggers "maximum recursion depth exceeded"; needs investigation.
@@ -1962,12 +1961,11 @@ def test_qnn_backend_example_models(self):
QCOM_ANNOTATION: (),
QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
},
-# Encountered undefined symbol in mainline. Reopen once resolved.
-# {
-# QCOM_MODULE: Wav2LetterModel(),
-# QCOM_ANNOTATION: (),
-# QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
-# },
+{
+QCOM_MODULE: Wav2LetterModel(),
+QCOM_ANNOTATION: (),
+QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
+},
]
expected_partitions = [
1,
@@ -1979,7 +1977,7 @@
# For MobileBertModelExample
# 1,
1,
-# 1, For Wav2LetterModel
+1,
]
# TODO: Triggers "maximum recursion depth exceeded"; needs investigation.
disable_validation()
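To re-run just the re-enabled Wav2Letter coverage, an invocation along these lines should work; the test class names (TestQNNFloatingPointModel, TestQNNQuantizedModel) and the device/SoC/build flags the suite normally requires are assumptions based on the backend's test conventions, not confirmed by this diff.

# Hedged sketch: class names are assumptions; device/SoC/build flags
# (serial, chipset, build dir) may be required and are omitted here.
python backends/qualcomm/tests/test_qnn_delegate.py \
  TestQNNFloatingPointModel.test_qnn_backend_example_models
python backends/qualcomm/tests/test_qnn_delegate.py \
  TestQNNQuantizedModel.test_qnn_backend_example_models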
5 changes: 4 additions & 1 deletion examples/qualcomm/oss_scripts/llama/llama.py
@@ -843,6 +843,7 @@ def post_process():
)

runner_cmd = ""
+performance_output_path = "outputs/inference_speed.txt"
if args.enable_x86_64:
# x86 emulator is intended for CI and not performance. Check only the first few tokens.
seq_len = min(seq_len, 16)
@@ -862,6 +863,7 @@ def post_process():
f"--model_path {pte_path}",
f"--seq_len {seq_len}",
f"--output_path {args.artifact}/outputs/outputs.txt",
f"--performance_output_path {performance_output_path}",
f"--kv_updater ShiftPointer",
runner_args,
]
@@ -882,6 +884,7 @@ def post_process():
f"--model_path {pte_filename}.pte",
f"--seq_len {seq_len}",
"--output_path outputs/outputs.txt",
f"--performance_output_path {performance_output_path}",
f"--kv_updater {'SmartMask' if args.kv_updater == smart_mask_updater else 'ShiftPointer'}",
runner_args,
]
@@ -905,7 +908,7 @@ def post_process():
adb.pull(output_path=args.artifact, callback=post_process)
if args.ip and args.port != -1:
inference_speed = 0
with open(f"{args.artifact}/outputs/inference_speed.txt", "r") as f:
with open(f"{args.artifact}/{performance_output_path}", "r") as f:
inference_speed = float(f.read())

pte_size = os.path.getsize(pte_path)
5 changes: 5 additions & 0 deletions examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -30,6 +30,10 @@ DEFINE_string(
output_path,
"outputs.txt",
"Executorch inference data output path.");
+DEFINE_string(
+performance_output_path,
+"inference_speed.txt",
+"Records inference speed. For CI purposes.");
DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
DEFINE_string(
@@ -63,6 +67,7 @@ int main(int argc, char** argv) {
example::Runner runner(
{FLAGS_model_path},
FLAGS_tokenizer_path.c_str(),
+FLAGS_performance_output_path.c_str(),
FLAGS_logits_scale,
FLAGS_logits_offset,
FLAGS_temperature,
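With the new gflags option wired in, an on-device invocation might look like the sketch below; all file paths are placeholders, and only flags visible in this diff are used.

# Hedged sketch: paths are placeholders.
./qnn_llama_runner \
  --model_path llama_qnn.pte \
  --tokenizer_path tokenizer.bin \
  --prompt "The answer to the ultimate question is" \
  --seq_len 128 \
  --output_path outputs/outputs.txt \
  --performance_output_path outputs/inference_speed.txt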
15 changes: 11 additions & 4 deletions examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -34,13 +34,16 @@ namespace example {

namespace {
static constexpr auto kTopp = 0.9f;
void printReport(const Runner::Stats& stats);
void printReport(
const Runner::Stats& stats,
const std::string& performance_output_path);
std::string statsToJsonString(const Runner::Stats& stats);
} // namespace

Runner::Runner(
const std::vector<std::string>& models_path,
const std::string& tokenizer_path,
const std::string& performance_output_path,
const float logits_scale,
const int32_t logits_offset,
const float temperature,
@@ -49,6 +52,7 @@ Runner::Runner(
: n_bos_(1),
n_eos_(1),
tokenizer_path_(tokenizer_path),
+performance_output_path_(performance_output_path),
logits_scale_(logits_scale),
logits_offset_(logits_offset),
temperature_(temperature),
@@ -437,7 +441,7 @@ Error Runner::generate(

stats_.num_prompt_tokens = num_prompt_tokens;
stats_.num_generated_tokens = pos - num_prompt_tokens;
-printReport(stats_);
+printReport(stats_, performance_output_path_);
if (stats_callback) {
stats_callback(stats_);
}
@@ -446,7 +450,9 @@
}

namespace {
void printReport(const Runner::Stats& stats) {
void printReport(
const Runner::Stats& stats,
const std::string& performance_output_path) {
printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str());

ET_LOG(
@@ -507,7 +513,8 @@ void printReport(const Runner::Stats& stats) {

// For now, we just print the total inference time for CI, can save more info
// in future if needed.
std::ofstream outfile("outputs/inference_speed.txt");

std::ofstream outfile(performance_output_path.c_str());
if (outfile.is_open()) {
double num_tok = (stats.num_generated_tokens) /
(double)(stats.inference_end_ms - stats.inference_start_ms) *
2 changes: 2 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -29,6 +29,7 @@ class Runner {
explicit Runner(
const std::vector<std::string>& models_path,
const std::string& tokenizer_path,
const std::string& performance_output_path_,
const float logits_scale,
const int32_t logits_offset,
const float temperature,
@@ -101,6 +102,7 @@ class Runner {
const int32_t n_eos_;
std::vector<std::shared_ptr<executorch::extension::Module>> modules_;
std::string tokenizer_path_;
+std::string performance_output_path_;
float logits_scale_;
int32_t logits_offset_;
float temperature_;
9 changes: 8 additions & 1 deletion examples/qualcomm/scripts/mobilebert_fine_tune.py
@@ -169,7 +169,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size):
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

-epochs = 5
+epochs = args.num_epochs
dataloader_train = DataLoader(
dataset_train,
sampler=RandomSampler(dataset_train),
@@ -366,6 +366,13 @@ def calibrator(gm):
type=str,
)

+parser.add_argument(
+"--num_epochs",
+help="Number of epochs to fine-tune the model when no pretrained weights are provided",
+default=5,
+type=int,
+)

parser.add_argument(
"-F",
"--use_fp16",
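Combined with the CI script change above, the MobileBert compile-only path presumably reduces to something like the following; the -b and -m values mirror test_model.sh and are illustrative rather than prescribed.

# Hedged sketch: -b/-m values are illustrative.
pip install scikit-learn   # fine-tuning dependency, as installed by CI
python -m examples.qualcomm.scripts.mobilebert_fine_tune \
  -b cmake-out -m SM8450 --compile_only --num_epochs 1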
22 changes: 17 additions & 5 deletions examples/qualcomm/scripts/wav2letter.py
@@ -5,6 +5,7 @@
# LICENSE file in the root directory of this source tree.

import json
+import logging
import os
import sys
from multiprocessing.connection import Client
@@ -111,7 +112,12 @@ def main(args):
# target labels " abcdefghijklmnopqrstuvwxyz'*"
instance.vocab_size = 29
model = instance.get_eager_model().eval()
-model.load_state_dict(torch.load(args.pretrained_weight, weights_only=True))
+if args.pretrained_weight:
+model.load_state_dict(torch.load(args.pretrained_weight, weights_only=True))
+else:
+logging.warning(
+"It is strongly recommended to provide pretrained weights; otherwise accuracy will be poor. This option exists mainly for CI, to ensure compilation succeeds."
+)

# convert conv1d to conv2d in nn.Module level will only introduce 2 permute
# nodes around input & output, which is more quantization friendly.
@@ -128,9 +134,15 @@

# retrieve dataset, will take some time to download
data_num = 100
-inputs, targets, input_list = get_dataset(
-data_size=data_num, artifact_dir=args.artifact
-)
+if args.compile_only:
+inputs = [(torch.rand(1, 1, 700, 1),)]
+logging.warning(
+"With compile_only, accuracy will be poor due to insufficient calibration data for quantization."
+)
+else:
+inputs, targets, input_list = get_dataset(
+data_size=data_num, artifact_dir=args.artifact
+)
pte_filename = "w2l_qnn"
build_executorch_binary(
model,
@@ -212,7 +224,7 @@ def main(args):
),
default=None,
type=str,
-required=True,
+required=False,
)

args = parser.parse_args()
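The equivalent Wav2Letter CI path now runs without pretrained weights, using the dummy input above only to verify that export and compilation succeed; again, the -b and -m values are illustrative.

# Hedged sketch: -b/-m values are illustrative; accuracy is not meaningful here.
python -m examples.qualcomm.scripts.wav2letter \
  -b cmake-out -m SM8450 --compile_only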