diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index 054ac02bc07..8143f9ea9a4 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -164,6 +164,7 @@ test_model_with_qnn() {
   export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
   export PYTHONPATH=$EXECUTORCH_ROOT/..
 
+  EXTRA_FLAGS=""
   if [[ "${MODEL_NAME}" == "dl3" ]]; then
     EXPORT_SCRIPT=deeplab_v3
   elif [[ "${MODEL_NAME}" == "mv3" ]]; then
@@ -176,6 +177,12 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=inception_v3
   elif [[ "${MODEL_NAME}" == "vit" ]]; then
     EXPORT_SCRIPT=torchvision_vit
+  elif [[ "${MODEL_NAME}" == "mb" ]]; then
+    EXPORT_SCRIPT=mobilebert_fine_tune
+    EXTRA_FLAGS="--num_epochs 1"
+    pip install scikit-learn
+  elif [[ "${MODEL_NAME}" == "w2l" ]]; then
+    EXPORT_SCRIPT=wav2letter
   elif [[ "${MODEL_NAME}" == "edsr" ]]; then
     EXPORT_SCRIPT=edsr
     # Additional deps for edsr
@@ -189,7 +196,7 @@ test_model_with_qnn() {
 
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450
 
-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only $EXTRA_FLAGS
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 410e95d9a84..49fd08591a7 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -311,7 +311,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        model: [dl3, mv3, mv2, ic4, ic3, vit]
+        model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l]
       fail-fast: false
     with:
       runner: linux.2xlarge
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index ad00d58fb85..986243d7a9c 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -73,7 +73,7 @@
 from executorch.examples.models.mobilenet_v3 import MV3Model
 from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel
 
-# from executorch.examples.models.wav2letter import Wav2LetterModel
+from executorch.examples.models.wav2letter import Wav2LetterModel
 from executorch.exir import to_edge
 from executorch.exir.backend.backend_api import disable_validation
 from executorch.exir.passes import PassManager
@@ -907,8 +907,7 @@ def test_qnn_backend_example_models(self):
             # Fail during lowering Reopen once resolved
             # MobileBertModelExample(),
             # TorchVisionViTModel(),
-            # Encountered undefined symbol in mainline. Reopen once resolved.
-            # Wav2LetterModel(),
+            Wav2LetterModel(),
         ]
         expected_partitions = [
             1,
@@ -917,8 +916,8 @@ def test_qnn_backend_example_models(self):
             1,
             1,
             1,
-            1,
-            1,
+            # 1,
+            # 1,
             1,
         ]
         # TODO: Due to trigger maximum recursion depth exceeded, need to check it.
@@ -1962,12 +1961,11 @@ def test_qnn_backend_example_models(self):
                 QCOM_ANNOTATION: (),
                 QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
             },
-            # Encountered undefined symbol in mainline. Reopen once resolved.
-            # {
-            #     QCOM_MODULE: Wav2LetterModel(),
-            #     QCOM_ANNOTATION: (),
-            #     QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
-            # },
+            {
+                QCOM_MODULE: Wav2LetterModel(),
+                QCOM_ANNOTATION: (),
+                QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
+            },
         ]
         expected_partitions = [
             1,
@@ -1979,7 +1977,7 @@ def test_qnn_backend_example_models(self):
             # For MobileBertModelExample
             # 1,
             1,
-            # 1, For Wav2LetterModel
+            1,
         ]
         # TODO: Due to trigger maximum recursion depth exceeded, need to check it.
         disable_validation()
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index 9cad2499730..0829d99d57a 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -843,6 +843,7 @@ def post_process():
         )
 
     runner_cmd = ""
+    performance_output_path = "outputs/inference_speed.txt"
    if args.enable_x86_64:
         # x86 emulator is intended for CI and not performance. Check only the first few tokens.
         seq_len = min(seq_len, 16)
@@ -862,6 +863,7 @@ def post_process():
                 f"--model_path {pte_path}",
                 f"--seq_len {seq_len}",
                 f"--output_path {args.artifact}/outputs/outputs.txt",
+                f"--performance_output_path {performance_output_path}",
                 f"--kv_updater ShiftPointer",
                 runner_args,
             ]
@@ -882,6 +884,7 @@ def post_process():
                 f"--model_path {pte_filename}.pte",
                 f"--seq_len {seq_len}",
                 "--output_path outputs/outputs.txt",
+                f"--performance_output_path {performance_output_path}",
                 f"--kv_updater {'SmartMask' if args.kv_updater == smart_mask_updater else 'ShiftPointer'}",
                 runner_args,
             ]
@@ -905,7 +908,7 @@ def post_process():
     adb.pull(output_path=args.artifact, callback=post_process)
     if args.ip and args.port != -1:
         inference_speed = 0
-        with open(f"{args.artifact}/outputs/inference_speed.txt", "r") as f:
+        with open(f"{args.artifact}/{performance_output_path}", "r") as f:
             inference_speed = float(f.read())
 
         pte_size = os.path.getsize(pte_path)
diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
index 0a1635223e6..ab717aba9f8 100644
--- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -30,6 +30,10 @@ DEFINE_string(
     output_path,
     "outputs.txt",
     "Executorch inference data output path.");
+DEFINE_string(
+    performance_output_path,
+    "inference_speed.txt",
+    "Records inference speed. For CI purposes.");
 DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
 DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
 DEFINE_string(
@@ -63,6 +67,7 @@ int main(int argc, char** argv) {
   example::Runner runner(
       {FLAGS_model_path},
       FLAGS_tokenizer_path.c_str(),
+      FLAGS_performance_output_path.c_str(),
       FLAGS_logits_scale,
       FLAGS_logits_offset,
       FLAGS_temperature,
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index da1997a5060..b9be77ce4db 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -34,13 +34,16 @@ namespace example {
 namespace {
 static constexpr auto kTopp = 0.9f;
 
-void printReport(const Runner::Stats& stats);
+void printReport(
+    const Runner::Stats& stats,
+    const std::string& performance_output_path);
 std::string statsToJsonString(const Runner::Stats& stats);
 } // namespace
 
 Runner::Runner(
     const std::vector& models_path,
     const std::string& tokenizer_path,
+    const std::string& performance_output_path,
     const float logits_scale,
     const int32_t logits_offset,
     const float temperature,
@@ -49,6 +52,7 @@ Runner::Runner(
     : n_bos_(1),
       n_eos_(1),
       tokenizer_path_(tokenizer_path),
+      performance_output_path_(performance_output_path),
       logits_scale_(logits_scale),
       logits_offset_(logits_offset),
       temperature_(temperature),
@@ -437,7 +441,7 @@ Error Runner::generate(
   stats_.num_prompt_tokens = num_prompt_tokens;
   stats_.num_generated_tokens = pos - num_prompt_tokens;
 
-  printReport(stats_);
+  printReport(stats_, performance_output_path_);
   if (stats_callback) {
     stats_callback(stats_);
   }
@@ -446,7 +450,9 @@
 }
 
 namespace {
-void printReport(const Runner::Stats& stats) {
+void printReport(
+    const Runner::Stats& stats,
+    const std::string& performance_output_path) {
   printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str());
 
   ET_LOG(
@@ -507,7 +513,8 @@ void printReport(const Runner::Stats& stats) {
 
   // For now, we just print the total inference time for CI, can save more info
   // in future if needed.
-  std::ofstream outfile("outputs/inference_speed.txt");
+
+  std::ofstream outfile(performance_output_path.c_str());
   if (outfile.is_open()) {
     double num_tok = (stats.num_generated_tokens) /
         (double)(stats.inference_end_ms - stats.inference_start_ms) *
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index e659ac55164..713a1d840ad 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -29,6 +29,7 @@ class Runner {
   explicit Runner(
       const std::vector& models_path,
       const std::string& tokenizer_path,
+      const std::string& performance_output_path,
      const float logits_scale,
      const int32_t logits_offset,
      const float temperature,
@@ -101,6 +102,7 @@ class Runner {
   const int32_t n_eos_;
   std::vector> modules_;
   std::string tokenizer_path_;
+  std::string performance_output_path_;
   float logits_scale_;
   int32_t logits_offset_;
   float temperature_;
diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py
index 4ecdaf3583f..47a489f6d52 100755
--- a/examples/qualcomm/scripts/mobilebert_fine_tune.py
+++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py
@@ -169,7 +169,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size):
     dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
     dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
 
-    epochs = 5
+    epochs = args.num_epochs
     dataloader_train = DataLoader(
         dataset_train,
         sampler=RandomSampler(dataset_train),
@@ -366,6 +366,13 @@ def calibrator(gm):
         type=str,
     )
 
+    parser.add_argument(
+        "--num_epochs",
+        help="Number of epochs used to fine-tune the model when no pretrained weights are provided",
+        default=5,
+        type=int,
+    )
+
     parser.add_argument(
         "-F",
         "--use_fp16",
diff --git a/examples/qualcomm/scripts/wav2letter.py b/examples/qualcomm/scripts/wav2letter.py
index e377c6d7e90..7f30d1865b8 100644
--- a/examples/qualcomm/scripts/wav2letter.py
+++ b/examples/qualcomm/scripts/wav2letter.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import json
+import logging
 import os
 import sys
 from multiprocessing.connection import Client
@@ -111,7 +112,12 @@ def main(args):
     # target labels " abcdefghijklmnopqrstuvwxyz'*"
     instance.vocab_size = 29
     model = instance.get_eager_model().eval()
-    model.load_state_dict(torch.load(args.pretrained_weight, weights_only=True))
+    if args.pretrained_weight:
+        model.load_state_dict(torch.load(args.pretrained_weight, weights_only=True))
+    else:
+        logging.warning(
+            "It is strongly recommended to provide pretrained weights; otherwise accuracy will be poor. This option exists mainly for CI, to verify that compilation succeeds."
+        )
 
     # convert conv1d to conv2d in nn.Module level will only introduce 2 permute
     # nodes around input & output, which is more quantization friendly.
@@ -128,9 +134,15 @@ def main(args):
 
     # retrieve dataset, will take some time to download
     data_num = 100
-    inputs, targets, input_list = get_dataset(
-        data_size=data_num, artifact_dir=args.artifact
-    )
+    if args.compile_only:
+        inputs = [(torch.rand(1, 1, 700, 1),)]
+        logging.warning(
+            "With compile_only, accuracy will be poor due to insufficient calibration data for quantization."
+        )
+    else:
+        inputs, targets, input_list = get_dataset(
+            data_size=data_num, artifact_dir=args.artifact
+        )
     pte_filename = "w2l_qnn"
     build_executorch_binary(
         model,
@@ -212,7 +224,7 @@ def main(args):
         ),
         default=None,
         type=str,
-        required=True,
+        required=False,
     )
 
     args = parser.parse_args()
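
Note: for a local smoke test of the two new CI entries, the export scripts can be invoked the same way test_model.sh does above. This is a minimal sketch, assuming the QNN environment (QNN_SDK_ROOT, LD_LIBRARY_PATH, PYTHONPATH) is already set up as in that script, that the CMake build directory is cmake-out, and that SM8450 stands in for the target chipset; none of these values are mandated by the patch itself.

    # w2l: compile-only export, pretrained weights are now optional
    python -m examples.qualcomm.scripts.wav2letter -b cmake-out -m SM8450 --compile_only
    # mb: fine-tunes for one epoch before export; scikit-learn is an extra dependency
    pip install scikit-learn
    python -m examples.qualcomm.scripts.mobilebert_fine_tune -b cmake-out -m SM8450 --compile_only --num_epochs 1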