
Commit 2ec0b8d

cccclailucylq authored and committed
Revert "Qualcomm AI Engine Direct - GA Static QWEN2.5 0.5B" (#12506)
Reverts #12054
1 parent 39313f7 commit 2ec0b8d

File tree

14 files changed: +139 additions, −373 deletions


backends/qualcomm/_passes/layout_transform.py

Lines changed: 1 addition & 1 deletion
@@ -101,8 +101,8 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.pow.Tensor_Scalar,
         exir_ops.edge.aten.prelu.default,
         exir_ops.edge.aten.repeat.default,
-        exir_ops.edge.aten.relu.default,
         exir_ops.edge.aten.round.default,
+        exir_ops.edge.aten.relu.default,
         exir_ops.edge.aten.sigmoid.default,
         exir_ops.edge.aten.split_with_sizes.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,
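Note that this hunk only reorders two entries in the pass's op collection. Assuming the entries live in a Python set (an assumption based on the pass's structure, not something the diff itself shows), the literal order is cosmetic and the swap changes nothing at runtime. A minimal sketch:

# Sketch under the assumption that the ops form a Python set:
# membership tests ignore the order entries appear in the literal.
layout_agnostic_ops = {
    "exir_ops.edge.aten.round.default",
    "exir_ops.edge.aten.relu.default",  # listed after round post-revert
}

def is_layout_agnostic(op_name: str) -> bool:
    # Set lookup: only contents matter, never literal order.
    return op_name in layout_agnostic_ops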

backends/qualcomm/quantizer/annotators.py

Lines changed: 2 additions & 4 deletions
@@ -275,9 +275,7 @@ def annotate_masked_fill(node: Node, quantization_config: QuantizationConfig) ->
     )


-@register_annotator(
-    [torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul_.Tensor]
-)
+@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor])
 def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_binary(node, quantization_config)


@@ -1300,7 +1298,7 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
     )


-@register_annotator([torch.ops.aten.zeros.default, torch.ops.aten.zeros_like.default])
+@register_annotator([torch.ops.aten.zeros.default])
 def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]) or not _is_float_tensor(node):
         return
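After the revert, torch.ops.aten.mul_.Tensor and torch.ops.aten.zeros_like.default no longer map to these annotators. For orientation, here is a minimal sketch of the registration pattern the decorator usage implies; the registry name and decorator body are assumptions, not the actual ExecuTorch implementation:

from typing import Callable, Dict, List

# Hypothetical registry: each op overload points at its annotator.
OP_TO_ANNOTATOR: Dict[object, Callable] = {}

def register_annotator(ops: List[object]) -> Callable:
    def decorator(fn: Callable) -> Callable:
        for op in ops:
            OP_TO_ANNOTATOR[op] = fn  # dropping an op from the list un-registers it
        return fn
    return decorator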

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 4 additions & 7 deletions
@@ -153,9 +153,7 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
     )


-def annotate_matmul_16a8w(  # noqa: C901
-    gm: torch.fx.GraphModule, annotate_conv=True
-) -> None:
+def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
     """
     This function is specific for matmul op 16a8w.
     For k, we will tag such as the below, and
@@ -319,10 +317,9 @@ def annotate_matmul_input1(node: Node):
             # The arguments of cat op: (the past kv cache, the new kv cache)
             node = node.args[0][1]
         elif node.target == torch.ops.aten.conv2d.default:
-            if annotate_conv:
-                annotate_conv2d(
-                    node, quantization_config=quantization_config_8a4w_per_channel
-                )
+            annotate_conv2d(
+                node, quantization_config=quantization_config_8a4w_per_channel
+            )
             break
         elif node.target in [torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor]:
             break
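The net effect on callers: the annotate_conv keyword is gone, and conv2d nodes reached during the matmul traversal are always annotated with the 8a4w per-channel config. A before/after call sketch, where gm is the torch.fx.GraphModule being annotated:

# Post-revert: the only supported call shape.
annotate_matmul_16a8w(gm)

# Pre-revert only; raises TypeError after the revert, since the
# annotate_conv keyword that let callers skip conv2d annotation was removed.
# annotate_matmul_16a8w(gm, annotate_conv=False)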

backends/qualcomm/scripts/build.sh

Lines changed: 0 additions & 8 deletions
@@ -85,7 +85,6 @@ if [ "$BUILD_AARCH64" = true ]; then
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DQNN_SDK_ROOT=$QNN_SDK_ROOT \
     -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI='arm64-v8a' \
@@ -105,9 +104,6 @@ if [ "$BUILD_AARCH64" = true ]; then
     -DANDROID_ABI='arm64-v8a' \
     -DANDROID_PLATFORM=android-30 \
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
-    -DSUPPORT_REGEX_LOOKAHEAD=ON \
-    -DBUILD_TESTING=OFF \
-    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
@@ -138,7 +134,6 @@ if [ "$BUILD_X86_64" = true ]; then
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
     -S $PRJ_ROOT \
     -B $BUILD_ROOT \
@@ -162,9 +157,6 @@ if [ "$BUILD_X86_64" = true ]; then
     -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
     -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
     -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-    -DSUPPORT_REGEX_LOOKAHEAD=ON \
-    -DBUILD_TESTING=OFF \
-    -DEXECUTORCH_ENABLE_LOGGING=ON \
     -B$EXAMPLE_ROOT

 cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 2 additions & 61 deletions
@@ -3999,7 +3999,7 @@ def test_llama3_2_1b(self):
             "16a4w",
             "--temperature",
             "0",
-            "--decoder_model",
+            "--llama_model",
             "llama3_2",
             "--model_mode",
             "hybrid",
@@ -4079,7 +4079,7 @@ def test_llama_stories_110m(self):
             "16a4w",
             "--temperature",
             "0",
-            "--decoder_model",
+            "--llama_model",
             "stories110m",
             "--model_mode",
             "hybrid",
@@ -4121,65 +4121,6 @@ def test_llama_stories_110m(self):
         if not self.compile_only and not self.enable_x86_64:
             self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai

-    def test_qwen2_5(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        prompt = "My favourite condiment is "
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            f"{prompt}",
-            "--ptq",
-            "16a8w",
-            "--decoder_model",
-            "qwen2_5",
-            "--model_mode",
-            "hybrid",
-            "--prefill_ar_len",
-            "32",
-            "--max_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-        if self.pre_gen_pte:
-            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
-
-        # Accuracy is bad for now. Just check user's prompt is returned.
-        golden_start_with = "My favourite condiment is "
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                model_out = msg["result"][0]
-                self.assertTrue(
-                    model_out.startswith(golden_start_with),
-                    f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
-                )
-                self.assertGreaterEqual(msg["inference_speed"], 95)  # Lanai


 class TestExampleOssScript(TestQNN):
     def test_albert(self):
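Besides renaming the flag back to --llama_model, this revert deletes the Qwen test, which used the suite's subprocess-plus-socket pattern: launch the example script, then receive its JSON result over a multiprocessing.connection.Listener. A stripped-down sketch of that pattern (script path and host/port values are placeholders):

import json
import subprocess
from multiprocessing.connection import Listener

ip, port = "localhost", 8080  # placeholders
cmds = ["python", "llama.py", "--ip", ip, "--port", str(port)]
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((ip, port)) as listener:
    conn = listener.accept()   # the script connects back over this socket
    p.communicate()            # wait for the script to finish
    msg = json.loads(conn.recv())  # then decode the JSON result it sent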

examples/qualcomm/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -77,8 +77,8 @@ target_include_directories(

 # add tokenizers
 add_subdirectory(
-  ${EXECUTORCH_ROOT}/extension/llm/runner
-  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/runner
+  ${EXECUTORCH_ROOT}/extension/llm/tokenizers
+  ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/tokenizers
 )

 # build qnn_executor_runner

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 0 additions & 9 deletions
@@ -4,7 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.


-
 # model sharding with custom op
 set(CUSTOM_OP_SRCS_FILE
   "${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp"
@@ -64,22 +63,14 @@ target_link_libraries(
   executorch_core
   extension_data_loader
   extension_flat_tensor
-  extension_llm_runner
   extension_module
   extension_tensor
-  tokenizers
   gflags
   custom_ops
   quantized_ops_lib
   quantized_kernels
   tokenizers
 )
-
-target_include_directories(
-  qnn_llama_runner
-  PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
-)
-
 target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
 set_target_properties(
   qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"

examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 1 addition & 2 deletions
@@ -1,11 +1,10 @@
 # Summary

 ## Overview
-This file provides you the instructions to run LLM Decoder model with different parameters via Qualcomm HTP backend. We currently support the following models:
+This file provides you the instructions to run LLAMA model with different parameters via Qualcomm HTP backend. We currently support the following models:
 1. LLAMA2 Stories 110M
 2. LLAMA3.2 1B
 3. LLAMA3.2 3B
-4. QWEN2.5 0.5B

 We offer the following modes to execute the model:
examples/qualcomm/oss_scripts/llama/hf_converter/convert_config.py

Lines changed: 0 additions & 45 deletions
This file was deleted.
