2 changes: 1 addition & 1 deletion .github/workflows/build-wheels-macos.yml
@@ -61,7 +61,7 @@ jobs:
post-script: ${{ matrix.post-script }}
package-name: ${{ matrix.package-name }}
# Meta's macOS runners do not have Xcode, so use GitHub's runners.
- runner-type: macos-latest-xlarge
+ runner-type: macos-15-xlarge
Contributor Author
torchao does not build on macOS < 15 because `std::aligned_alloc` is not implemented until macOS 15 (see the sketch after this diff hunk).

setup-miniconda: true
smoke-test-script: ${{ matrix.smoke-test-script }}
trigger-event: ${{ github.event_name }}
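For context on the comment above, here is a minimal sketch (not from this PR) of the kind of call at issue: `std::aligned_alloc` from `<cstdlib>`. On Apple toolchains, libc++ only exposes this function when the deployment target is new enough, so building it against an older SDK or target fails with an "unavailable" diagnostic, which is what the runner pin works around.

```cpp
// Minimal sketch, not from this PR: the std::aligned_alloc call referenced in
// the review comment above. On older Apple targets the compiler rejects this
// call as unavailable; on a new enough target it compiles and runs.
#include <cstdio>
#include <cstdlib>

int main() {
    // Size must be an integral multiple of the alignment (C++17 requirement).
    void* p = std::aligned_alloc(/*alignment=*/64, /*size=*/1024);
    if (p == nullptr) {
        std::puts("aligned_alloc failed");
        return 1;
    }
    std::printf("got 64-byte-aligned block at %p\n", p);
    std::free(p);
    return 0;
}
```

Pinning the runner to macos-15-xlarge keeps the build on an image whose toolchain accepts this call, per the comment above.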
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -66,6 +66,8 @@ dependencies=[
"ruamel.yaml",
"sympy",
"tabulate",
+ # Keep this version in sync with: third-party/ao pin
+ "torchao==0.10.0",
Contributor
In main, how would you depend on the latest ao changes? I'm worried that if you bump the hash in third-party/ao to a commit that may not have a dedicated release, the two pins become inconsistent and confusing.

Here's what I suggest

In main:

  • Only update third-party/ao to a hash that is consistent with 0.10.0, but do not update pyproject.toml.

This is not 100% necessary, but it's good because (1) we keep testing the CI in main and (2) it makes sure main's hash is not older than the release branch's.

In the release branch:

  • update third-party/ao to a hash that is consistent with 0.10.0.
  • update pyproject.toml to be 0.10.0
  • update the release checklist to reflect this new process of pinning torchao.

"typing-extensions",
# Keep this version in sync with: ./backends/apple/coreml/scripts/install_requirements.sh
"coremltools==8.2; platform_system == 'Darwin'",
2 changes: 1 addition & 1 deletion third-party/ao
Submodule ao updated 79 files
+3 −3 .github/workflows/build_wheels_linux.yml
+2 −0 .github/workflows/regression_test_rocm.yml
+4 −8 .github/workflows/torchao_experimental_test.yml
+13 −5 README.md
+3 −3 benchmarks/float8/float8_roofline.py
+2 −2 benchmarks/float8/profile_lowp_training.py
+1 −0 benchmarks/float8/training/README.md
+2 −1 benchmarks/float8/training/float8_training_benchmark.sh
+27 −4 benchmarks/microbenchmarks/benchmark_inference.py
+63 −8 benchmarks/microbenchmarks/benchmark_runner.py
+6 −2 benchmarks/microbenchmarks/test/benchmark_config.yml
+66 −1 benchmarks/microbenchmarks/test/test_benchmark_inference.py
+104 −0 benchmarks/microbenchmarks/test/test_benchmark_runner.py
+57 −3 benchmarks/microbenchmarks/test/test_utils.py
+41 −4 benchmarks/microbenchmarks/utils.py
+46 −2 benchmarks/mx_formats/cast_bench.py
+3 −0 dev-requirements.txt
+22 −1 test/prototype/mx_formats/test_custom_cast.py
+101 −86 test/prototype/mx_formats/test_mx_linear.py
+224 −18 test/prototype/mx_formats/test_mx_tensor.py
+2 −0 test/quantization/test_galore_quant.py
+0 −1 torchao/_models/llama/generate.py
+38 −14 torchao/csrc/cuda/fp6_llm/fp6_linear.cu
+14 −5 torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh
+6 −0 torchao/dtypes/__init__.py
+12 −8 torchao/dtypes/affine_quantized_tensor.py
+20 −0 torchao/dtypes/affine_quantized_tensor_ops.py
+10 −0 torchao/dtypes/uintx/__init__.py
+450 −0 torchao/dtypes/uintx/packed_linear_int8_dynamic_activation_intx_weight_layout.py
+52 −0 torchao/dtypes/uintx/q_dq_layout.py
+246 −160 torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h
+15 −14 torchao/experimental/kernels/cpu/aarch64/embedding/embedding.h
+72 −23 torchao/experimental/kernels/cpu/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h
+297 −0 ...r/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h
+7 −172 ...els/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h
+7 −165 ...els/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h
+47 −177 ...els/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h
+8 −8 ...erimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h
+186 −11 .../experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h
+0 −193 torchao/experimental/kernels/cpu/aarch64/linear/linear.h
+384 −0 torchao/experimental/kernels/cpu/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h
+336 −0 ...hao/experimental/kernels/cpu/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h
+275 −0 torchao/experimental/kernels/cpu/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h
+95 −0 torchao/experimental/kernels/cpu/aarch64/matmul/matmul.h
+70 −0 torchao/experimental/kernels/cpu/aarch64/matmul/matmul_utils.h
+20 −3 torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
+9 −0 torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt
+1 −0 torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh
+89 −0 torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp
+23 −14 torchao/experimental/kernels/cpu/aarch64/tests/test_embedding.cpp
+428 −223 torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp
+512 −0 torchao/experimental/kernels/cpu/aarch64/tests/test_qmatmul.cpp
+201 −28 torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h
+235 −0 torchao/experimental/kernels/cpu/aarch64/tests/test_utils_quantized_attention.h
+37 −26 torchao/experimental/kernels/cpu/aarch64/tests/test_weight_packing.cpp
+5 −8 torchao/experimental/ops/embedding_xbit/op_embedding_xbit-impl.h
+238 −0 torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h
+111 −114 torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_selector.h
+118 −298 torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
+20 −124 torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h
+46 −66 torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
+3 −3 torchao/experimental/ops/linear_8bit_act_xbit_weight/packed_weights_format.h
+137 −189 torchao/experimental/ops/tests/test_linear_8bit_act_xbit_weight.cpp
+15 −421 torchao/experimental/packed_linear_int8_dynamic_activation_intx_weight_layout.py
+8 −51 torchao/experimental/q_dq_layout.py
+16 −7 torchao/experimental/quant_api.py
+1 −0 torchao/experimental/tests/test_embedding_xbit_quantizer.py
+18 −18 torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py
+7 −18 torchao/float8/README.md
+13 −0 torchao/ops.py
+90 −68 torchao/prototype/mx_formats/README.md
+17 −0 torchao/prototype/mx_formats/__init__.py
+104 −45 torchao/prototype/mx_formats/config.py
+8 −0 torchao/prototype/mx_formats/constants.py
+315 −1 torchao/prototype/mx_formats/custom_cast.py
+99 −86 torchao/prototype/mx_formats/mx_linear.py
+3 −33 torchao/prototype/mx_formats/mx_ops.py
+164 −153 torchao/prototype/mx_formats/mx_tensor.py
+9 −20 torchao/testing/float8/roofline_utils.py