2 files changed
+2
-1
lines changedOriginal file line number | Diff line number | Diff line change | |
---|---|---|---|
| |||
66 | 66 |
| |
67 | 67 |
| |
68 | 68 |
| |
| 69 | + | |
69 | 70 |
| |
70 | 71 |
| |
71 | 72 |
| |
|
- .github/workflows/build_wheels_linux.yml+3-3
- .github/workflows/regression_test_rocm.yml+2
- .github/workflows/torchao_experimental_test.yml+4-8
- README.md+13-5
- benchmarks/float8/float8_roofline.py+3-3
- benchmarks/float8/profile_lowp_training.py+2-2
- benchmarks/float8/training/README.md+1
- benchmarks/float8/training/float8_training_benchmark.sh+2-1
- benchmarks/microbenchmarks/benchmark_inference.py+27-4
- benchmarks/microbenchmarks/benchmark_runner.py+63-8
- benchmarks/microbenchmarks/test/benchmark_config.yml+6-2
- benchmarks/microbenchmarks/test/test_benchmark_inference.py+66-1
- benchmarks/microbenchmarks/test/test_benchmark_runner.py+104
- benchmarks/microbenchmarks/test/test_utils.py+57-3
- benchmarks/microbenchmarks/utils.py+41-4
- benchmarks/mx_formats/cast_bench.py+46-2
- dev-requirements.txt+3
- test/prototype/mx_formats/test_custom_cast.py+22-1
- test/prototype/mx_formats/test_mx_linear.py+101-86
- test/prototype/mx_formats/test_mx_tensor.py+224-18
- test/quantization/test_galore_quant.py+2
- torchao/_models/llama/generate.py-1
- torchao/csrc/cuda/fp6_llm/fp6_linear.cu+38-14
- torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh+14-5
- torchao/dtypes/__init__.py+6
- torchao/dtypes/affine_quantized_tensor.py+12-8
- torchao/dtypes/affine_quantized_tensor_ops.py+20
- torchao/dtypes/uintx/__init__.py+10
- torchao/dtypes/uintx/packed_linear_int8_dynamic_activation_intx_weight_layout.py+450
- torchao/dtypes/uintx/q_dq_layout.py+52
- torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h+246-160
- torchao/experimental/kernels/cpu/aarch64/embedding/embedding.h+15-14
- torchao/experimental/kernels/cpu/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h+72-23
- torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h+297
- torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h+7-172
- torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h+7-165
- torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h+47-177
- torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h+8-8
- torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h+186-11
- torchao/experimental/kernels/cpu/aarch64/linear/linear.h-193
- torchao/experimental/kernels/cpu/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h+384
- torchao/experimental/kernels/cpu/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h+336
- torchao/experimental/kernels/cpu/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h+275
- torchao/experimental/kernels/cpu/aarch64/matmul/matmul.h+95
- torchao/experimental/kernels/cpu/aarch64/matmul/matmul_utils.h+70
- torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp+20-3
- torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt+9
- torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh+1
- torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp+89
- torchao/experimental/kernels/cpu/aarch64/tests/test_embedding.cpp+23-14
- torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp+428-223
- torchao/experimental/kernels/cpu/aarch64/tests/test_qmatmul.cpp+512
- torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h+201-28
- torchao/experimental/kernels/cpu/aarch64/tests/test_utils_quantized_attention.h+235
- torchao/experimental/kernels/cpu/aarch64/tests/test_weight_packing.cpp+37-26
- torchao/experimental/ops/embedding_xbit/op_embedding_xbit-impl.h+5-8
- torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h+238
- torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_selector.h+111-114
- torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp+118-298
- torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h+20-124
- torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h+46-66
- torchao/experimental/ops/linear_8bit_act_xbit_weight/packed_weights_format.h+3-3
- torchao/experimental/ops/tests/test_linear_8bit_act_xbit_weight.cpp+137-189
- torchao/experimental/packed_linear_int8_dynamic_activation_intx_weight_layout.py+15-421
- torchao/experimental/q_dq_layout.py+8-51
- torchao/experimental/quant_api.py+16-7
- torchao/experimental/tests/test_embedding_xbit_quantizer.py+1
- torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py+18-18
- torchao/float8/README.md+7-18
- torchao/ops.py+13
- torchao/prototype/mx_formats/README.md+90-68
- torchao/prototype/mx_formats/__init__.py+17
- torchao/prototype/mx_formats/config.py+104-45
- torchao/prototype/mx_formats/constants.py+8
- torchao/prototype/mx_formats/custom_cast.py+315-1
- torchao/prototype/mx_formats/mx_linear.py+99-86
- torchao/prototype/mx_formats/mx_ops.py+3-33
- torchao/prototype/mx_formats/mx_tensor.py+164-153
- torchao/testing/float8/roofline_utils.py+9-20
0 commit comments