note: Int8 dynamic quantization works best on compute-bound rather than memory-bound models. A relevant comparison is [SAM](https://github.com/pytorch-labs/segment-anything-fast), which is compute bound, versus Llama at batch size 1, which is memory bound. A minimal usage sketch follows.
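As a rough illustration of applying it, the snippet below quantizes a toy model. It assumes torchao's `quantize` API together with an `int8_dynamic_activation_int8_weight` config; these names may differ across torchao versions, so treat this as a sketch rather than the definitive API.

```python
# Sketch only: assumes `quantize` and `int8_dynamic_activation_int8_weight`
# are importable from torchao.quantization (names vary across versions).
import torch
from torchao.quantization import quantize, int8_dynamic_activation_int8_weight

# A compute-bound stand-in: large linear layers fed with a big batch.
model = torch.nn.Sequential(
    torch.nn.Linear(4096, 4096),
    torch.nn.ReLU(),
    torch.nn.Linear(4096, 4096),
).cuda().half()

# Swap Linear weights for int8 dynamically quantized versions in place.
quantize(model, int8_dynamic_activation_int8_weight())

# torch.compile fuses the quant/dequant ops into the surrounding kernels.
model = torch.compile(model, mode="max-autotune")
out = model(torch.randn(512, 4096, device="cuda", dtype=torch.half))
```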
* [MX](torchao/prototype/mx_formats) implements training and inference support with tensors using the [OCP MX spec](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) data types, which can be described as groupwise-scaled float8/float6/float4/int8 with the scales constrained to powers of two (a conceptual sketch follows this list). This work is a prototype because hardware support is not available yet.
* [nf4](torchao/dtypes/nf4tensor.py), which was used to [implement QLoRA](https://github.com/pytorch/torchtune/blob/main/docs/source/tutorials/qlora_finetune.rst), one of the most popular finetuning algorithms, without writing custom Triton or CUDA code (a minimal construction sketch also follows this list). An accessible talk is available [here](https://x.com/HamelHusain/status/1800315287574847701).
* [fp6](torchao/prototype/quant_llm/) for 2x faster inference over fp16 with an easy-to-use API: `quantize(model, fp6_llm_weight_only())`
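To make the MX description above concrete, here is a small pure-PyTorch sketch of groupwise scaling with power-of-two scales targeting float8. It only illustrates the idea; it is not the prototype's actual implementation, and the group size and float8 e4m3 target are illustrative assumptions.

```python
import torch

def groupwise_po2_scale(x: torch.Tensor, group_size: int = 32):
    """Illustrative only: per-group power-of-two scaling into float8 e4m3.
    Assumes x.numel() is a multiple of group_size."""
    groups = x.reshape(-1, group_size)
    amax = groups.abs().amax(dim=1, keepdim=True).clamp(min=1e-12)
    # Constrain scales to powers of two, as the MX spec requires, so that
    # each group's max magnitude lands inside the e4m3 range (max ~448).
    scales = torch.exp2(torch.floor(torch.log2(448.0 / amax)))
    quantized = (groups * scales).to(torch.float8_e4m3fn)
    return quantized, scales  # dequantize with quantized.float() / scales

x = torch.randn(4, 1024)
q, s = groupwise_po2_scale(x)
x_hat = (q.float() / s).reshape(x.shape)  # low-precision reconstruction
```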
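For the nf4 entry above, here is a minimal sketch of constructing an NF4 tensor from a weight. It assumes the `to_nf4` helper in `torchao.dtypes.nf4tensor` and uses illustrative block sizes; check the module for the exact signature in your torchao version.

```python
import torch
# Assumed helper: `to_nf4` from torchao.dtypes.nf4tensor (verify in your version).
from torchao.dtypes.nf4tensor import to_nf4

weight = torch.randn(4096, 4096, dtype=torch.bfloat16)
# Block sizes are illustrative. The NF4 tensor stores 4-bit codes plus
# per-block scales, which is what makes QLoRA-style finetuning memory-cheap.
nf4_weight = to_nf4(weight, block_size=64, scaler_block_size=256)
print(type(nf4_weight), nf4_weight.shape)
```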
## Composability
A key design principle for us is composability: any new dtype or layout we provide needs to work with our compiler, `torch.compile`.
### Installation
`torchao` makes liberal use of several new features in PyTorch, so it is recommended to use it with the current nightly or the latest stable release of PyTorch.

Stable release from PyPI, which defaults to CUDA 12.1
```Shell
pip install torchao
```
Stable release from the PyTorch index
```Shell
pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124
```
Nightly Release
* [GaLore](torchao/prototype/galore/), a drop-in replacement for the Adam optimizer that lets you finetune Llama 7B on a single 4090 card, with up to 70% speedups relative to eager PyTorch
* [DoRA](torchao/prototype/dora), a newer replacement for QLoRA with more promising convergence characteristics
* [Fused int4/fp16 Quant Matmul](torchao/prototype/hqq), which is particularly useful for compute-bound workloads, showing 4x speedups over tinygemm for larger batch sizes such as 512
* [gau-nernst](https://github.com/gau-nernst) with fp6 kernels that are 4x faster than fp16: [torchao/prototype/quant_llm](torchao/prototype/quant_llm)
* [vayuda](https://github.com/vayuda) with generic bitpacking kernels that were code-generated using pure PyTorch: [prototype/common](torchao/prototype/common)
* [andreaskoepf](https://github.com/andreaskoepf) and [melvinebenezer](https://github.com/melvinebenezer) with [1-bit LLMs](torchao/prototype/dtypes): BitNet 1.58 bitpacked into uint2 and fully code-generated with torch.compile
## Blogs and Videos
* [Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity](https://pytorch.org/blog/accelerating-neural-network-training/)
0 commit comments