benchmarks/microbenchmarks/test/test_benchmark_inference.py (31 changes: 0 additions & 31 deletions)
@@ -49,37 +49,6 @@ def test_run_inference(self, mock_string_to_config):
hasattr(result, "quantized_model_compiled_inference_time_in_ms")
)

@patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
"""Test running inference with sparsity configurations"""
# Mock string_to_config to return valid configs
from torchao.dtypes import MarlinSparseLayout
from torchao.quantization import Int4WeightOnlyConfig

# Test with semi-sparse config
mock_string_to_config.return_value = Int4WeightOnlyConfig(
layout=MarlinSparseLayout(),
version=1,
)
config = BenchmarkConfig(
quantization="marlin",
sparsity="semi-sparse",
params={
"high_precision_dtype": "torch.float32",
"device": "cpu",
"model_type": "linear",
},
shape_name="custom",
shape=[64, 64, 64], # Use dimensions divisible by 64
output_dir=self.temp_dir,
benchmark_mode="inference",
)
result = run(config)
self.assertIsInstance(result, BenchmarkResult)
self.assertTrue(
hasattr(result, "quantized_model_compiled_inference_time_in_ms")
)

@patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
def test_run_inference_with_block_sparsity(self, mock_string_to_config):
"""Test running inference with sparsity configurations"""
benchmarks/microbenchmarks/utils.py (17 changes: 0 additions & 17 deletions)
@@ -19,7 +19,6 @@
Float8WeightOnlyConfig,
FPXWeightOnlyConfig,
GemliteUIntXWeightOnlyConfig,
Int4WeightOnlyConfig,
Int8DynamicActivationInt4WeightConfig,
Int8DynamicActivationInt8WeightConfig,
Int8WeightOnlyConfig,
@@ -195,18 +194,6 @@ def string_to_config(
return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True)
else:
return Int8DynamicActivationInt8WeightConfig()
if "int4wo" in quantization:
use_hqq = False
if "hqq" in quantization:
use_hqq = True
group_size = int(quantization.split("-")[1])
assert group_size in [
32,
64,
128,
256,
], f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}"
return Int4WeightOnlyConfig(group_size=group_size, use_hqq=use_hqq, version=1)
elif "int8adq-int4w-symm" in quantization:
from torchao.dtypes import CutlassInt4PackedLayout

@@ -226,10 +213,6 @@
act_mapping_type=MappingType.SYMMETRIC,
layout=MarlinQQQLayout(),
)
elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity):
from torchao.dtypes import MarlinSparseLayout

return Int4WeightOnlyConfig(layout=MarlinSparseLayout(), version=1)
if "fp6" in quantization:
return FPXWeightOnlyConfig(3, 2)
elif "uintx" in quantization:
docs/source/serialization.rst (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@ Here is the serialization and deserialization flow::
print(f"original model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

example_inputs = m.example_inputs(dtype=dtype, device="cuda")
quantize_(m, Int4WeightOnlyConfig(version=1))
quantize_(m, Int4WeightOnlyConfig())
print(f"quantized model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

ref = m(*example_inputs)
docs/source/torchao_vllm_integration.md (6 changes: 2 additions & 4 deletions)
@@ -44,8 +44,6 @@ from torchao.quantization import Int4WeightOnlyConfig
# Example configuration
config = Int4WeightOnlyConfig(
group_size=128,
use_hqq=True,
version=1,
)
assert isinstance(config, AOBaseConfig)
```
@@ -66,7 +64,7 @@ config = FqnToConfig({
"model.layers.0.self_attn.q_proj": Int4WeightOnlyConfig(group_size=64),
"model.layers.0.self_attn.k_proj": Int4WeightOnlyConfig(group_size=64),
"model.layers.0.mlp.gate_proj": Int8WeightOnlyConfig(),
"_default": Int4WeightOnlyConfig(group_size=128, version=1) # Default for other modules
"_default": Int4WeightOnlyConfig(group_size=128) # Default for other modules
})
```
(usage-examples)=
@@ -81,7 +79,7 @@ from torchao.quantization import Int4WeightOnlyConfig

# Create quantization configuration
quantization_config = TorchAoConfig(
quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1)
quant_type=Int4WeightOnlyConfig(group_size=128)
)

# Load and automatically quantize the model
scripts/quick_start.py (3 changes: 1 addition & 2 deletions)
@@ -38,8 +38,7 @@ def forward(self, x):
# | torchao quantization |
# ========================

# torch 2.4+ only
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))
quantize_(model, Int4WeightOnlyConfig(group_size=32))


# =============
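Note: a minimal sketch of the updated call pattern that scripts/quick_start.py lands on after this change. It assumes a CUDA device with bfloat16 weights and uses an illustrative toy model; only APIs already shown in the hunks above (`quantize_`, `Int4WeightOnlyConfig(group_size=32)`) are relied on.

```python
import torch

from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Illustrative toy model; any module containing nn.Linear layers is quantized in place.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024, bias=False))
model = model.to(torch.bfloat16).to("cuda")

# No `version=1` argument anymore: the config's default version is used after this change.
quantize_(model, Int4WeightOnlyConfig(group_size=32))

# The quantized model still serves ordinary forward passes.
x = torch.randn(8, 1024, dtype=torch.bfloat16, device="cuda")
y = model(x)
```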
test/dtypes/test_affine_quantized.py (111 changes: 4 additions & 107 deletions)
@@ -17,8 +17,6 @@
from torchao.core.config import AOBaseConfig
from torchao.dtypes import (
CutlassInt4PackedLayout,
Int4CPULayout,
Int4XPULayout,
PlainLayout,
SemiSparseLayout,
to_affine_quantized_intx,
@@ -28,14 +26,13 @@
Float8WeightOnlyConfig,
GemliteUIntXWeightOnlyConfig,
Int4DynamicActivationInt4WeightConfig,
Int4WeightOnlyConfig,
Int8DynamicActivationInt4WeightConfig,
Int8DynamicActivationInt8WeightConfig,
Int8WeightOnlyConfig,
quantize_,
)
from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain
from torchao.testing.utils import skip_if_no_cuda, skip_if_no_gemlite, skip_if_rocm
from torchao.quantization.quant_primitives import MappingType
from torchao.testing.utils import skip_if_no_gemlite, skip_if_rocm
from torchao.utils import (
check_cpu_version,
check_xpu_version,
@@ -62,24 +59,10 @@ def get_quantization_functions(
]
if do_int4:
if check_cpu_version(device):
base_functions.append(
Int4WeightOnlyConfig(group_size=32, layout=Int4CPULayout(), version=1)
)
pass
elif check_xpu_version(device):
base_functions.append(
Int4WeightOnlyConfig(group_size=32, layout=Int4XPULayout(), version=1)
)
if int4_zp_int:
base_functions.append(
Int4WeightOnlyConfig(
group_size=32,
layout=Int4XPULayout(),
zero_point_domain=ZeroPointDomain.INT,
version=1,
)
)
pass
else:
base_functions.append(Int4WeightOnlyConfig(group_size=32, version=1))
if device == "cuda" and not is_ROCM():
base_functions.append(
Int8DynamicActivationInt4WeightConfig(
@@ -107,26 +90,6 @@ class TestAffineQuantized(TestCase):
["xpu"] if torch.xpu.is_available() else []
)

@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
def test_tensor_core_layout_transpose(self):
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
t = linear.weight
shape = t.shape
apply_int4_weight_only_quant = Int4WeightOnlyConfig(group_size=32, version=1)
quantize_(linear, apply_int4_weight_only_quant)
ql = linear
aqt = ql.weight
aqt_shape = aqt.shape
self.assertEqual(aqt_shape, shape)

# transpose shape test
for _ in range(10):
t = t.t()
aqt = aqt.t()
shape = t.shape
aqt_shape = aqt.shape
self.assertEqual(aqt_shape, shape)

@unittest.skipIf(len(GPU_DEVICES) == 0, "Need GPU available")
def test_weights_only(self):
for device in self.GPU_DEVICES:
@@ -338,20 +301,6 @@ def test_alias(self, device, dtype):
quantize_(dummy, Int8DynamicActivationInt8WeightConfig())
_ = dummy.weight[...]

@common_utils.parametrize("device", [_DEVICE])
@common_utils.parametrize("dtype", [torch.bfloat16])
@skip_if_no_cuda()
@skip_if_rocm("ROCm enablement in progress")
def test_slice_int4wo(self, device, dtype):
# in_feature not divisible by 1024
# out_feature not divisible by 8
# to test slice + padding for int4 weight only quantization
dummy = nn.Linear(256, 321, dtype=dtype, device=device)
quantize_(dummy, Int4WeightOnlyConfig(version=1))
# make sure these run without error
_ = dummy.weight.narrow(0, 0, 64)
_ = dummy.weight.narrow(1, 0, 128)

@common_utils.parametrize("device", [_DEVICE])
@common_utils.parametrize("dtype", [torch.float16, torch.bfloat16])
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
@@ -452,58 +401,6 @@ def test_matmul(self, device, dtype):
# make sure it runs
torch.matmul(x, w.t())

@common_utils.parametrize("device", [_DEVICE])
@common_utils.parametrize("dtype", [torch.bfloat16])
@skip_if_no_cuda()
@skip_if_rocm("ROCm enablement in progress")
def test_slice_and_copy_int4wo(self, device, dtype):
l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
l.weight = torch.nn.Parameter(
torch.zeros(1024, 1024, dtype=torch.bfloat16, device=_DEVICE)
)
quantize_(l, Int4WeightOnlyConfig(version=1))
param = l.weight
param_data = param.data
param_data = param_data.narrow(0, 0, 512)
assert (
param.data.tensor_impl.packed_weight.data_ptr()
== param_data.tensor_impl.packed_weight.data_ptr()
)
assert (
param.data.tensor_impl.scale_and_zero.data_ptr()
== param_data.tensor_impl.scale_and_zero.data_ptr()
)
assert param.data.dequantize()[0][0] == 0

# dummy_l has random input (shouldn't be 0)
dummy_l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
quantize_(dummy_l, Int4WeightOnlyConfig(version=1))
quantized = dummy_l.weight
quantized = quantized.narrow(0, 0, 512)

param_data.copy_(quantized)

# making sure param.data is updated
assert param.data.dequantize()[0][0] != 0

@common_utils.parametrize("device", [_DEVICE])
@common_utils.parametrize("dtype", [torch.bfloat16])
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
@skip_if_rocm("ROCm enablement in progress")
def test_mm_int4wo(self, device, dtype):
weight = torch.randn(512, 1024).to(device).to(dtype)
weight = weight.t()

l = torch.nn.Linear(512, 1024).to(device).to(dtype)
l.weight = torch.nn.Parameter(weight)
quantize_(l, Int4WeightOnlyConfig(version=1))
# weight shape: 1024 x 512
weight = l.weight

input = torch.randn(1, 512, device=device, dtype=dtype)
# make sure it runs
torch.nn.functional.linear(input, weight)


common_utils.instantiate_parametrized_tests(TestAffineQuantized)
common_utils.instantiate_parametrized_tests(TestAffineQuantizedBasic)
test/hqq/test_hqq_affine.py (17 changes: 1 addition & 16 deletions)
@@ -8,13 +8,11 @@
import torch

from torchao.quantization import (
Int4WeightOnlyConfig,
MappingType,
UIntXWeightOnlyConfig,
ZeroPointDomain,
quantize_,
)
from torchao.testing.utils import skip_if_rocm

cuda_available = torch.cuda.is_available()

@@ -54,12 +52,7 @@ def _eval_hqq(dtype):
in_features=in_features, out_features=out_features, bias=False
)
dummy_linear.weight.data = W
if dtype == torch.uint4:
config = Int4WeightOnlyConfig(
group_size=max(block_size), use_hqq=True, version=1
)
else:
config = UIntXWeightOnlyConfig(dtype, group_size=max(block_size), use_hqq=True)
config = UIntXWeightOnlyConfig(dtype, group_size=max(block_size), use_hqq=True)
quantize_(dummy_linear, config)
q_tensor_hqq = dummy_linear.weight

@@ -113,14 +106,6 @@ def test_hqq_plain_5bit(self):
ref_dot_product_error=0.000704,
)

@skip_if_rocm("ROCm enablement in progress")
def test_hqq_plain_4bit(self):
self._test_hqq(
dtype=torch.uint4,
ref_dequantize_error=0.000487,
ref_dot_product_error=0.001472,
)

def test_hqq_plain_3bit(self):
self._test_hqq(
dtype=torch.uint3,