
Commit d4d1277

deprecate v1 of Int4WeightOnlyConfig
Summary: deprecate v1 of `Int4WeightOnlyConfig` and delete all callsites

ghstack-source-id: c5b1edd
ghstack-comment-id: 3670759710
Pull-Request: #3513
1 parent a3ec981 commit d4d1277
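
Across the callsites below, the migration is mechanical: drop the explicit `version=1` (and v1-only arguments such as `use_hqq` and `layout`) and take the config's current default version. A minimal before/after sketch of that pattern; the toy model here is hypothetical and not part of this diff:

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Hypothetical module; int4 weight-only generally expects bf16 weights on a GPU.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).to("cuda")

# Before (deprecated v1): quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))
# After: omit `version` so the current default is used
quantize_(model, Int4WeightOnlyConfig(group_size=32))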

File tree

25 files changed: +60 -891 lines


benchmarks/microbenchmarks/test/test_benchmark_inference.py

Lines changed: 0 additions & 31 deletions

@@ -49,37 +49,6 @@ def test_run_inference(self, mock_string_to_config):
             hasattr(result, "quantized_model_compiled_inference_time_in_ms")
         )
 
-    @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
-    def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
-        """Test running inference with sparsity configurations"""
-        # Mock string_to_config to return valid configs
-        from torchao.dtypes import MarlinSparseLayout
-        from torchao.quantization import Int4WeightOnlyConfig
-
-        # Test with semi-sparse config
-        mock_string_to_config.return_value = Int4WeightOnlyConfig(
-            layout=MarlinSparseLayout(),
-            version=1,
-        )
-        config = BenchmarkConfig(
-            quantization="marlin",
-            sparsity="semi-sparse",
-            params={
-                "high_precision_dtype": "torch.float32",
-                "device": "cpu",
-                "model_type": "linear",
-            },
-            shape_name="custom",
-            shape=[64, 64, 64],  # Use dimensions divisible by 64
-            output_dir=self.temp_dir,
-            benchmark_mode="inference",
-        )
-        result = run(config)
-        self.assertIsInstance(result, BenchmarkResult)
-        self.assertTrue(
-            hasattr(result, "quantized_model_compiled_inference_time_in_ms")
-        )
-
     @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
     def test_run_inference_with_block_sparsity(self, mock_string_to_config):
         """Test running inference with sparsity configurations"""

benchmarks/microbenchmarks/utils.py

Lines changed: 0 additions & 17 deletions

@@ -19,7 +19,6 @@
     Float8WeightOnlyConfig,
     FPXWeightOnlyConfig,
     GemliteUIntXWeightOnlyConfig,
-    Int4WeightOnlyConfig,
     Int8DynamicActivationInt4WeightConfig,
     Int8DynamicActivationInt8WeightConfig,
     Int8WeightOnlyConfig,
@@ -195,18 +194,6 @@ def string_to_config(
             return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True)
         else:
            return Int8DynamicActivationInt8WeightConfig()
-    if "int4wo" in quantization:
-        use_hqq = False
-        if "hqq" in quantization:
-            use_hqq = True
-        group_size = int(quantization.split("-")[1])
-        assert group_size in [
-            32,
-            64,
-            128,
-            256,
-        ], f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}"
-        return Int4WeightOnlyConfig(group_size=group_size, use_hqq=use_hqq, version=1)
     elif "int8adq-int4w-symm" in quantization:
         from torchao.dtypes import CutlassInt4PackedLayout
 
@@ -226,10 +213,6 @@ def string_to_config(
             act_mapping_type=MappingType.SYMMETRIC,
             layout=MarlinQQQLayout(),
         )
-    elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity):
-        from torchao.dtypes import MarlinSparseLayout
-
-        return Int4WeightOnlyConfig(layout=MarlinSparseLayout(), version=1)
     if "fp6" in quantization:
         return FPXWeightOnlyConfig(3, 2)
     elif "uintx" in quantization:

docs/source/serialization.rst

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ Here is the serialization and deserialization flow::
     print(f"original model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")
 
     example_inputs = m.example_inputs(dtype=dtype, device="cuda")
-    quantize_(m, Int4WeightOnlyConfig(version=1))
+    quantize_(m, Int4WeightOnlyConfig())
     print(f"quantized model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")
 
     ref = m(*example_inputs)

docs/source/torchao_vllm_integration.md

Lines changed: 2 additions & 4 deletions

@@ -44,8 +44,6 @@ from torchao.quantization import Int4WeightOnlyConfig
 # Example configuration
 config = Int4WeightOnlyConfig(
     group_size=128,
-    use_hqq=True,
-    version=1,
 )
 assert isinstance(config, AOBaseConfig)
 ```
@@ -66,7 +64,7 @@ config = FqnToConfig({
     "model.layers.0.self_attn.q_proj": Int4WeightOnlyConfig(group_size=64),
     "model.layers.0.self_attn.k_proj": Int4WeightOnlyConfig(group_size=64),
     "model.layers.0.mlp.gate_proj": Int8WeightOnlyConfig(),
-    "_default": Int4WeightOnlyConfig(group_size=128, version=1)  # Default for other modules
+    "_default": Int4WeightOnlyConfig(group_size=128)  # Default for other modules
 })
 ```
 (usage-examples)=
@@ -81,7 +79,7 @@ from torchao.quantization import Int4WeightOnlyConfig
 
 # Create quantization configuration
 quantization_config = TorchAoConfig(
-    quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1)
+    quant_type=Int4WeightOnlyConfig(group_size=128)
 )
 
 # Load and automatically quantize the model

scripts/quick_start.py

Lines changed: 1 addition & 2 deletions

@@ -38,8 +38,7 @@ def forward(self, x):
 # | torchao quantization |
 # ========================
 
-# torch 2.4+ only
-quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))
+quantize_(model, Int4WeightOnlyConfig(group_size=32))
 
 
 # =============

test/dtypes/test_affine_quantized.py

Lines changed: 4 additions & 107 deletions

@@ -17,8 +17,6 @@
 from torchao.core.config import AOBaseConfig
 from torchao.dtypes import (
     CutlassInt4PackedLayout,
-    Int4CPULayout,
-    Int4XPULayout,
     PlainLayout,
     SemiSparseLayout,
     to_affine_quantized_intx,
@@ -28,14 +26,13 @@
     Float8WeightOnlyConfig,
     GemliteUIntXWeightOnlyConfig,
     Int4DynamicActivationInt4WeightConfig,
-    Int4WeightOnlyConfig,
     Int8DynamicActivationInt4WeightConfig,
     Int8DynamicActivationInt8WeightConfig,
     Int8WeightOnlyConfig,
     quantize_,
 )
-from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain
-from torchao.testing.utils import skip_if_no_cuda, skip_if_no_gemlite, skip_if_rocm
+from torchao.quantization.quant_primitives import MappingType
+from torchao.testing.utils import skip_if_no_gemlite, skip_if_rocm
 from torchao.utils import (
     check_cpu_version,
     check_xpu_version,
@@ -62,24 +59,10 @@ def get_quantization_functions(
     ]
     if do_int4:
         if check_cpu_version(device):
-            base_functions.append(
-                Int4WeightOnlyConfig(group_size=32, layout=Int4CPULayout(), version=1)
-            )
+            pass
         elif check_xpu_version(device):
-            base_functions.append(
-                Int4WeightOnlyConfig(group_size=32, layout=Int4XPULayout(), version=1)
-            )
-            if int4_zp_int:
-                base_functions.append(
-                    Int4WeightOnlyConfig(
-                        group_size=32,
-                        layout=Int4XPULayout(),
-                        zero_point_domain=ZeroPointDomain.INT,
-                        version=1,
-                    )
-                )
+            pass
         else:
-            base_functions.append(Int4WeightOnlyConfig(group_size=32, version=1))
     if device == "cuda" and not is_ROCM():
         base_functions.append(
             Int8DynamicActivationInt4WeightConfig(
@@ -107,26 +90,6 @@ class TestAffineQuantized(TestCase):
         ["xpu"] if torch.xpu.is_available() else []
     )
 
-    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
-    def test_tensor_core_layout_transpose(self):
-        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
-        t = linear.weight
-        shape = t.shape
-        apply_int4_weight_only_quant = Int4WeightOnlyConfig(group_size=32, version=1)
-        quantize_(linear, apply_int4_weight_only_quant)
-        ql = linear
-        aqt = ql.weight
-        aqt_shape = aqt.shape
-        self.assertEqual(aqt_shape, shape)
-
-        # transpose shape test
-        for _ in range(10):
-            t = t.t()
-            aqt = aqt.t()
-            shape = t.shape
-            aqt_shape = aqt.shape
-            self.assertEqual(aqt_shape, shape)
-
     @unittest.skipIf(len(GPU_DEVICES) == 0, "Need GPU available")
     def test_weights_only(self):
         for device in self.GPU_DEVICES:
@@ -338,20 +301,6 @@ def test_alias(self, device, dtype):
         quantize_(dummy, Int8DynamicActivationInt8WeightConfig())
         _ = dummy.weight[...]
 
-    @common_utils.parametrize("device", [_DEVICE])
-    @common_utils.parametrize("dtype", [torch.bfloat16])
-    @skip_if_no_cuda()
-    @skip_if_rocm("ROCm enablement in progress")
-    def test_slice_int4wo(self, device, dtype):
-        # in_feature not divisible by 1024
-        # out_feature not divisible by 8
-        # to test slice + padding for int4 weight only quantization
-        dummy = nn.Linear(256, 321, dtype=dtype, device=device)
-        quantize_(dummy, Int4WeightOnlyConfig(version=1))
-        # make sure these run without error
-        _ = dummy.weight.narrow(0, 0, 64)
-        _ = dummy.weight.narrow(1, 0, 128)
-
     @common_utils.parametrize("device", [_DEVICE])
     @common_utils.parametrize("dtype", [torch.float16, torch.bfloat16])
     @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
@@ -452,58 +401,6 @@ def test_matmul(self, device, dtype):
         # make sure it runs
         torch.matmul(x, w.t())
 
-    @common_utils.parametrize("device", [_DEVICE])
-    @common_utils.parametrize("dtype", [torch.bfloat16])
-    @skip_if_no_cuda()
-    @skip_if_rocm("ROCm enablement in progress")
-    def test_slice_and_copy_int4wo(self, device, dtype):
-        l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
-        l.weight = torch.nn.Parameter(
-            torch.zeros(1024, 1024, dtype=torch.bfloat16, device=_DEVICE)
-        )
-        quantize_(l, Int4WeightOnlyConfig(version=1))
-        param = l.weight
-        param_data = param.data
-        param_data = param_data.narrow(0, 0, 512)
-        assert (
-            param.data.tensor_impl.packed_weight.data_ptr()
-            == param_data.tensor_impl.packed_weight.data_ptr()
-        )
-        assert (
-            param.data.tensor_impl.scale_and_zero.data_ptr()
-            == param_data.tensor_impl.scale_and_zero.data_ptr()
-        )
-        assert param.data.dequantize()[0][0] == 0
-
-        # dummy_l has random input (shouldn't be 0)
-        dummy_l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
-        quantize_(dummy_l, Int4WeightOnlyConfig(version=1))
-        quantized = dummy_l.weight
-        quantized = quantized.narrow(0, 0, 512)
-
-        param_data.copy_(quantized)
-
-        # making sure param.data is updated
-        assert param.data.dequantize()[0][0] != 0
-
-    @common_utils.parametrize("device", [_DEVICE])
-    @common_utils.parametrize("dtype", [torch.bfloat16])
-    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
-    @skip_if_rocm("ROCm enablement in progress")
-    def test_mm_int4wo(self, device, dtype):
-        weight = torch.randn(512, 1024).to(device).to(dtype)
-        weight = weight.t()
-
-        l = torch.nn.Linear(512, 1024).to(device).to(dtype)
-        l.weight = torch.nn.Parameter(weight)
-        quantize_(l, Int4WeightOnlyConfig(version=1))
-        # weight shape: 1024 x 512
-        weight = l.weight
-
-        input = torch.randn(1, 512, device=device, dtype=dtype)
-        # make sure it runs
-        torch.nn.functional.linear(input, weight)
-
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantized)
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedBasic)

test/hqq/test_hqq_affine.py

Lines changed: 1 addition & 16 deletions

@@ -8,13 +8,11 @@
 import torch
 
 from torchao.quantization import (
-    Int4WeightOnlyConfig,
     MappingType,
     UIntXWeightOnlyConfig,
     ZeroPointDomain,
     quantize_,
 )
-from torchao.testing.utils import skip_if_rocm
 
 cuda_available = torch.cuda.is_available()
 
@@ -54,12 +52,7 @@ def _eval_hqq(dtype):
         in_features=in_features, out_features=out_features, bias=False
     )
     dummy_linear.weight.data = W
-    if dtype == torch.uint4:
-        config = Int4WeightOnlyConfig(
-            group_size=max(block_size), use_hqq=True, version=1
-        )
-    else:
-        config = UIntXWeightOnlyConfig(dtype, group_size=max(block_size), use_hqq=True)
+    config = UIntXWeightOnlyConfig(dtype, group_size=max(block_size), use_hqq=True)
     quantize_(dummy_linear, config)
     q_tensor_hqq = dummy_linear.weight
 
@@ -113,14 +106,6 @@ def test_hqq_plain_5bit(self):
             ref_dot_product_error=0.000704,
         )
 
-    @skip_if_rocm("ROCm enablement in progress")
-    def test_hqq_plain_4bit(self):
-        self._test_hqq(
-            dtype=torch.uint4,
-            ref_dequantize_error=0.000487,
-            ref_dot_product_error=0.001472,
-        )
-
     def test_hqq_plain_3bit(self):
         self._test_hqq(
             dtype=torch.uint3,

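Aside on the HQQ tests: the uint4 branch in `_eval_hqq` (which routed through the deprecated `Int4WeightOnlyConfig(use_hqq=True, version=1)`) is deleted along with `test_hqq_plain_4bit`, so the remaining `UIntXWeightOnlyConfig` path now covers all uintx dtypes uniformly. A short usage sketch of that surviving path; the layer below is hypothetical:

import torch
from torchao.quantization import UIntXWeightOnlyConfig, quantize_

# Hypothetical layer; mirrors the surviving UIntXWeightOnlyConfig branch above.
linear = torch.nn.Linear(4096, 4096, bias=False, dtype=torch.bfloat16, device="cuda")
quantize_(linear, UIntXWeightOnlyConfig(torch.uint3, group_size=64, use_hqq=True))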