benchmarks/microbenchmarks/test/test_benchmark_inference.py (31 changes: 0 additions & 31 deletions)
@@ -49,37 +49,6 @@ def test_run_inference(self, mock_string_to_config):
hasattr(result, "quantized_model_compiled_inference_time_in_ms")
)

@patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
"""Test running inference with sparsity configurations"""
# Mock string_to_config to return valid configs
from torchao.dtypes import MarlinSparseLayout
from torchao.quantization import Int4WeightOnlyConfig

# Test with semi-sparse config
mock_string_to_config.return_value = Int4WeightOnlyConfig(
layout=MarlinSparseLayout(),
version=1,
)
config = BenchmarkConfig(
quantization="marlin",
sparsity="semi-sparse",
params={
"high_precision_dtype": "torch.float32",
"device": "cpu",
"model_type": "linear",
},
shape_name="custom",
shape=[64, 64, 64], # Use dimensions divisible by 64
output_dir=self.temp_dir,
benchmark_mode="inference",
)
result = run(config)
self.assertIsInstance(result, BenchmarkResult)
self.assertTrue(
hasattr(result, "quantized_model_compiled_inference_time_in_ms")
)

@patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
def test_run_inference_with_block_sparsity(self, mock_string_to_config):
"""Test running inference with sparsity configurations"""
benchmarks/microbenchmarks/utils.py (17 changes: 0 additions & 17 deletions)
@@ -19,7 +19,6 @@
Float8WeightOnlyConfig,
FPXWeightOnlyConfig,
GemliteUIntXWeightOnlyConfig,
Int4WeightOnlyConfig,
Int8DynamicActivationInt4WeightConfig,
Int8DynamicActivationInt8WeightConfig,
Int8WeightOnlyConfig,
@@ -195,18 +194,6 @@ def string_to_config(
return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True)
else:
return Int8DynamicActivationInt8WeightConfig()
if "int4wo" in quantization:
use_hqq = False
if "hqq" in quantization:
use_hqq = True
group_size = int(quantization.split("-")[1])
assert group_size in [
32,
64,
128,
256,
], f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}"
return Int4WeightOnlyConfig(group_size=group_size, use_hqq=use_hqq, version=1)
elif "int8adq-int4w-symm" in quantization:
from torchao.dtypes import CutlassInt4PackedLayout

@@ -226,10 +213,6 @@
act_mapping_type=MappingType.SYMMETRIC,
layout=MarlinQQQLayout(),
)
elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity):
from torchao.dtypes import MarlinSparseLayout

return Int4WeightOnlyConfig(layout=MarlinSparseLayout(), version=1)
if "fp6" in quantization:
return FPXWeightOnlyConfig(3, 2)
elif "uintx" in quantization:
docs/source/serialization.rst (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@ Here is the serialization and deserialization flow::
print(f"original model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

example_inputs = m.example_inputs(dtype=dtype, device="cuda")
quantize_(m, Int4WeightOnlyConfig(version=1))
quantize_(m, Int4WeightOnlyConfig())
print(f"quantized model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

ref = m(*example_inputs)
docs/source/torchao_vllm_integration.md (6 changes: 2 additions & 4 deletions)
@@ -44,8 +44,6 @@ from torchao.quantization import Int4WeightOnlyConfig
# Example configuration
config = Int4WeightOnlyConfig(
group_size=128,
use_hqq=True,
version=1,
)
assert isinstance(config, AOBaseConfig)
```
@@ -66,7 +64,7 @@ config = FqnToConfig({
"model.layers.0.self_attn.q_proj": Int4WeightOnlyConfig(group_size=64),
"model.layers.0.self_attn.k_proj": Int4WeightOnlyConfig(group_size=64),
"model.layers.0.mlp.gate_proj": Int8WeightOnlyConfig(),
"_default": Int4WeightOnlyConfig(group_size=128, version=1) # Default for other modules
"_default": Int4WeightOnlyConfig(group_size=128) # Default for other modules
})
```
(usage-examples)=
@@ -81,7 +79,7 @@ from torchao.quantization import Int4WeightOnlyConfig

# Create quantization configuration
quantization_config = TorchAoConfig(
quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1)
quant_type=Int4WeightOnlyConfig(group_size=128)
)

# Load and automatically quantize the model
scripts/quick_start.py (3 changes: 1 addition & 2 deletions)
@@ -38,8 +38,7 @@ def forward(self, x):
# | torchao quantization |
# ========================

# torch 2.4+ only
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))
quantize_(model, Int4WeightOnlyConfig(group_size=32))


# =============
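Note: a minimal sketch of the updated call pattern that scripts/quick_start.py lands on after this change. It assumes a CUDA device with bfloat16 weights and uses an illustrative toy model; only APIs already shown in the hunks above (`quantize_`, `Int4WeightOnlyConfig(group_size=32)`) are relied on.

```python
import torch

from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Illustrative toy model; any module containing nn.Linear layers is quantized in place.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024, bias=False))
model = model.to(torch.bfloat16).to("cuda")

# No `version=1` argument anymore: the config's default version is used after this change.
quantize_(model, Int4WeightOnlyConfig(group_size=32))

# The quantized model still serves ordinary forward passes.
x = torch.randn(8, 1024, dtype=torch.bfloat16, device="cuda")
y = model(x)
```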
test/dtypes/test_affine_quantized.py (111 changes: 4 additions & 107 deletions)
@@ -17,8 +17,6 @@
from torchao.core.config import AOBaseConfig
from torchao.dtypes import (
CutlassInt4PackedLayout,
Int4CPULayout,
Int4XPULayout,
PlainLayout,
SemiSparseLayout,
to_affine_quantized_intx,
@@ -28,14 +26,13 @@
Float8WeightOnlyConfig,
GemliteUIntXWeightOnlyConfig,
Int4DynamicActivationInt4WeightConfig,
Int4WeightOnlyConfig,
Int8DynamicActivationInt4WeightConfig,
Int8DynamicActivationInt8WeightConfig,
Int8WeightOnlyConfig,
quantize_,
)
from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain
from torchao.testing.utils import skip_if_no_cuda, skip_if_no_gemlite, skip_if_rocm
from torchao.quantization.quant_primitives import MappingType
from torchao.testing.utils import skip_if_no_gemlite, skip_if_rocm
from torchao.utils import (
check_cpu_version,
check_xpu_version,
@@ -62,24 +59,10 @@ def get_quantization_functions(
]
if do_int4:
if check_cpu_version(device):
base_functions.append(
Int4WeightOnlyConfig(group_size=32, layout=Int4CPULayout(), version=1)
)
pass
elif check_xpu_version(device):
base_functions.append(
Int4WeightOnlyConfig(group_size=32, layout=Int4XPULayout(), version=1)
)
if int4_zp_int:
base_functions.append(
Int4WeightOnlyConfig(
group_size=32,
layout=Int4XPULayout(),
zero_point_domain=ZeroPointDomain.INT,
version=1,
)
)
pass
else:
base_functions.append(Int4WeightOnlyConfig(group_size=32, version=1))
if device == "cuda" and not is_ROCM():
base_functions.append(
Int8DynamicActivationInt4WeightConfig(
@@ -107,26 +90,6 @@ class TestAffineQuantized(TestCase):
["xpu"] if torch.xpu.is_available() else []
)

@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
def test_tensor_core_layout_transpose(self):
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
t = linear.weight
shape = t.shape
apply_int4_weight_only_quant = Int4WeightOnlyConfig(group_size=32, version=1)
quantize_(linear, apply_int4_weight_only_quant)
ql = linear
aqt = ql.weight
aqt_shape = aqt.shape
self.assertEqual(aqt_shape, shape)

# transpose shape test
for _ in range(10):
t = t.t()
aqt = aqt.t()
shape = t.shape
aqt_shape = aqt.shape
self.assertEqual(aqt_shape, shape)

@unittest.skipIf(len(GPU_DEVICES) == 0, "Need GPU available")
def test_weights_only(self):
for device in self.GPU_DEVICES:
@@ -338,20 +301,6 @@ def test_alias(self, device, dtype):
quantize_(dummy, Int8DynamicActivationInt8WeightConfig())
_ = dummy.weight[...]

@common_utils.parametrize("device", [_DEVICE])
@common_utils.parametrize("dtype", [torch.bfloat16])
@skip_if_no_cuda()
@skip_if_rocm("ROCm enablement in progress")
def test_slice_int4wo(self, device, dtype):
# in_feature not divisible by 1024
# out_feature not divisible by 8
# to test slice + padding for int4 weight only quantization
dummy = nn.Linear(256, 321, dtype=dtype, device=device)
quantize_(dummy, Int4WeightOnlyConfig(version=1))
# make sure these run without error
_ = dummy.weight.narrow(0, 0, 64)
_ = dummy.weight.narrow(1, 0, 128)

@common_utils.parametrize("device", [_DEVICE])
@common_utils.parametrize("dtype", [torch.float16, torch.bfloat16])
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
@@ -452,58 +401,6 @@ def test_matmul(self, device, dtype):
# make sure it runs
torch.matmul(x, w.t())

@common_utils.parametrize("device", [_DEVICE])
@common_utils.parametrize("dtype", [torch.bfloat16])
@skip_if_no_cuda()
@skip_if_rocm("ROCm enablement in progress")
def test_slice_and_copy_int4wo(self, device, dtype):
l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
l.weight = torch.nn.Parameter(
torch.zeros(1024, 1024, dtype=torch.bfloat16, device=_DEVICE)
)
quantize_(l, Int4WeightOnlyConfig(version=1))
param = l.weight
param_data = param.data
param_data = param_data.narrow(0, 0, 512)
assert (
param.data.tensor_impl.packed_weight.data_ptr()
== param_data.tensor_impl.packed_weight.data_ptr()
)
assert (
param.data.tensor_impl.scale_and_zero.data_ptr()
== param_data.tensor_impl.scale_and_zero.data_ptr()
)
assert param.data.dequantize()[0][0] == 0

# dummy_l has random input (shouldn't be 0)
dummy_l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16)
quantize_(dummy_l, Int4WeightOnlyConfig(version=1))
quantized = dummy_l.weight
quantized = quantized.narrow(0, 0, 512)

param_data.copy_(quantized)

# making sure param.data is updated
assert param.data.dequantize()[0][0] != 0

@common_utils.parametrize("device", [_DEVICE])
@common_utils.parametrize("dtype", [torch.bfloat16])
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
@skip_if_rocm("ROCm enablement in progress")
def test_mm_int4wo(self, device, dtype):
weight = torch.randn(512, 1024).to(device).to(dtype)
weight = weight.t()

l = torch.nn.Linear(512, 1024).to(device).to(dtype)
l.weight = torch.nn.Parameter(weight)
quantize_(l, Int4WeightOnlyConfig(version=1))
# weight shape: 1024 x 512
weight = l.weight

input = torch.randn(1, 512, device=device, dtype=dtype)
# make sure it runs
torch.nn.functional.linear(input, weight)


common_utils.instantiate_parametrized_tests(TestAffineQuantized)
common_utils.instantiate_parametrized_tests(TestAffineQuantizedBasic)
test/hqq/test_hqq_affine.py (17 changes: 1 addition & 16 deletions)
@@ -8,13 +8,11 @@
import torch

from torchao.quantization import (
Int4WeightOnlyConfig,
MappingType,
UIntXWeightOnlyConfig,
ZeroPointDomain,
quantize_,
)
from torchao.testing.utils import skip_if_rocm

cuda_available = torch.cuda.is_available()

@@ -54,12 +52,7 @@ def _eval_hqq(dtype):
in_features=in_features, out_features=out_features, bias=False
)
dummy_linear.weight.data = W
if dtype == torch.uint4:
config = Int4WeightOnlyConfig(
group_size=max(block_size), use_hqq=True, version=1
)
else:
config = UIntXWeightOnlyConfig(dtype, group_size=max(block_size), use_hqq=True)
config = UIntXWeightOnlyConfig(dtype, group_size=max(block_size), use_hqq=True)
quantize_(dummy_linear, config)
q_tensor_hqq = dummy_linear.weight

@@ -113,14 +106,6 @@ def test_hqq_plain_5bit(self):
ref_dot_product_error=0.000704,
)

@skip_if_rocm("ROCm enablement in progress")
def test_hqq_plain_4bit(self):
self._test_hqq(
dtype=torch.uint4,
ref_dequantize_error=0.000487,
ref_dot_product_error=0.001472,
)

def test_hqq_plain_3bit(self):
self._test_hqq(
dtype=torch.uint3,