Enable test_int8_tensor.py on non-CUDA accelerators (replace hard-coded "cuda" with the current accelerator device)

zxd1997066 · zxd1997066 · commit 669725393043 · 2025-12-19T14:03:05.000+08:00
diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -26,7 +26,7 @@
 from torchao.quantization.utils import compute_error, get_block_size
 from torchao.testing.model_architectures import ToyTwoLinearModel
 from torchao.testing.utils import TorchAOIntegrationTestCase
-from torchao.utils import torch_version_at_least
+from torchao.utils import get_current_accelerator_device, torch_version_at_least
 
 INT8_TEST_CONFIGS = [
     Int8WeightOnlyConfig(version=2, granularity=PerTensor()),
@@ -38,9 +38,10 @@
         version=2, granularity=PerRow(), act_mapping_type=MappingType.SYMMETRIC
     ),
 ]
+_DEVICE = get_current_accelerator_device()
 
 
-@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
 @common_utils.instantiate_parametrized_tests
 class TestInt8Tensor(TorchAOIntegrationTestCase):
     def setUp(self):
@@ -60,7 +61,7 @@ def test_creation_and_attributes(self, config):
             self.test_shape[0],
             bias=False,
             dtype=self.dtype,
-            device="cuda",
+            device=_DEVICE,
         )
         quantize_(linear, config)
 
@@ -99,8 +100,8 @@ def test_int8_linear_variants(
         torch.compiler.reset()
 
         M, N, K = sizes
-        input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
-        model = ToyTwoLinearModel(K, N, K, dtype=dtype, device="cuda").eval()
+        input_tensor = torch.randn(*M, K, dtype=dtype, device=_DEVICE)
+        model = ToyTwoLinearModel(K, N, K, dtype=dtype, device=_DEVICE).eval()
         model_q = copy.deepcopy(model)
 
         quantize_(model_q, config)
@@ -128,7 +129,7 @@ def test_int8_linear_variants(
         )
 
     @common_utils.parametrize("config", INT8_TEST_CONFIGS)
-    @common_utils.parametrize("device", ["cpu", "cuda"])
+    @common_utils.parametrize("device", ["cpu", _DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
     def test_slice(self, config, device, dtype):
         """Test tensor slicing with per-row quantization"""
@@ -159,8 +160,8 @@ def test_slice(self, config, device, dtype):
     def test_index_select(self, config):
         """test that `x_0 = x[0]` works when `x` is a 2D quantized tensor."""
         N, K = 256, 512
-        x = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)
-        linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device="cuda")
+        x = torch.randn(N, K, device=_DEVICE, dtype=torch.bfloat16)
+        linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device=_DEVICE)
         linear.weight.data = x
 
         quantize_(linear, config)
@@ -187,7 +188,7 @@ def test_index_select(self, config):
     def test_dequantization_accuracy(self, config):
         """Test dequantization accuracy separately"""
         linear = torch.nn.Linear(
-            256, 512, bias=False, dtype=torch.bfloat16, device="cuda"
+            256, 512, bias=False, dtype=torch.bfloat16, device=_DEVICE
         )
         weight_fp = copy.deepcopy(linear.weight)
         quantize_(linear, config)
@@ -208,14 +209,14 @@ def test_available_gpu_kernels(self):
 
         M, K, N = 128, 256, 512
         m = torch.nn.Sequential(
-            torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16)
+            torch.nn.Linear(K, N, device=_DEVICE, dtype=torch.bfloat16)
         )
 
         config = Int8DynamicActivationInt8WeightConfig(version=2)
         quantize_(m, config)
 
         m = torch.compile(m)
-        x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+        x = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
 
         out, code = run_and_get_code(m, x)
 
@@ -248,7 +249,7 @@ def test_pin_memory(self, config):
         )
 
 
-@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
 @common_utils.instantiate_parametrized_tests
 class TestInt8StaticQuant(TorchAOIntegrationTestCase):
     @common_utils.parametrize("granularity", [PerRow(), PerTensor()])
@@ -257,9 +258,9 @@ def test_static_activation_per_row_int8_weight(self, granularity, dtype):
         torch.compiler.reset()
 
         M, N, K = 32, 32, 32
-        input_tensor = torch.randn(M, K, dtype=dtype, device="cuda")
+        input_tensor = torch.randn(M, K, dtype=dtype, device=_DEVICE)
 
-        model = torch.nn.Linear(K, N, bias=False).eval().to(device="cuda", dtype=dtype)
+        model = torch.nn.Linear(K, N, bias=False).eval().to(device=_DEVICE, dtype=dtype)
         model_static_quant = copy.deepcopy(model)
         model_dynamic_quant = copy.deepcopy(model)