11 changes: 9 additions & 2 deletions cli/alora/train.py
@@ -133,6 +133,13 @@ def train_model(
         model_base = AutoModelForCausalLM.from_pretrained(
             base_model, device_map=device_map, use_cache=False
         )
+
+        # `fp16=True` enables CUDA-specific mixed precision via GradScaler, which doesn't function properly on cpu or mps.
+        # Check all the model's parameters to ensure fp16 is safe to enable.
+        use_fp16 = all(
+            param.device.type != "cpu" and param.device.type != "mps"
+            for param in model_base.parameters()
+        )
     except NotImplementedError as e:
         if "meta tensor" in str(e):
             raise RuntimeError(
@@ -176,7 +183,7 @@ def train_model(
         max_seq_length=max_length,
         per_device_train_batch_size=batch_size,
         gradient_accumulation_steps=grad_accum,
-        fp16=True,
+        fp16=use_fp16,
     )

     trainer = SafeSaveTrainer(
@@ -210,7 +217,7 @@ def train_model(
         max_seq_length=max_length,
         per_device_train_batch_size=batch_size,
         gradient_accumulation_steps=grad_accum,
-        fp16=True,
+        fp16=use_fp16,
     )

     trainer = SafeSaveTrainer(
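For context on the new `use_fp16` flag: `fp16=True` in the trainer config routes through `torch.cuda.amp.GradScaler`, which assumes CUDA devices. A minimal standalone sketch of the same device check (the helper name `cuda_amp_safe` is illustrative, not part of this PR):

```python
import torch

def cuda_amp_safe(model: torch.nn.Module) -> bool:
    """True only if no parameter lives on cpu or mps, i.e. only when
    GradScaler-based fp16 mixed precision can safely be enabled."""
    return all(p.device.type not in ("cpu", "mps") for p in model.parameters())

# Usage: a plain nn.Linear lives on cpu by default, so fp16 stays off.
model = torch.nn.Linear(8, 8)
assert cuda_amp_safe(model) is False
```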
3 changes: 3 additions & 0 deletions mellea/backends/cache.py
@@ -60,6 +60,9 @@ def get(self, key: str | int) -> Any | None:

     def put(self, key: str | int, value: Any):
         """Put a value into the cache."""
+        if self.capacity == 0:
+            return
+
         if key in self.cache:
             # If the key exists, move it to the end (most recent)
             self.cache.pop(key)
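The guard above avoids a subtle edge case: in an `OrderedDict`-backed LRU cache, `capacity == 0` makes the eviction condition `len(cache) >= capacity` true even when the cache is empty, so eviction would `popitem()` an empty dict and raise `KeyError`. A minimal sketch of such a cache (the class body is assumed from the visible fragments, not copied from `mellea/backends/cache.py`):

```python
from collections import OrderedDict
from typing import Any

class LRUCache:
    def __init__(self, capacity: int):
        self.capacity = capacity
        self.cache: OrderedDict[str | int, Any] = OrderedDict()

    def put(self, key: str | int, value: Any):
        """Put a value into the cache."""
        # With zero capacity there is nothing to store; without this guard,
        # the eviction below would popitem() an empty dict and raise KeyError.
        if self.capacity == 0:
            return
        if key in self.cache:
            self.cache.pop(key)  # re-insert below to mark as most recent
        elif len(self.cache) >= self.capacity:
            self.cache.popitem(last=False)  # evict the least recently used
        self.cache[key] = value

cache = LRUCache(capacity=0)
cache.put("k", 1)  # silently ignored instead of raising
assert "k" not in cache.cache
```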
2 changes: 1 addition & 1 deletion mellea/backends/model_ids.py
@@ -79,7 +79,7 @@ class ModelIdentifier:
 IBM_GRANITE_4_MICRO_3B = ModelIdentifier(
     hf_model_name="ibm-granite/granite-4.0-micro",
     ollama_name="granite4:micro",
-    watsonx_name="ibm/granite-4-small",
+    watsonx_name="ibm/granite-4-h-small",  # Keeping hybrid version here for backwards compatibility.
 )

 # Granite 3.3 Vision Model (2B)
12 changes: 11 additions & 1 deletion test/cli/test_alora_train.py
@@ -28,6 +28,9 @@ def test_alora_config_creation():
         mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

         mock_model = Mock()
+        mock_param = Mock()
+        mock_param.device.type = "cuda"
+        mock_model.parameters.return_value = [mock_param]
         mock_model_class.from_pretrained.return_value = mock_model

         mock_peft_model = Mock()
@@ -102,6 +105,9 @@ def test_lora_config_creation():
         mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

         mock_model = Mock()
+        mock_param = Mock()
+        mock_param.device.type = "cuda"
+        mock_model.parameters.return_value = [mock_param]
         mock_model_class.from_pretrained.return_value = mock_model

         mock_peft_model = Mock()
@@ -175,7 +181,11 @@ def test_invocation_prompt_tokenization():
         mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer

         # Setup other mocks
-        mock_model_class.from_pretrained.return_value = Mock()
+        mock_model = Mock()
+        mock_param = Mock()
+        mock_param.device.type = "cuda"
+        mock_model.parameters.return_value = [mock_param]
+        mock_model_class.from_pretrained.return_value = mock_model
         mock_get_peft_model.return_value = Mock()

         mock_ds = MagicMock()
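All three tests need the same fix for the same reason: the training path now iterates `model.parameters()` and reads each parameter's `device.type`, and a bare `Mock()` is not iterable, so the `all(...)` check would raise `TypeError`. A sketch of the mocking pattern in isolation:

```python
from unittest.mock import Mock

# The train path iterates model.parameters() and inspects device.type,
# so the mocked model must yield at least one parameter-like object.
mock_model = Mock()
mock_param = Mock()
mock_param.device.type = "cuda"  # any non-cpu/mps type keeps fp16 enabled
mock_model.parameters.return_value = [mock_param]

assert all(p.device.type not in ("cpu", "mps") for p in mock_model.parameters())
```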