diff --git a/cli/alora/train.py b/cli/alora/train.py index aaeb52b6..408864ca 100644 --- a/cli/alora/train.py +++ b/cli/alora/train.py @@ -133,6 +133,13 @@ def train_model( model_base = AutoModelForCausalLM.from_pretrained( base_model, device_map=device_map, use_cache=False ) + + # `fp16=True` enables CUDA-specific mixed precision via GradScaler, which doesn't function properly on cpu or mps. + # Check all the model's parameters to ensure it's okay to use. + use_fp16 = all( + param.device.type != "cpu" and param.device.type != "mps" + for param in model_base.parameters() + ) except NotImplementedError as e: if "meta tensor" in str(e): raise RuntimeError( @@ -176,7 +183,7 @@ def train_model( max_seq_length=max_length, per_device_train_batch_size=batch_size, gradient_accumulation_steps=grad_accum, - fp16=True, + fp16=use_fp16, ) trainer = SafeSaveTrainer( @@ -210,7 +217,7 @@ def train_model( max_seq_length=max_length, per_device_train_batch_size=batch_size, gradient_accumulation_steps=grad_accum, - fp16=True, + fp16=use_fp16, ) trainer = SafeSaveTrainer( diff --git a/mellea/backends/cache.py b/mellea/backends/cache.py index eb7791ef..e827cf5a 100644 --- a/mellea/backends/cache.py +++ b/mellea/backends/cache.py @@ -60,6 +60,9 @@ def get(self, key: str | int) -> Any | None: def put(self, key: str | int, value: Any): """Put a value into the cache.""" + if self.capacity == 0: + return + if key in self.cache: # If the key exists, move it to the end (most recent) self.cache.pop(key) diff --git a/mellea/backends/model_ids.py b/mellea/backends/model_ids.py index 2a355264..76ba1e5b 100644 --- a/mellea/backends/model_ids.py +++ b/mellea/backends/model_ids.py @@ -79,7 +79,7 @@ class ModelIdentifier: IBM_GRANITE_4_MICRO_3B = ModelIdentifier( hf_model_name="ibm-granite/granite-4.0-micro", ollama_name="granite4:micro", - watsonx_name="ibm/granite-4-small", + watsonx_name="ibm/granite-4-h-small", # Keeping hybrid version here for backwards compatibility. ) # Granite 3.3 Vision Model (2B) diff --git a/test/cli/test_alora_train.py b/test/cli/test_alora_train.py index 75370cad..7a1c0a35 100644 --- a/test/cli/test_alora_train.py +++ b/test/cli/test_alora_train.py @@ -28,6 +28,9 @@ def test_alora_config_creation(): mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer mock_model = Mock() + mock_param = Mock() + mock_param.device.type = "cuda" + mock_model.parameters.return_value = [mock_param] mock_model_class.from_pretrained.return_value = mock_model mock_peft_model = Mock() @@ -102,6 +105,9 @@ def test_lora_config_creation(): mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer mock_model = Mock() + mock_param = Mock() + mock_param.device.type = "cuda" + mock_model.parameters.return_value = [mock_param] mock_model_class.from_pretrained.return_value = mock_model mock_peft_model = Mock() @@ -175,7 +181,11 @@ def test_invocation_prompt_tokenization(): mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer # Setup other mocks - mock_model_class.from_pretrained.return_value = Mock() + mock_model = Mock() + mock_param = Mock() + mock_param.device.type = "cuda" + mock_model.parameters.return_value = [mock_param] + mock_model_class.from_pretrained.return_value = mock_model mock_get_peft_model.return_value = Mock() mock_ds = MagicMock()