Skip to content

Commit cae8f3f

Browse files
degenfabian, bryce13950, and jlarson4
authored
updated loading in llama 2 demo to use transformer bridge (#1019)
* updated loading in llama 2 demo to use transformer bridge
* Updating LLaMA quantized model

Co-authored-by: Bryce Meyer <bryce13950@gmail.com>
Co-authored-by: jlarson4 <jonahalarson@comcast.net>
1 parent f7587dd commit cae8f3f

7 files changed

Lines changed: 246 additions & 429 deletions

File tree

.github/workflows/checks.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ jobs:
235235
# - "Head_Detector_Demo"
236236
# - "Interactive_Neuroscope"
237237
# - "LLaMA"
238-
# - "LLaMA2_GPU_Quantized"
238+
# - "LLaMA2_GPU_Quantized" # Requires quantization libs + too slow for CI timeout
239239
- "Main_Demo"
240240
# - "No_Position_Experiment"
241241
- "Othello_GPT"

demos/LLaMA2_GPU_Quantized.ipynb

Lines changed: 153 additions & 414 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
"tabulate>=0.9.0",
6464
]
6565
jupyter=["ipywidgets>=8.1.1", "jupyterlab>=3.5.0"]
66+
quantization=["bitsandbytes>=0.46.1", "optimum-quanto>=0.2.7"]
6667

6768
[tool.poetry.dependencies]
6869
accelerate=">=0.23.0" # Needed for Llama Models

transformer_lens/model_bridge/bridge.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ def boot_transformers(
149149
load_weights: bool = True,
150150
trust_remote_code: bool = False,
151151
model_class: Optional[type] = None,
152+
hf_model: Optional[Any] = None,
152153
) -> "TransformerBridge":
153154
"""Boot a model from HuggingFace (alias for sources.transformers.boot).
154155
@@ -162,6 +163,9 @@ def boot_transformers(
162163
trust_remote_code: Whether to trust remote code for custom model architectures.
163164
model_class: Optional HuggingFace model class to use instead of the default
164165
auto-detected class (e.g., BertForNextSentencePrediction).
166+
hf_model: Optional pre-loaded HuggingFace model to use instead of loading one. Useful
167+
for models loaded with custom configurations (e.g., quantization via
168+
BitsAndBytesConfig). When provided, load_weights is ignored.
165169
166170
Returns:
167171
The bridge to the loaded model.
@@ -177,6 +181,7 @@ def boot_transformers(
177181
load_weights=load_weights,
178182
trust_remote_code=trust_remote_code,
179183
model_class=model_class,
184+
hf_model=hf_model,
180185
)
181186

182187
@property

transformer_lens/model_bridge/sources/transformers.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ def boot(
270270
load_weights: bool = True,
271271
trust_remote_code: bool = False,
272272
model_class: Any | None = None,
273+
hf_model: Any | None = None,
273274
) -> TransformerBridge:
274275
"""Boot a model from HuggingFace.
275276
@@ -283,6 +284,9 @@ def boot(
283284
model_class: Optional HuggingFace model class to use instead of the default auto-detected
284285
class. When the class name matches a key in SUPPORTED_ARCHITECTURES, the corresponding
285286
adapter is selected automatically (e.g., BertForNextSentencePrediction).
287+
hf_model: Optional pre-loaded HuggingFace model to use instead of loading one. Useful for
288+
models loaded with custom configurations (e.g., quantization via BitsAndBytesConfig).
289+
When provided, load_weights is ignored.
286290
287291
Returns:
288292
The bridge to the loaded model.
@@ -368,7 +372,10 @@ def boot(
368372
# Default to eager (required for output_attentions hooks)
369373
model_kwargs["attn_implementation"] = "eager"
370374
adapter.prepare_loading(model_name, model_kwargs)
371-
if not load_weights:
375+
if hf_model is not None:
376+
# Use the pre-loaded model as-is (e.g., quantized models with custom device_map)
377+
pass
378+
elif not load_weights:
372379
from_config_kwargs = {}
373380
if trust_remote_code:
374381
from_config_kwargs["trust_remote_code"] = True

transformer_lens/supported_models.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -274,12 +274,9 @@
274274
"bigscience/bloom-3b": ["bloom-3b"],
275275
"bigscience/bloom-560m": ["bloom-560m"],
276276
"bigscience/bloom-7b1": ["bloom-7b1"],
277-
"codellama/CodeLlama-7b-hf": ["CodeLlamallama-2-7b", "codellama/CodeLlama-7b-hf"],
278-
"codellama/CodeLlama-7b-Instruct-hf": [
279-
"CodeLlama-7b-instruct",
280-
"codellama/CodeLlama-7b-Instruct-hf",
281-
],
282-
"codellama/CodeLlama-7b-Python-hf": ["CodeLlama-7b-python", "codellama/CodeLlama-7b-Python-hf"],
277+
"codellama/CodeLlama-7b-hf": ["CodeLlamallama-2-7b"],
278+
"codellama/CodeLlama-7b-Instruct-hf": ["CodeLlama-7b-instruct"],
279+
"codellama/CodeLlama-7b-Python-hf": ["CodeLlama-7b-python"],
283280
"distilgpt2": ["distillgpt2", "distill-gpt2", "distil-gpt2", "gpt2-xs"],
284281
"EleutherAI/gpt-j-6B": ["gpt-j-6B", "gpt-j", "gptj"],
285282
"EleutherAI/gpt-neo-1.3B": ["gpt-neo-1.3B", "gpt-neo-medium", "neo-medium"],
@@ -404,16 +401,16 @@
404401
"EleutherAI/pythia-19m-v0",
405402
"pythia-19m-v0",
406403
],
407-
"facebook/hubert-base-ls960": ["facebook/hubert-base-ls960", "hubert-base-ls960"],
404+
"facebook/hubert-base-ls960": ["hubert-base-ls960"],
408405
"facebook/opt-1.3b": ["opt-1.3b", "opt-medium"],
409406
"facebook/opt-125m": ["opt-125m", "opt-small", "opt"],
410407
"facebook/opt-13b": ["opt-13b", "opt-xxl"],
411408
"facebook/opt-2.7b": ["opt-2.7b", "opt-large"],
412409
"facebook/opt-30b": ["opt-30b", "opt-xxxl"],
413410
"facebook/opt-6.7b": ["opt-6.7b", "opt-xl"],
414411
"facebook/opt-66b": ["opt-66b", "opt-xxxxl"],
415-
"facebook/wav2vec2-base": ["facebook/wav2vec2-base", "wav2vec2-base", "w2v2-base"],
416-
"facebook/wav2vec2-large": ["facebook/wav2vec2-large", "wav2vec2-large", "w2v2-large"],
412+
"facebook/wav2vec2-base": ["wav2vec2-base", "w2v2-base"],
413+
"facebook/wav2vec2-large": ["wav2vec2-large", "w2v2-large"],
417414
"google-bert/bert-base-cased": ["bert-base-cased"],
418415
"google-bert/bert-base-uncased": ["bert-base-uncased"],
419416
"google-bert/bert-large-cased": ["bert-large-cased"],
@@ -450,11 +447,11 @@
450447
"llama-30b-hf": ["llama-30b"],
451448
"llama-65b-hf": ["llama-65b"],
452449
"llama-7b-hf": ["llama-7b"],
453-
"meta-llama/Llama-2-13b-chat-hf": ["Llama-2-13b-chat", "meta-llama/Llama-2-13b-chat-hf"],
454-
"meta-llama/Llama-2-13b-hf": ["Llama-2-13b", "meta-llama/Llama-2-13b-hf"],
450+
"meta-llama/Llama-2-13b-chat-hf": ["Llama-2-13b-chat"],
451+
"meta-llama/Llama-2-13b-hf": ["Llama-2-13b"],
455452
"meta-llama/Llama-2-70b-chat-hf": ["Llama-2-70b-chat", "meta-llama-2-70b-chat-hf"],
456-
"meta-llama/Llama-2-7b-chat-hf": ["Llama-2-7b-chat", "meta-llama/Llama-2-7b-chat-hf"],
457-
"meta-llama/Llama-2-7b-hf": ["Llama-2-7b", "meta-llama/Llama-2-7b-hf"],
453+
"meta-llama/Llama-2-7b-chat-hf": ["Llama-2-7b-chat"],
454+
"meta-llama/Llama-2-7b-hf": ["Llama-2-7b"],
458455
"microsoft/phi-1": ["phi-1"],
459456
"microsoft/phi-1_5": ["phi-1_5"],
460457
"microsoft/phi-2": ["phi-2"],

uv.lock

Lines changed: 68 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)