From e98c0996e05954bf9e9f0514ee9d810fe780e8ec Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Thu, 7 Aug 2025 21:12:25 -0400 Subject: [PATCH 01/16] Testing scheduling and sampling. --- src/streamdiffusion/config.py | 2 + src/streamdiffusion/pipeline.py | 438 ++++++++++++++++-- .../stream_parameter_updater.py | 3 +- src/streamdiffusion/wrapper.py | 32 ++ 4 files changed, 430 insertions(+), 45 deletions(-) diff --git a/src/streamdiffusion/config.py b/src/streamdiffusion/config.py index 74f12931..19dfeba0 100644 --- a/src/streamdiffusion/config.py +++ b/src/streamdiffusion/config.py @@ -128,6 +128,8 @@ def _extract_wrapper_params(config: Dict[str, Any]) -> Dict[str, Any]: 'normalize_prompt_weights': config.get('normalize_prompt_weights', True), 'normalize_seed_weights': config.get('normalize_seed_weights', True), 'enable_pytorch_fallback': config.get('enable_pytorch_fallback', False), + 'scheduler': config.get('scheduler', 'lcm'), + 'sampler': config.get('sampler', 'normal'), } if 'controlnets' in config and config['controlnets']: param_map['use_controlnet'] = True diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py index c605f51f..99899818 100644 --- a/src/streamdiffusion/pipeline.py +++ b/src/streamdiffusion/pipeline.py @@ -4,7 +4,15 @@ import numpy as np import PIL.Image import torch -from diffusers import LCMScheduler, StableDiffusionPipeline +from diffusers import ( + LCMScheduler, + StableDiffusionPipeline, + DPMSolverMultistepScheduler, + UniPCMultistepScheduler, + DDIMScheduler, + EulerDiscreteScheduler, + TCDScheduler, +) from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import ( retrieve_latents, @@ -33,6 +41,8 @@ def __init__( cfg_type: Literal["none", "full", "self", "initialize"] = "self", normalize_prompt_weights: bool = True, normalize_seed_weights: bool = True, + scheduler: Literal["lcm", "tcd", "dpm++ 2m", "uni_pc", "ddim", "euler"] = "lcm", + sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = "normal", ) -> None: self.device = pipe.device self.dtype = torch_dtype @@ -48,6 +58,8 @@ def __init__( self.denoising_steps_num = len(t_index_list) self.cfg_type = cfg_type + self.scheduler_type = scheduler + self.sampler_type = sampler # Detect model type detection_result = detect_model(pipe.unet, pipe) @@ -84,7 +96,9 @@ def __init__( self.pipe = pipe self.image_processor = VaeImageProcessor(pipe.vae_scale_factor) - self.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) + # Initialize scheduler based on configuration + self.scheduler = self._initialize_scheduler(scheduler, sampler, pipe.scheduler.config) + self.text_encoder = pipe.text_encoder self.unet = pipe.unet self.vae = pipe.vae @@ -95,10 +109,75 @@ def __init__( if self.is_sdxl: self.add_text_embeds = None self.add_time_ids = None + logger.log(logging.INFO, f"[PIPELINE] SDXL Detected: Using {scheduler} scheduler with {sampler} sampler") # Initialize parameter updater self._param_updater = StreamParameterUpdater(self, normalize_prompt_weights, normalize_seed_weights) + def _initialize_scheduler(self, scheduler_type: str, sampler_type: str, config): + """Initialize scheduler based on type and sampler configuration.""" + # Map sampler types to configuration parameters + sampler_config = { + "simple": {"timestep_spacing": "linspace"}, + "sgm uniform": {"timestep_spacing": "trailing"}, # SGM Uniform is typically trailing + "normal": {}, # Default configuration + "ddim": 
{"timestep_spacing": "leading"}, # DDIM default per documentation + "beta": {"beta_schedule": "scaled_linear"}, + "karras": {}, # Karras sigmas will be enabled in scheduler-specific code + } + + # Get sampler-specific configuration + sampler_params = sampler_config.get(sampler_type, {}) + + print(f"Sampler params: {sampler_params}") + print(f"Scheduler type: {scheduler_type}") + + # Create scheduler based on type + if scheduler_type == "lcm": + return LCMScheduler.from_config(config, **sampler_params) + elif scheduler_type == "tcd": + return TCDScheduler.from_config(config, **sampler_params) + elif scheduler_type == "dpm++ 2m": + # DPM++ 2M typically uses solver_order=2 and algorithm_type="dpmsolver++" + return DPMSolverMultistepScheduler.from_config( + config, + solver_order=2, + algorithm_type="dpmsolver++", + use_karras_sigmas=(sampler_type == "karras"), # Enable Karras sigmas if requested + **sampler_params + ) + elif scheduler_type == "uni_pc": + # UniPC: solver_order=2 for guided sampling, solver_type="bh2" by default + return UniPCMultistepScheduler.from_config( + config, + solver_order=2, # Good default for guided sampling + solver_type="bh2", # Default from documentation + disable_corrector=[], # No corrector disabled by default + use_karras_sigmas=(sampler_type == "karras"), # Enable Karras sigmas if requested + **sampler_params + ) + elif scheduler_type == "ddim": + # DDIM defaults to leading timestep spacing, but trailing can be better + return DDIMScheduler.from_config( + config, + set_alpha_to_one=True, # Default per documentation + steps_offset=0, # Default per documentation + prediction_type="epsilon", # Default per documentation + **sampler_params + ) + elif scheduler_type == "euler": + # Euler can use Karras sigmas for improved quality + return EulerDiscreteScheduler.from_config( + config, + use_karras_sigmas=(sampler_type == "karras"), # Enable Karras sigmas if requested + prediction_type="epsilon", # Default per documentation + **sampler_params + ) + else: + # Default to LCM + logger.warning(f"Unknown scheduler type '{scheduler_type}', falling back to LCM") + return LCMScheduler.from_config(config, **sampler_params) + def load_lcm_lora( self, pretrained_model_name_or_path_or_dict: Union[ @@ -273,12 +352,42 @@ def prepare( # make sub timesteps list based on the indices in the t_list list and the values in the timesteps list self.sub_timesteps = [] + max_timestep_index = len(self.timesteps) - 1 + for t in self.t_list: - self.sub_timesteps.append(self.timesteps[t]) - - sub_timesteps_tensor = torch.tensor( - self.sub_timesteps, dtype=torch.long, device=self.device - ) + # Clamp t_index to valid range to prevent index out of bounds + if t > max_timestep_index: + logger.warning(f"t_index {t} is out of bounds for scheduler with {len(self.timesteps)} timesteps. Clamping to {max_timestep_index}") + t = max_timestep_index + elif t < 0: + logger.warning(f"t_index {t} is negative. 
Clamping to 0") + t = 0 + + timestep_value = self.timesteps[t] + # Convert tensor timesteps to scalar values for indexing operations + if isinstance(timestep_value, torch.Tensor): + timestep_scalar = timestep_value.cpu().item() + else: + timestep_scalar = timestep_value + self.sub_timesteps.append(timestep_scalar) + + # Create tensor version for UNet calls + # Handle both integer and floating-point timesteps from different schedulers + # Some schedulers like Euler may return floating-point timesteps + if len(self.sub_timesteps) > 0: + # Always create the tensor from scalar values to avoid device issues + try: + # Try integer first for compatibility + sub_timesteps_tensor = torch.tensor( + self.sub_timesteps, dtype=torch.long, device=self.device + ) + except (TypeError, ValueError): + # Fallback for floating-point values + sub_timesteps_tensor = torch.tensor( + self.sub_timesteps, dtype=torch.float32, device=self.device + ) + else: + sub_timesteps_tensor = torch.tensor([], dtype=torch.long, device=self.device) self.sub_timesteps_tensor = torch.repeat_interleave( sub_timesteps_tensor, repeats=self.frame_bff_size if self.use_denoising_batch else 1, @@ -292,12 +401,11 @@ def prepare( self.stock_noise = torch.zeros_like(self.init_noise) + # Handle scheduler-specific scaling calculations c_skip_list = [] c_out_list = [] for timestep in self.sub_timesteps: - c_skip, c_out = self.scheduler.get_scalings_for_boundary_condition_discrete( - timestep - ) + c_skip, c_out = self._get_scheduler_scalings(timestep) c_skip_list.append(c_skip) c_out_list.append(c_out) @@ -315,8 +423,25 @@ def prepare( alpha_prod_t_sqrt_list = [] beta_prod_t_sqrt_list = [] for timestep in self.sub_timesteps: - alpha_prod_t_sqrt = self.scheduler.alphas_cumprod[timestep].sqrt() - beta_prod_t_sqrt = (1 - self.scheduler.alphas_cumprod[timestep]).sqrt() + # Convert floating-point timesteps to integers for tensor indexing + if isinstance(timestep, float): + timestep_idx = int(round(timestep)) + else: + timestep_idx = timestep + + # Ensure timestep_idx is within bounds + max_idx = len(self.scheduler.alphas_cumprod) - 1 + if timestep_idx > max_idx: + logger.warning(f"Timestep index {timestep_idx} out of bounds for alphas_cumprod (max: {max_idx}). Clamping to {max_idx}") + timestep_idx = max_idx + elif timestep_idx < 0: + logger.warning(f"Timestep index {timestep_idx} is negative. Clamping to 0") + timestep_idx = 0 + + # Access scheduler tensors and move to device as needed + alpha_cumprod = self.scheduler.alphas_cumprod[timestep_idx].to(device=self.device, dtype=self.dtype) + alpha_prod_t_sqrt = alpha_cumprod.sqrt() + beta_prod_t_sqrt = (1 - alpha_cumprod).sqrt() alpha_prod_t_sqrt_list.append(alpha_prod_t_sqrt) beta_prod_t_sqrt_list.append(beta_prod_t_sqrt) alpha_prod_t_sqrt = ( @@ -342,7 +467,8 @@ def prepare( #NOTE: this is a hack. Pipeline needs a major refactor along with stream parameter updater. self.update_prompt(prompt) - if not self.use_denoising_batch: + # Only collapse tensors to a single element for non-batched LCM path. + if (not self.use_denoising_batch) and self._uses_lcm_logic(): self.sub_timesteps_tensor = self.sub_timesteps_tensor[0] self.alpha_prod_t_sqrt = self.alpha_prod_t_sqrt[0] self.beta_prod_t_sqrt = self.beta_prod_t_sqrt[0] @@ -351,6 +477,31 @@ def prepare( self.c_skip = self.c_skip.to(self.device) self.c_out = self.c_out.to(self.device) + def _get_scheduler_scalings(self, timestep): + """ + Get LCM-specific scaling factors for boundary conditions. 
+ Only used for LCMScheduler - other schedulers handle scaling in their step() method. + """ + if isinstance(self.scheduler, LCMScheduler): + c_skip, c_out = self.scheduler.get_scalings_for_boundary_condition_discrete(timestep) + # Ensure returned values are tensors on the correct device + if not isinstance(c_skip, torch.Tensor): + c_skip = torch.tensor(c_skip, device=self.device, dtype=self.dtype) + else: + c_skip = c_skip.to(device=self.device, dtype=self.dtype) + if not isinstance(c_out, torch.Tensor): + c_out = torch.tensor(c_out, device=self.device, dtype=self.dtype) + else: + c_out = c_out.to(device=self.device, dtype=self.dtype) + return c_skip, c_out + else: + # For non-LCM schedulers, we don't use boundary condition scaling + # Their step() method handles all the necessary scaling internally + logger.debug(f"Scheduler {type(self.scheduler)} doesn't use boundary condition scaling") + c_skip = torch.tensor(1.0, device=self.device, dtype=self.dtype) + c_out = torch.tensor(1.0, device=self.device, dtype=self.dtype) + return c_skip, c_out + @torch.no_grad() def update_prompt(self, prompt: str) -> None: self._param_updater.update_stream_params( @@ -433,7 +584,55 @@ def get_normalize_seed_weights(self) -> bool: """Get the current seed weight normalization setting.""" return self._param_updater.get_normalize_seed_weights() + def set_scheduler( + self, + scheduler: Literal["lcm", "tcd", "dpm++ 2m", "uni_pc", "ddim", "euler"] = None, + sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = None, + ) -> None: + """ + Change the scheduler and/or sampler configuration at runtime. + + Parameters + ---------- + scheduler : str, optional + The scheduler type to use. If None, keeps current scheduler. + sampler : str, optional + The sampler type to use. If None, keeps current sampler. + """ + if scheduler is not None: + self.scheduler_type = scheduler + if sampler is not None: + self.sampler_type = sampler + + # Re-initialize scheduler with new configuration + self.scheduler = self._initialize_scheduler( + self.scheduler_type, + self.sampler_type, + self.pipe.scheduler.config + ) + + logger.info(f"Scheduler changed to {self.scheduler_type} with {self.sampler_type} sampler") + + + def _uses_lcm_logic(self) -> bool: + """Return True if scheduler uses consistency boundary-condition math (LCM/TCD).""" + try: + # Use isinstance checks for more reliable detection + return isinstance(self.scheduler, LCMScheduler) + except Exception: + return False + + def _warned_cfg_mode_fallback(self) -> bool: + return getattr(self, "_cfg_mode_warning_emitted", False) + + def _emit_cfg_mode_warning_once(self) -> None: + if not self._warned_cfg_mode_fallback(): + logger.warning( + "Non-LCM scheduler in use: falling back to standard CFG ('full') semantics. " + "Custom cfg_type values 'self'/'initialize' are ignored for correctness." + ) + setattr(self, "_cfg_mode_warning_emitted", True) def add_noise( self, @@ -453,19 +652,36 @@ def scheduler_step_batch( x_t_latent_batch: torch.Tensor, idx: Optional[int] = None, ) -> torch.Tensor: - # TODO: use t_list to select beta_prod_t_sqrt - if idx is None: - F_theta = ( - x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch - ) / self.alpha_prod_t_sqrt - denoised_batch = self.c_out * F_theta + self.c_skip * x_t_latent_batch + """ + Simplified scheduler integration that works with StreamDiffusion's architecture. + For now, we'll use a hybrid approach until we can properly refactor the pipeline. 
+ """ + # For LCM, use boundary condition scaling as before + if self._uses_lcm_logic(): + if idx is None: + F_theta = ( + x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch + ) / self.alpha_prod_t_sqrt + denoised_batch = self.c_out * F_theta + self.c_skip * x_t_latent_batch + else: + F_theta = ( + x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch + ) / self.alpha_prod_t_sqrt[idx] + denoised_batch = ( + self.c_out[idx] * F_theta + self.c_skip[idx] * x_t_latent_batch + ) else: - F_theta = ( - x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch - ) / self.alpha_prod_t_sqrt[idx] - denoised_batch = ( - self.c_out[idx] * F_theta + self.c_skip[idx] * x_t_latent_batch - ) + # For other schedulers, use simple epsilon denoising + # This is what works reliably with StreamDiffusion's current architecture + if idx is not None and idx < len(self.alpha_prod_t_sqrt): + denoised_batch = ( + x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch + ) / self.alpha_prod_t_sqrt[idx] + else: + # Fallback to first timestep if idx is out of bounds + denoised_batch = ( + x_t_latent_batch - self.beta_prod_t_sqrt[0] * model_pred_batch + ) / self.alpha_prod_t_sqrt[0] return denoised_batch @@ -475,6 +691,11 @@ def unet_step( t_list: Union[torch.Tensor, list[int]], idx: Optional[int] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Original StreamDiffusion UNet call that returns a denoised latent batch using + LCM math or a simplified epsilon inversion. For non-LCM schedulers we will + prefer the scheduler.step() path elsewhere; this function is kept for LCM. + """ if self.guidance_scale > 1.0 and (self.cfg_type == "initialize"): x_t_latent_plus_uc = torch.concat([x_t_latent[0:1], x_t_latent], dim=0) t_list = torch.concat([t_list[0:1], t_list], dim=0) @@ -635,6 +856,122 @@ def unet_step( return denoised_batch, model_pred + def _call_unet( + self, + sample: torch.Tensor, + timestep: torch.Tensor, + encoder_hidden_states: torch.Tensor, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + """Call the UNet, handling SDXL kwargs and TensorRT engine calling convention.""" + added_cond_kwargs = added_cond_kwargs or {} + if self.is_sdxl: + try: + # Detect TensorRT engine vs PyTorch UNet + is_tensorrt_engine = hasattr(self.unet, 'engine') and hasattr(self.unet, 'stream') + if is_tensorrt_engine: + out = self.unet( + sample, + timestep, + encoder_hidden_states, + **added_cond_kwargs, + )[0] + else: + out = self.unet( + sample=sample, + timestep=timestep, + encoder_hidden_states=encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + except Exception as e: + logger.error(f"[PIPELINE] _call_unet: SDXL UNet call failed: {e}") + import traceback + traceback.print_exc() + raise + else: + out = self.unet( + sample, + timestep, + encoder_hidden_states=encoder_hidden_states, + return_dict=False, + )[0] + return out + + def _unet_predict_noise_cfg( + self, + latent_model_input: torch.Tensor, + timestep: torch.Tensor, + cfg_mode: Literal["none", "full", "self", "initialize"], + ) -> torch.Tensor: + """ + Compute noise prediction from UNet with classifier-free guidance applied. + This function does not apply any scheduler math; it only returns the guided noise. + + For non-LCM schedulers, custom cfg_mode values 'self'/'initialize' are treated + as 'full' to ensure correctness with scheduler.step(). 
+ """ + effective_cfg = cfg_mode + if not self._uses_lcm_logic() and cfg_mode in ("self", "initialize"): + self._emit_cfg_mode_warning_once() + effective_cfg = "full" + + # Build latent batch for CFG + if self.guidance_scale > 1.0 and effective_cfg == "full": + latent_with_uc = torch.cat([latent_model_input, latent_model_input], dim=0) + elif self.guidance_scale > 1.0 and effective_cfg == "initialize": + # Keep initialize behavior for LCM only; if we reach here, LCM path + latent_with_uc = torch.cat([latent_model_input[0:1], latent_model_input], dim=0) + else: + latent_with_uc = latent_model_input + + # SDXL added conditioning replication to match batch + added_cond_kwargs: Dict[str, torch.Tensor] = {} + if self.is_sdxl and hasattr(self, 'add_text_embeds') and hasattr(self, 'add_time_ids'): + if self.add_text_embeds is not None and self.add_time_ids is not None: + batch_size = latent_with_uc.shape[0] + if self.guidance_scale > 1.0 and effective_cfg == "initialize": + add_text_embeds = torch.cat([ + self.add_text_embeds[0:1], + self.add_text_embeds[1:2].repeat(batch_size - 1, 1), + ], dim=0) + add_time_ids = torch.cat([ + self.add_time_ids[0:1], + self.add_time_ids[1:2].repeat(batch_size - 1, 1), + ], dim=0) + elif self.guidance_scale > 1.0 and effective_cfg == "full": + repeat_factor = batch_size // 2 + add_text_embeds = self.add_text_embeds.repeat(repeat_factor, 1) + add_time_ids = self.add_time_ids.repeat(repeat_factor, 1) + else: + add_text_embeds = ( + self.add_text_embeds[1:2].repeat(batch_size, 1) + if self.add_text_embeds.shape[0] > 1 + else self.add_text_embeds.repeat(batch_size, 1) + ) + add_time_ids = ( + self.add_time_ids[1:2].repeat(batch_size, 1) + if self.add_time_ids.shape[0] > 1 + else self.add_time_ids.repeat(batch_size, 1) + ) + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # Call UNet + model_pred = self._call_unet( + sample=latent_with_uc, + timestep=timestep, + encoder_hidden_states=self.prompt_embeds, + added_cond_kwargs=added_cond_kwargs, + ) + + # Apply CFG + if self.guidance_scale > 1.0 and effective_cfg == "full": + noise_pred_uncond, noise_pred_text = model_pred.chunk(2) + guided = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + return guided + else: + return model_pred + def encode_image(self, image_tensors: torch.Tensor) -> torch.Tensor: image_tensors = image_tensors.to( device=self.device, @@ -669,7 +1006,8 @@ def decode_image(self, x_0_pred_out: torch.Tensor) -> torch.Tensor: def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor: prev_latent_batch = self.x_t_latent_buffer - if self.use_denoising_batch: + # LCM supports our denoising-batch trick. 
Other schedulers should use step() sequentially + if self.use_denoising_batch and self._uses_lcm_logic(): t_list = self.sub_timesteps_tensor if self.denoising_steps_num > 1: @@ -697,24 +1035,36 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor: x_0_pred_out = x_0_pred_batch self.x_t_latent_buffer = None else: - self.init_noise = x_t_latent - for idx, t in enumerate(self.sub_timesteps_tensor): - t = t.view(1,).repeat(self.frame_bff_size,) - - x_0_pred, model_pred = self.unet_step(x_t_latent, t, idx) - - if idx < len(self.sub_timesteps_tensor) - 1: - if self.do_add_noise: - x_t_latent = self.alpha_prod_t_sqrt[ - idx + 1 - ] * x_0_pred + self.beta_prod_t_sqrt[ - idx + 1 - ] * torch.randn_like( - x_0_pred, device=self.device, dtype=self.dtype - ) - else: - x_t_latent = self.alpha_prod_t_sqrt[idx + 1] * x_0_pred - x_0_pred_out = x_0_pred + # Standard scheduler loop using scale_model_input + scheduler.step() + sample = x_t_latent + for idx, timestep in enumerate(self.sub_timesteps_tensor): + # Ensure timestep tensor on device with correct dtype + if not isinstance(timestep, torch.Tensor): + t = torch.tensor(timestep, device=self.device, dtype=torch.long) + else: + t = timestep.to(self.device) + + # Scale model input per scheduler requirements + model_input = ( + self.scheduler.scale_model_input(sample, t) + if hasattr(self.scheduler, "scale_model_input") + else sample + ) + + # Predict noise with CFG + noise_pred = self._unet_predict_noise_cfg( + latent_model_input=model_input, + timestep=t, + cfg_mode=self.cfg_type, + ) + + # Advance one step + step_out = self.scheduler.step(noise_pred, t, sample) + # diffusers returns a SchedulerOutput; prefer .prev_sample if present + sample = getattr(step_out, "prev_sample", step_out[0] if isinstance(step_out, (tuple, list)) else step_out) + + # After final step, sample approximates x0 latent + x_0_pred_out = sample return x_0_pred_out diff --git a/src/streamdiffusion/stream_parameter_updater.py b/src/streamdiffusion/stream_parameter_updater.py index 1e43fded..a1ab89a8 100644 --- a/src/streamdiffusion/stream_parameter_updater.py +++ b/src/streamdiffusion/stream_parameter_updater.py @@ -276,6 +276,7 @@ def update_stream_params( seed_list: Optional[List[Tuple[int, float]]] = None, seed_interpolation_method: Literal["linear", "slerp"] = "linear", normalize_seed_weights: Optional[bool] = None, + ipadapter_config: Optional[Dict[str, Any]] = None, ) -> None: """Update streaming parameters efficiently in a single call.""" @@ -676,7 +677,7 @@ def _recalculate_timestep_dependent_params(self, t_index_list: List[int]) -> Non c_skip_list = [] c_out_list = [] for timestep in self.stream.sub_timesteps: - c_skip, c_out = self.stream.scheduler.get_scalings_for_boundary_condition_discrete(timestep) + c_skip, c_out = self.stream._get_scheduler_scalings(timestep) c_skip_list.append(c_skip) c_out_list.append(c_out) diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 96f12a0b..747d5d1b 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -103,6 +103,9 @@ def __init__( build_engines_if_missing: bool = True, normalize_prompt_weights: bool = True, normalize_seed_weights: bool = True, + # Scheduler and sampler options + scheduler: Literal["lcm", "dpm++ 2m", "uni_pc", "ddim", "euler"] = "lcm", + sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = "normal", # ControlNet options use_controlnet: bool = False, controlnet_config: Optional[Union[Dict[str, Any], List[Dict[str, 
Any]]]] = None, @@ -182,6 +185,10 @@ def __init__( normalize_seed_weights : bool, optional Whether to normalize seed weights in blending to sum to 1, by default True. When False, weights > 1 will amplify noise. + scheduler : Literal["lcm", "dpm++ 2m", "uni_pc", "ddim", "euler"], optional + The scheduler type to use for denoising, by default "lcm". + sampler : Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"], optional + The sampler type to use for noise scheduling, by default "normal". use_controlnet : bool, optional Whether to enable ControlNet support, by default False. controlnet_config : Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional @@ -251,6 +258,8 @@ def __init__( build_engines_if_missing=build_engines_if_missing, normalize_prompt_weights=normalize_prompt_weights, normalize_seed_weights=normalize_seed_weights, + scheduler=scheduler, + sampler=sampler, use_controlnet=use_controlnet, controlnet_config=controlnet_config, enable_pytorch_fallback=enable_pytorch_fallback, @@ -499,6 +508,24 @@ def get_normalize_seed_weights(self) -> bool: """Get the current seed weight normalization setting.""" return self.stream.get_normalize_seed_weights() + def set_scheduler( + self, + scheduler: Literal["lcm", "dpm++ 2m", "uni_pc", "ddim", "euler"] = None, + sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = None, + ) -> None: + """ + Change the scheduler and/or sampler configuration at runtime. + + Parameters + ---------- + scheduler : str, optional + The scheduler type to use. If None, keeps current scheduler. + sampler : str, optional + The sampler type to use. If None, keeps current sampler. + """ + logger.info(f"Setting scheduler to {scheduler} and sampler to {sampler}") + self.stream.set_scheduler(scheduler=scheduler, sampler=sampler) + def __call__( self, image: Optional[Union[str, Image.Image, torch.Tensor]] = None, @@ -766,6 +793,8 @@ def _load_model( build_engines_if_missing: bool = True, normalize_prompt_weights: bool = True, normalize_seed_weights: bool = True, + scheduler: Literal["lcm", "dpm++ 2m", "uni_pc", "ddim", "euler"] = "lcm", + sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = "normal", use_controlnet: bool = False, controlnet_config: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, enable_pytorch_fallback: bool = False, @@ -935,6 +964,8 @@ def _load_model( cfg_type=cfg_type, normalize_prompt_weights=normalize_prompt_weights, normalize_seed_weights=normalize_seed_weights, + scheduler=scheduler, + sampler=sampler, ) if not self.sd_turbo: if use_lcm_lora: @@ -948,6 +979,7 @@ def _load_model( if lora_dict is not None: for lora_name, lora_scale in lora_dict.items(): + logger.info(f"_load_model: Loading LoRA '{lora_name}' with scale {lora_scale}") stream.load_lora(lora_name) stream.fuse_lora(lora_scale=lora_scale) From 44077245be8aa6954a570e238c40245a486e2281 Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Tue, 12 Aug 2025 15:32:21 -0400 Subject: [PATCH 02/16] Added lora signature to engine name. 
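Engine cache paths now encode which LoRAs are fused, so switching LoRA sets no longer reuses a stale TensorRT engine. A minimal sketch of the naming scheme, mirroring the _lora_signature helper added in this patch (the example file names, weights, and resulting hash fragment are illustrative only, not taken from a real run):

    import hashlib
    from pathlib import Path

    def lora_signature(lora_dict: dict) -> str:
        # Canonical, order-independent "basename:weight" pairs keep the hash stable across runs.
        parts = [
            f"{Path(str(path)).name}:{weight}"
            for path, weight in sorted(lora_dict.items(), key=lambda x: str(x[0]))
        ]
        digest = hashlib.sha1("|".join(parts).encode("utf-8")).hexdigest()[:10]
        return f"{len(lora_dict)}-{digest}"

    # e.g. {"detail_tweaker.safetensors": 0.8, "pixel_art.safetensors": 0.5}
    # yields a fragment like "2-ab12cd34ef", which get_engine_path appends to the
    # engine prefix as "--lora-2-ab12cd34ef".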
--- .../acceleration/tensorrt/engine_manager.py | 24 ++++++++++++++++++- src/streamdiffusion/pipeline.py | 1 + src/streamdiffusion/wrapper.py | 4 +++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py index 8649e303..e9d0ed4b 100644 --- a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py +++ b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py @@ -1,4 +1,6 @@ import os +import re +import hashlib from enum import Enum from typing import Any, Optional, Dict from pathlib import Path @@ -67,7 +69,22 @@ def __init__(self, engine_dir: str): ) } } - + + def _lora_signature(self, lora_dict: Dict[str, float]) -> str: + """Create a short, stable signature for a set of LoRAs. + + Uses sorted basenames and weights, hashed to a short hex to avoid + long/invalid paths while keeping cache keys stable across runs. + """ + # Build canonical string of basename:weight pairs + parts = [] + for path, weight in sorted(lora_dict.items(), key=lambda x: str(x[0])): + base = Path(str(path)).name # basename only + parts.append(f"{base}:{weight}") + canon = "|".join(parts) + h = hashlib.sha1(canon.encode("utf-8")).hexdigest()[:10] + return f"{len(lora_dict)}-{h}" + def get_engine_path(self, engine_type: EngineType, model_id_or_path: str, @@ -76,6 +93,7 @@ def get_engine_path(self, mode: str, use_lcm_lora: bool, use_tiny_vae: bool, + lora_dict: Optional[Dict[str, float]] = None, ipadapter_scale: Optional[float] = None, ipadapter_tokens: Optional[int] = None, controlnet_model_id: Optional[str] = None) -> Path: @@ -111,6 +129,10 @@ def get_engine_path(self, prefix += f"--ipa{ipadapter_scale}" if ipadapter_tokens is not None: prefix += f"--tokens{ipadapter_tokens}" + + # Fused Loras - use concise hashed signature to avoid long/invalid paths + if lora_dict is not None and len(lora_dict) > 0: + prefix += f"--lora-{self._lora_signature(lora_dict)}" prefix += f"--mode-{mode}" diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py index 4126784f..918cc6dd 100644 --- a/src/streamdiffusion/pipeline.py +++ b/src/streamdiffusion/pipeline.py @@ -39,6 +39,7 @@ def __init__( use_denoising_batch: bool = True, frame_buffer_size: int = 1, cfg_type: Literal["none", "full", "self", "initialize"] = "self", + lora_dict: Optional[Dict[str, float]] = None, normalize_prompt_weights: bool = True, normalize_seed_weights: bool = True, scheduler: Literal["lcm", "tcd", "dpm++ 2m", "uni_pc", "ddim", "euler"] = "lcm", diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 445ad9ae..9f294703 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -979,6 +979,7 @@ def _load_model( frame_buffer_size=self.frame_buffer_size, use_denoising_batch=self.use_denoising_batch, cfg_type=cfg_type, + lora_dict=lora_dict, # We pass this to include loras in engine path names normalize_prompt_weights=normalize_prompt_weights, normalize_seed_weights=normalize_seed_weights, scheduler=scheduler, @@ -1154,7 +1155,8 @@ def _load_model( use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, ipadapter_scale=ipadapter_scale, - ipadapter_tokens=ipadapter_tokens + ipadapter_tokens=ipadapter_tokens, + lora_dict=lora_dict ) vae_encoder_path = engine_manager.get_engine_path( EngineType.VAE_ENCODER, From f79a59cd6b7cefcc356f33df1bc5772d40e381fa Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Sat, 23 Aug 2025 14:51:42 -0400 Subject: [PATCH 03/16] 
Clean up of scheduler/samplers that weren't working, fix to controlnets and SDXL. --- .../acceleration/tensorrt/utilities.py | 23 ++ src/streamdiffusion/pipeline.py | 365 ++++++------------ 2 files changed, 136 insertions(+), 252 deletions(-) diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py index 2714d2ca..4dd95120 100644 --- a/src/streamdiffusion/acceleration/tensorrt/utilities.py +++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py @@ -331,6 +331,29 @@ def _can_reuse_buffers(self, shape_dict=None, device="cuda"): return True def infer(self, feed_dict, stream, use_cuda_graph=False): + # Filter inputs to only those the engine actually exposes to avoid binding errors + try: + allowed_inputs = set() + for idx in range(self.engine.num_io_tensors): + name = self.engine.get_tensor_name(idx) + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + allowed_inputs.add(name) + + # Drop any extra keys (e.g., text_embeds/time_ids) that the engine was not built to accept + if allowed_inputs: + filtered_feed_dict = {k: v for k, v in feed_dict.items() if k in allowed_inputs} + if len(filtered_feed_dict) != len(feed_dict): + missing = [k for k in feed_dict.keys() if k not in allowed_inputs] + if missing: + logger.debug( + "TensorRT Engine: filtering unsupported inputs %s (allowed=%s)", + missing, sorted(list(allowed_inputs)) + ) + feed_dict = filtered_feed_dict + except Exception: + # Be permissive if engine query fails; proceed with original dict + pass + for name, buf in feed_dict.items(): self.tensors[name].copy_(buf) diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py index 6ac81cf1..62c77122 100644 --- a/src/streamdiffusion/pipeline.py +++ b/src/streamdiffusion/pipeline.py @@ -4,15 +4,7 @@ import numpy as np import PIL.Image import torch -from diffusers import ( - LCMScheduler, - StableDiffusionPipeline, - DPMSolverMultistepScheduler, - UniPCMultistepScheduler, - DDIMScheduler, - EulerDiscreteScheduler, - TCDScheduler, -) +from diffusers import LCMScheduler, TCDScheduler, StableDiffusionPipeline from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import ( retrieve_latents, @@ -43,7 +35,7 @@ def __init__( lora_dict: Optional[Dict[str, float]] = None, normalize_prompt_weights: bool = True, normalize_seed_weights: bool = True, - scheduler: Literal["lcm", "tcd", "dpm++ 2m", "uni_pc", "ddim", "euler"] = "lcm", + scheduler: Literal["lcm", "tcd"] = "lcm", sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = "normal", ) -> None: self.device = pipe.device @@ -70,7 +62,15 @@ def __init__( self.is_turbo = detection_result['is_turbo'] self.detection_confidence = detection_result['confidence'] - if use_denoising_batch: + # TCD scheduler is incompatible with denoising batch optimization due to Strategic Stochastic Sampling + # Force sequential processing for TCD + if scheduler == "tcd": + logger.info("TCD scheduler detected: Disabling denoising batch optimization for compatibility") + self.use_denoising_batch = False + self.batch_size = frame_buffer_size + self.trt_unet_batch_size = frame_buffer_size + elif use_denoising_batch: + self.use_denoising_batch = True self.batch_size = self.denoising_steps_num * frame_buffer_size if self.cfg_type == "initialize": self.trt_unet_batch_size = ( @@ -83,13 +83,12 @@ def __init__( else: self.trt_unet_batch_size = self.denoising_steps_num * 
frame_buffer_size else: + self.use_denoising_batch = False self.trt_unet_batch_size = self.frame_bff_size self.batch_size = frame_buffer_size self.t_list = t_index_list - self.do_add_noise = do_add_noise - self.use_denoising_batch = use_denoising_batch self.similar_image_filter = False self.similar_filter = SimilarImageFilter() @@ -97,8 +96,6 @@ def __init__( self.pipe = pipe self.image_processor = VaeImageProcessor(pipe.vae_scale_factor) - - # Initialize scheduler based on configuration self.scheduler = self._initialize_scheduler(scheduler, sampler, pipe.scheduler.config) self.text_encoder = pipe.text_encoder @@ -111,7 +108,6 @@ def __init__( if self.is_sdxl: self.add_text_embeds = None self.add_time_ids = None - logger.log(logging.INFO, f"[PIPELINE] SDXL Detected: Using {scheduler} scheduler with {sampler} sampler") # Initialize parameter updater self._param_updater = StreamParameterUpdater(self, normalize_prompt_weights, normalize_seed_weights) @@ -131,7 +127,29 @@ def __init__( self._cached_batch_size: Optional[int] = None self._cached_cfg_type: Optional[str] = None self._cached_guidance_scale: Optional[float] = None + + def _initialize_scheduler(self, scheduler_type: str, sampler_type: str, config): + """Initialize scheduler based on type and sampler configuration.""" + # Map sampler types to configuration parameters + sampler_config = { + "simple": {"timestep_spacing": "linspace"}, + "sgm uniform": {"timestep_spacing": "trailing"}, + "normal": {}, # Default configuration + "ddim": {"timestep_spacing": "leading"}, + "beta": {"beta_schedule": "scaled_linear"}, + "karras": {}, # Karras sigmas handled per scheduler + } + + # Get sampler-specific configuration + sampler_params = sampler_config.get(sampler_type, {}) + if scheduler_type == "lcm": + return LCMScheduler.from_config(config, **sampler_params) + elif scheduler_type == "tcd": + return TCDScheduler.from_config(config, **sampler_params) + else: + logger.warning(f"Unknown scheduler type '{scheduler_type}', falling back to LCM") + return LCMScheduler.from_config(config, **sampler_params) def _check_unet_tensorrt(self) -> bool: """Cache TensorRT detection to avoid repeated hasattr calls""" @@ -196,69 +214,7 @@ def _build_sdxl_conditioning(self, batch_size: int) -> Dict[str, torch.Tensor]: 'time_ids': add_time_ids } - def _initialize_scheduler(self, scheduler_type: str, sampler_type: str, config): - """Initialize scheduler based on type and sampler configuration.""" - # Map sampler types to configuration parameters - sampler_config = { - "simple": {"timestep_spacing": "linspace"}, - "sgm uniform": {"timestep_spacing": "trailing"}, # SGM Uniform is typically trailing - "normal": {}, # Default configuration - "ddim": {"timestep_spacing": "leading"}, # DDIM default per documentation - "beta": {"beta_schedule": "scaled_linear"}, - "karras": {}, # Karras sigmas will be enabled in scheduler-specific code - } - - # Get sampler-specific configuration - sampler_params = sampler_config.get(sampler_type, {}) - print(f"Sampler params: {sampler_params}") - print(f"Scheduler type: {scheduler_type}") - - # Create scheduler based on type - if scheduler_type == "lcm": - return LCMScheduler.from_config(config, **sampler_params) - elif scheduler_type == "tcd": - return TCDScheduler.from_config(config, **sampler_params) - elif scheduler_type == "dpm++ 2m": - # DPM++ 2M typically uses solver_order=2 and algorithm_type="dpmsolver++" - return DPMSolverMultistepScheduler.from_config( - config, - solver_order=2, - algorithm_type="dpmsolver++", - 
use_karras_sigmas=(sampler_type == "karras"), # Enable Karras sigmas if requested - **sampler_params - ) - elif scheduler_type == "uni_pc": - # UniPC: solver_order=2 for guided sampling, solver_type="bh2" by default - return UniPCMultistepScheduler.from_config( - config, - solver_order=2, # Good default for guided sampling - solver_type="bh2", # Default from documentation - disable_corrector=[], # No corrector disabled by default - use_karras_sigmas=(sampler_type == "karras"), # Enable Karras sigmas if requested - **sampler_params - ) - elif scheduler_type == "ddim": - # DDIM defaults to leading timestep spacing, but trailing can be better - return DDIMScheduler.from_config( - config, - set_alpha_to_one=True, # Default per documentation - steps_offset=0, # Default per documentation - prediction_type="epsilon", # Default per documentation - **sampler_params - ) - elif scheduler_type == "euler": - # Euler can use Karras sigmas for improved quality - return EulerDiscreteScheduler.from_config( - config, - use_karras_sigmas=(sampler_type == "karras"), # Enable Karras sigmas if requested - prediction_type="epsilon", # Default per documentation - **sampler_params - ) - else: - # Default to LCM - logger.warning(f"Unknown scheduler type '{scheduler_type}', falling back to LCM") - return LCMScheduler.from_config(config, **sampler_params) def load_lcm_lora( self, @@ -495,42 +451,12 @@ def prepare( # make sub timesteps list based on the indices in the t_list list and the values in the timesteps list self.sub_timesteps = [] - max_timestep_index = len(self.timesteps) - 1 - for t in self.t_list: - # Clamp t_index to valid range to prevent index out of bounds - if t > max_timestep_index: - logger.warning(f"t_index {t} is out of bounds for scheduler with {len(self.timesteps)} timesteps. Clamping to {max_timestep_index}") - t = max_timestep_index - elif t < 0: - logger.warning(f"t_index {t} is negative. 
Clamping to 0") - t = 0 - - timestep_value = self.timesteps[t] - # Convert tensor timesteps to scalar values for indexing operations - if isinstance(timestep_value, torch.Tensor): - timestep_scalar = timestep_value.cpu().item() - else: - timestep_scalar = timestep_value - self.sub_timesteps.append(timestep_scalar) - - # Create tensor version for UNet calls - # Handle both integer and floating-point timesteps from different schedulers - # Some schedulers like Euler may return floating-point timesteps - if len(self.sub_timesteps) > 0: - # Always create the tensor from scalar values to avoid device issues - try: - # Try integer first for compatibility - sub_timesteps_tensor = torch.tensor( - self.sub_timesteps, dtype=torch.long, device=self.device - ) - except (TypeError, ValueError): - # Fallback for floating-point values - sub_timesteps_tensor = torch.tensor( - self.sub_timesteps, dtype=torch.float32, device=self.device - ) - else: - sub_timesteps_tensor = torch.tensor([], dtype=torch.long, device=self.device) + self.sub_timesteps.append(self.timesteps[t]) + + sub_timesteps_tensor = torch.tensor( + self.sub_timesteps, dtype=torch.long, device=self.device + ) self.sub_timesteps_tensor = torch.repeat_interleave( sub_timesteps_tensor, repeats=self.frame_bff_size if self.use_denoising_batch else 1, @@ -566,25 +492,8 @@ def prepare( alpha_prod_t_sqrt_list = [] beta_prod_t_sqrt_list = [] for timestep in self.sub_timesteps: - # Convert floating-point timesteps to integers for tensor indexing - if isinstance(timestep, float): - timestep_idx = int(round(timestep)) - else: - timestep_idx = timestep - - # Ensure timestep_idx is within bounds - max_idx = len(self.scheduler.alphas_cumprod) - 1 - if timestep_idx > max_idx: - logger.warning(f"Timestep index {timestep_idx} out of bounds for alphas_cumprod (max: {max_idx}). Clamping to {max_idx}") - timestep_idx = max_idx - elif timestep_idx < 0: - logger.warning(f"Timestep index {timestep_idx} is negative. Clamping to 0") - timestep_idx = 0 - - # Access scheduler tensors and move to device as needed - alpha_cumprod = self.scheduler.alphas_cumprod[timestep_idx].to(device=self.device, dtype=self.dtype) - alpha_prod_t_sqrt = alpha_cumprod.sqrt() - beta_prod_t_sqrt = (1 - alpha_cumprod).sqrt() + alpha_prod_t_sqrt = self.scheduler.alphas_cumprod[timestep].sqrt() + beta_prod_t_sqrt = (1 - self.scheduler.alphas_cumprod[timestep]).sqrt() alpha_prod_t_sqrt_list.append(alpha_prod_t_sqrt) beta_prod_t_sqrt_list.append(beta_prod_t_sqrt) alpha_prod_t_sqrt = ( @@ -610,8 +519,9 @@ def prepare( #NOTE: this is a hack. Pipeline needs a major refactor along with stream parameter updater. self.update_prompt(prompt) - # Only collapse tensors to a single element for non-batched LCM path. - if (not self.use_denoising_batch) and self._uses_lcm_logic(): + # Only collapse tensors to scalars for LCM non-batched mode + # TCD needs to keep tensor dimensions for iteration + if not self.use_denoising_batch and isinstance(self.scheduler, LCMScheduler): self.sub_timesteps_tensor = self.sub_timesteps_tensor[0] self.alpha_prod_t_sqrt = self.alpha_prod_t_sqrt[0] self.beta_prod_t_sqrt = self.beta_prod_t_sqrt[0] @@ -621,26 +531,14 @@ def prepare( self.c_out = self.c_out.to(self.device) def _get_scheduler_scalings(self, timestep): - """ - Get LCM-specific scaling factors for boundary conditions. - Only used for LCMScheduler - other schedulers handle scaling in their step() method. 
- """ + """Get LCM/TCD-specific scaling factors for boundary conditions.""" if isinstance(self.scheduler, LCMScheduler): c_skip, c_out = self.scheduler.get_scalings_for_boundary_condition_discrete(timestep) - # Ensure returned values are tensors on the correct device - if not isinstance(c_skip, torch.Tensor): - c_skip = torch.tensor(c_skip, device=self.device, dtype=self.dtype) - else: - c_skip = c_skip.to(device=self.device, dtype=self.dtype) - if not isinstance(c_out, torch.Tensor): - c_out = torch.tensor(c_out, device=self.device, dtype=self.dtype) - else: - c_out = c_out.to(device=self.device, dtype=self.dtype) return c_skip, c_out else: - # For non-LCM schedulers, we don't use boundary condition scaling - # Their step() method handles all the necessary scaling internally - logger.debug(f"Scheduler {type(self.scheduler)} doesn't use boundary condition scaling") + # TCD and other schedulers don't use boundary condition scaling like LCM + # They handle scaling internally in their step() method + # Return tensors that are compatible with torch.stack() c_skip = torch.tensor(1.0, device=self.device, dtype=self.dtype) c_out = torch.tensor(1.0, device=self.device, dtype=self.dtype) return c_skip, c_out @@ -664,18 +562,18 @@ def get_normalize_seed_weights(self) -> bool: """Get the current seed weight normalization setting.""" return self._param_updater.get_normalize_seed_weights() - def set_scheduler( - self, - scheduler: Literal["lcm", "tcd", "dpm++ 2m", "uni_pc", "ddim", "euler"] = None, - sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = None, - ) -> None: + + + + + def set_scheduler(self, scheduler: Literal["lcm", "tcd"] = None, sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = None) -> None: """ - Change the scheduler and/or sampler configuration at runtime. + Change the scheduler and/or sampler at runtime. Parameters ---------- scheduler : str, optional - The scheduler type to use. If None, keeps current scheduler. + The scheduler type to use ("lcm" or "tcd"). If None, keeps current scheduler. sampler : str, optional The sampler type to use. If None, keeps current sampler. """ @@ -684,35 +582,14 @@ def set_scheduler( if sampler is not None: self.sampler_type = sampler - # Re-initialize scheduler with new configuration - self.scheduler = self._initialize_scheduler( - self.scheduler_type, - self.sampler_type, - self.pipe.scheduler.config - ) - + self.scheduler = self._initialize_scheduler(self.scheduler_type, self.sampler_type, self.pipe.scheduler.config) logger.info(f"Scheduler changed to {self.scheduler_type} with {self.sampler_type} sampler") + def _uses_lcm_logic(self) -> bool: + """Return True if scheduler uses LCM-style consistency boundary-condition math.""" + return isinstance(self.scheduler, LCMScheduler) - def _uses_lcm_logic(self) -> bool: - """Return True if scheduler uses consistency boundary-condition math (LCM/TCD).""" - try: - # Use isinstance checks for more reliable detection - return isinstance(self.scheduler, LCMScheduler) - except Exception: - return False - - def _warned_cfg_mode_fallback(self) -> bool: - return getattr(self, "_cfg_mode_warning_emitted", False) - - def _emit_cfg_mode_warning_once(self) -> None: - if not self._warned_cfg_mode_fallback(): - logger.warning( - "Non-LCM scheduler in use: falling back to standard CFG ('full') semantics. " - "Custom cfg_type values 'self'/'initialize' are ignored for correctness." 
- ) - setattr(self, "_cfg_mode_warning_emitted", True) def add_noise( self, @@ -732,37 +609,18 @@ def scheduler_step_batch( x_t_latent_batch: torch.Tensor, idx: Optional[int] = None, ) -> torch.Tensor: - """ - Simplified scheduler integration that works with StreamDiffusion's architecture. - For now, we'll use a hybrid approach until we can properly refactor the pipeline. - """ - # For LCM, use boundary condition scaling as before - if self._uses_lcm_logic(): - if idx is None: - F_theta = ( - x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch - ) / self.alpha_prod_t_sqrt - denoised_batch = self.c_out * F_theta + self.c_skip * x_t_latent_batch - else: - F_theta = ( - x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch - ) / self.alpha_prod_t_sqrt[idx] - denoised_batch = ( - self.c_out[idx] * F_theta + self.c_skip[idx] * x_t_latent_batch - ) + if idx is None: + F_theta = ( + x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch + ) / self.alpha_prod_t_sqrt + denoised_batch = self.c_out * F_theta + self.c_skip * x_t_latent_batch else: - # For other schedulers, use simple epsilon denoising - # This is what works reliably with StreamDiffusion's current architecture - if idx is not None and idx < len(self.alpha_prod_t_sqrt): - denoised_batch = ( - x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch - ) / self.alpha_prod_t_sqrt[idx] - else: - # Fallback to first timestep if idx is out of bounds - denoised_batch = ( - x_t_latent_batch - self.beta_prod_t_sqrt[0] * model_pred_batch - ) / self.alpha_prod_t_sqrt[0] - + F_theta = ( + x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch + ) / self.alpha_prod_t_sqrt[idx] + denoised_batch = ( + self.c_out[idx] * F_theta + self.c_skip[idx] * x_t_latent_batch + ) return denoised_batch def unet_step( @@ -1039,20 +897,11 @@ def _unet_predict_noise_cfg( """ Compute noise prediction from UNet with classifier-free guidance applied. This function does not apply any scheduler math; it only returns the guided noise. - - For non-LCM schedulers, custom cfg_mode values 'self'/'initialize' are treated - as 'full' to ensure correctness with scheduler.step(). 
""" - effective_cfg = cfg_mode - if not self._uses_lcm_logic() and cfg_mode in ("self", "initialize"): - self._emit_cfg_mode_warning_once() - effective_cfg = "full" - # Build latent batch for CFG - if self.guidance_scale > 1.0 and effective_cfg == "full": + if self.guidance_scale > 1.0 and cfg_mode == "full": latent_with_uc = torch.cat([latent_model_input, latent_model_input], dim=0) - elif self.guidance_scale > 1.0 and effective_cfg == "initialize": - # Keep initialize behavior for LCM only; if we reach here, LCM path + elif self.guidance_scale > 1.0 and cfg_mode == "initialize": latent_with_uc = torch.cat([latent_model_input[0:1], latent_model_input], dim=0) else: latent_with_uc = latent_model_input @@ -1062,7 +911,7 @@ def _unet_predict_noise_cfg( if self.is_sdxl and hasattr(self, 'add_text_embeds') and hasattr(self, 'add_time_ids'): if self.add_text_embeds is not None and self.add_time_ids is not None: batch_size = latent_with_uc.shape[0] - if self.guidance_scale > 1.0 and effective_cfg == "initialize": + if self.guidance_scale > 1.0 and cfg_mode == "initialize": add_text_embeds = torch.cat([ self.add_text_embeds[0:1], self.add_text_embeds[1:2].repeat(batch_size - 1, 1), @@ -1071,7 +920,7 @@ def _unet_predict_noise_cfg( self.add_time_ids[0:1], self.add_time_ids[1:2].repeat(batch_size - 1, 1), ], dim=0) - elif self.guidance_scale > 1.0 and effective_cfg == "full": + elif self.guidance_scale > 1.0 and cfg_mode == "full": repeat_factor = batch_size // 2 add_text_embeds = self.add_text_embeds.repeat(repeat_factor, 1) add_time_ids = self.add_time_ids.repeat(repeat_factor, 1) @@ -1097,7 +946,7 @@ def _unet_predict_noise_cfg( ) # Apply CFG - if self.guidance_scale > 1.0 and effective_cfg == "full": + if self.guidance_scale > 1.0 and cfg_mode == "full": noise_pred_uncond, noise_pred_text = model_pred.chunk(2) guided = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) return guided @@ -1128,23 +977,19 @@ def decode_image(self, x_0_pred_out: torch.Tensor) -> torch.Tensor: def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor: prev_latent_batch = self.x_t_latent_buffer - - # LCM supports our denoising-batch trick. Other schedulers should use step() sequentially - if self.use_denoising_batch and self._uses_lcm_logic(): + + # LCM supports our denoising-batch trick. 
TCD must use standard scheduler.step() sequentially + if self.use_denoising_batch and isinstance(self.scheduler, LCMScheduler): t_list = self.sub_timesteps_tensor - if self.denoising_steps_num > 1: x_t_latent = torch.cat((x_t_latent, prev_latent_batch), dim=0) - self.stock_noise = torch.cat( (self.init_noise[0:1], self.stock_noise[:-1]), dim=0 ) - x_0_pred_batch, model_pred = self.unet_step(x_t_latent, t_list) if self.denoising_steps_num > 1: x_0_pred_out = x_0_pred_batch[-1].unsqueeze(0) - if self.do_add_noise: self.x_t_latent_buffer = ( self.alpha_prod_t_sqrt[1:] * x_0_pred_batch[:-1] @@ -1158,7 +1003,7 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor: x_0_pred_out = x_0_pred_batch self.x_t_latent_buffer = None else: - # Standard scheduler loop using scale_model_input + scheduler.step() + # Standard scheduler loop for TCD and non-batched LCM sample = x_t_latent for idx, timestep in enumerate(self.sub_timesteps_tensor): # Ensure timestep tensor on device with correct dtype @@ -1167,28 +1012,44 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor: else: t = timestep.to(self.device) - # Scale model input per scheduler requirements - model_input = ( - self.scheduler.scale_model_input(sample, t) - if hasattr(self.scheduler, "scale_model_input") - else sample - ) + # For TCD, use the scheduler's step method + if isinstance(self.scheduler, TCDScheduler): + # Scale model input per scheduler requirements + model_input = ( + self.scheduler.scale_model_input(sample, t) + if hasattr(self.scheduler, "scale_model_input") + else sample + ) - # Predict noise with CFG - noise_pred = self._unet_predict_noise_cfg( - latent_model_input=model_input, - timestep=t, - cfg_mode=self.cfg_type, - ) + # Predict noise with CFG + noise_pred = self._unet_predict_noise_cfg( + latent_model_input=model_input, + timestep=t, + cfg_mode=self.cfg_type, + ) - # Advance one step - step_out = self.scheduler.step(noise_pred, t, sample) - # diffusers returns a SchedulerOutput; prefer .prev_sample if present - sample = getattr(step_out, "prev_sample", step_out[0] if isinstance(step_out, (tuple, list)) else step_out) + # Advance one step using TCD's step method + step_out = self.scheduler.step(noise_pred, t, sample) + sample = getattr(step_out, "prev_sample", step_out[0] if isinstance(step_out, (tuple, list)) else step_out) + else: + # Original LCM logic for non-batched mode + t = t.view(1,).repeat(self.frame_bff_size,) + x_0_pred, model_pred = self.unet_step(sample, t, idx) + if idx < len(self.sub_timesteps_tensor) - 1: + if self.do_add_noise: + sample = self.alpha_prod_t_sqrt[ + idx + 1 + ] * x_0_pred + self.beta_prod_t_sqrt[ + idx + 1 + ] * torch.randn_like( + x_0_pred, device=self.device, dtype=self.dtype + ) + else: + sample = self.alpha_prod_t_sqrt[idx + 1] * x_0_pred + else: + sample = x_0_pred - # After final step, sample approximates x0 latent x_0_pred_out = sample - return x_0_pred_out @torch.no_grad() From 977afb1efdea8373dce7039a74ac9257333bfffc Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Mon, 25 Aug 2025 15:20:38 -0400 Subject: [PATCH 04/16] Fix to lora engine setup, changed requirements in realtime-img2img for windows support. 
--- demo/realtime-img2img/requirements.txt | 8 ++++---- src/streamdiffusion/wrapper.py | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/demo/realtime-img2img/requirements.txt b/demo/realtime-img2img/requirements.txt index a379a58e..dd200a25 100644 --- a/demo/realtime-img2img/requirements.txt +++ b/demo/realtime-img2img/requirements.txt @@ -1,11 +1,11 @@ diffusers==0.35.0 -transformers==4.56.0 -peft==0.18.0 +transformers==4.55.4 +peft==0.17.1 accelerate==1.10.0 -huggingface_hub==0.35.0 +huggingface_hub==0.34.4 fastapi==0.115.0 uvicorn[standard]==0.32.0 -Pillow==10.5.0 +Pillow==10.4.0 compel==2.0.2 controlnet-aux==0.0.7 xformers; sys_platform != 'darwin' or platform_machine != 'arm64' diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 5ff8a374..51b7fea1 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -1113,6 +1113,7 @@ def _load_model( mode=self.mode, use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, + lora_dict=lora_dict, ipadapter_scale=ipadapter_scale, ipadapter_tokens=ipadapter_tokens, is_faceid=is_faceid if use_ipadapter_trt else None @@ -1125,6 +1126,7 @@ def _load_model( mode=self.mode, use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, + lora_dict=lora_dict, ipadapter_scale=ipadapter_scale, ipadapter_tokens=ipadapter_tokens, is_faceid=is_faceid if use_ipadapter_trt else None @@ -1137,6 +1139,7 @@ def _load_model( mode=self.mode, use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, + lora_dict=lora_dict, ipadapter_scale=ipadapter_scale, ipadapter_tokens=ipadapter_tokens, is_faceid=is_faceid if use_ipadapter_trt else None From 0044a9b89a2e2e1663572ea848a438d8a3fd518d Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Mon, 25 Aug 2025 19:35:34 -0400 Subject: [PATCH 05/16] ControlNet TCD. --- src/streamdiffusion/pipeline.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py index 62c77122..d15beb58 100644 --- a/src/streamdiffusion/pipeline.py +++ b/src/streamdiffusion/pipeline.py @@ -66,6 +66,7 @@ def __init__( # Force sequential processing for TCD if scheduler == "tcd": logger.info("TCD scheduler detected: Disabling denoising batch optimization for compatibility") + logger.info("TCD now supports ControlNet through proper hook processing") self.use_denoising_batch = False self.batch_size = frame_buffer_size self.trt_unet_batch_size = frame_buffer_size @@ -979,6 +980,7 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor: prev_latent_batch = self.x_t_latent_buffer # LCM supports our denoising-batch trick. 
TCD must use standard scheduler.step() sequentially + # but now properly processes ControlNet hooks through unet_step() if self.use_denoising_batch and isinstance(self.scheduler, LCMScheduler): t_list = self.sub_timesteps_tensor if self.denoising_steps_num > 1: @@ -1012,24 +1014,14 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor: else: t = timestep.to(self.device) - # For TCD, use the scheduler's step method + # For TCD, use the same UNet calling logic as LCM to ensure ControlNet hooks are processed if isinstance(self.scheduler, TCDScheduler): - # Scale model input per scheduler requirements - model_input = ( - self.scheduler.scale_model_input(sample, t) - if hasattr(self.scheduler, "scale_model_input") - else sample - ) - - # Predict noise with CFG - noise_pred = self._unet_predict_noise_cfg( - latent_model_input=model_input, - timestep=t, - cfg_mode=self.cfg_type, - ) - - # Advance one step using TCD's step method - step_out = self.scheduler.step(noise_pred, t, sample) + # Use unet_step to process ControlNet hooks and get proper noise prediction + t_expanded = t.view(1,).repeat(self.frame_bff_size,) + x_0_pred, model_pred = self.unet_step(sample, t_expanded, idx) + + # Apply TCD scheduler step to the guided noise prediction + step_out = self.scheduler.step(model_pred, t, sample) sample = getattr(step_out, "prev_sample", step_out[0] if isinstance(step_out, (tuple, list)) else step_out) else: # Original LCM logic for non-batched mode From 41e51229279e258077fbb2c1bd98a4d9a4eeef24 Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Tue, 9 Sep 2025 20:30:56 -0400 Subject: [PATCH 06/16] At uvicorn quiet param to help debug issues without unncessary logging. --- demo/realtime-img2img/config.py | 9 +++++++++ demo/realtime-img2img/main.py | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/demo/realtime-img2img/config.py b/demo/realtime-img2img/config.py index 8d74eda4..56d77404 100644 --- a/demo/realtime-img2img/config.py +++ b/demo/realtime-img2img/config.py @@ -20,6 +20,7 @@ class Args(NamedTuple): controlnet_config: str api_only: bool log_level: str + quiet: bool def pretty_print(self): print("\n") @@ -34,6 +35,7 @@ def pretty_print(self): ENGINE_DIR = os.environ.get("ENGINE_DIR", "engines") ACCELERATION = os.environ.get("ACCELERATION", "xformers") LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") +QUIET = os.environ.get("QUIET", "False").lower() in ("true", "1", "yes", "on") default_host = os.getenv("HOST", "0.0.0.0") default_port = int(os.getenv("PORT", "7860")) @@ -129,5 +131,12 @@ def pretty_print(self): choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", ) +parser.add_argument( + "--quiet", + dest="quiet", + action="store_true", + default=QUIET, + help="Suppress uvicorn INFO messages (server access logs, etc.)", +) config = Args(**vars(parser.parse_args())) config.pretty_print() diff --git a/demo/realtime-img2img/main.py b/demo/realtime-img2img/main.py index 4ede7a30..0c2ca2e4 100644 --- a/demo/realtime-img2img/main.py +++ b/demo/realtime-img2img/main.py @@ -97,6 +97,13 @@ def setup_logging(log_level: str = "INFO"): # Initialize logger logger = setup_logging(config.log_level) +# Suppress uvicorn INFO messages +if config.quiet: + uvicorn_logger = logging.getLogger('uvicorn') + uvicorn_logger.setLevel(logging.WARNING) + uvicorn_access_logger = logging.getLogger('uvicorn.access') + uvicorn_access_logger.setLevel(logging.WARNING) + class App: def __init__(self, config: Args): From 
b04f0e8c7804e9649cf60ab6056fa1f9b7dd65f9 Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Thu, 11 Sep 2025 11:16:54 -0400 Subject: [PATCH 07/16] Fix to LoRA and IPAdapter conflict. --- src/streamdiffusion/wrapper.py | 220 +++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 81 deletions(-) diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 2612da3f..7dbf923d 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -7,7 +7,8 @@ from PIL import Image import torchvision.transforms as T from torchvision.transforms import InterpolationMode -from diffusers import AutoencoderTiny, StableDiffusionPipeline, StableDiffusionXLPipeline, AutoPipelineForText2Image +from diffusers import AutoencoderTiny, StableDiffusionPipeline, StableDiffusionXLPipeline, AutoPipelineForText2Image, UNet2DConditionModel +from safetensors.torch import load_file from .pipeline import StreamDiffusion from .model_detection import detect_model @@ -1011,21 +1012,78 @@ def _load_model( scheduler=scheduler, sampler=sampler, ) + # Load and properly merge LoRA weights using the standard diffusers approach if not self.sd_turbo: + lora_adapters_to_merge = [] + lora_scales_to_merge = [] + + # Collect all LoRA adapters and their scales if use_lcm_lora: if lcm_lora_id is not None: - stream.load_lcm_lora( - pretrained_model_name_or_path_or_dict=lcm_lora_id - ) + logger.info(f"_load_model: Loading LCM LoRA from {lcm_lora_id}") + stream.pipe.load_lora_weights(lcm_lora_id, adapter_name="lcm_lora") else: - stream.load_lcm_lora() - stream.fuse_lora() + logger.info("_load_model: Loading default LCM LoRA") + # Use appropriate default LCM LoRA based on model type + default_lcm_lora = "latent-consistency/lcm-lora-sdxl" if is_sdxl else "latent-consistency/lcm-lora-sdv1-5" + stream.pipe.load_lora_weights(default_lcm_lora, adapter_name="lcm_lora") + + lora_adapters_to_merge.append("lcm_lora") + lora_scales_to_merge.append(1.0) if lora_dict is not None: - for lora_name, lora_scale in lora_dict.items(): + for i, (lora_name, lora_scale) in enumerate(lora_dict.items()): + adapter_name = f"custom_lora_{i}" logger.info(f"_load_model: Loading LoRA '{lora_name}' with scale {lora_scale}") - stream.load_lora(lora_name) - stream.fuse_lora(lora_scale=lora_scale) + + try: + # Load LoRA weights with unique adapter name + stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name) + lora_adapters_to_merge.append(adapter_name) + lora_scales_to_merge.append(lora_scale) + logger.info(f"Successfully loaded LoRA adapter: {adapter_name}") + except Exception as e: + logger.error(f"Failed to load LoRA {lora_name}: {e}") + # Continue with other LoRAs even if one fails + continue + + # Merge all LoRA adapters using the proper diffusers method + if lora_adapters_to_merge: + try: + logger.info(f"Merging {len(lora_adapters_to_merge)} LoRA adapter(s) with scales: {lora_scales_to_merge}") + + # Use the proper merge_and_unload method from diffusers + # This permanently merges LoRA weights into the base model parameters + stream.pipe.fuse_lora(lora_scale=lora_scales_to_merge, adapter_names=lora_adapters_to_merge) + + # After fusing, unload the LoRA weights to clean up memory and avoid conflicts + stream.pipe.unload_lora_weights() + + logger.info("Successfully merged and unloaded LoRA weights using diffusers merge_and_unload") + + except Exception as e: + logger.error(f"Failed to merge LoRA weights: {e}") + logger.info("Attempting fallback: individual LoRA merging...") + + # Fallback: merge LoRAs 
individually + try: + for adapter_name, scale in zip(lora_adapters_to_merge, lora_scales_to_merge): + logger.info(f"Merging individual LoRA: {adapter_name} with scale {scale}") + stream.pipe.fuse_lora(lora_scale=scale, adapter_names=[adapter_name]) + + # Clean up after individual merging + stream.pipe.unload_lora_weights() + logger.info("Successfully merged LoRAs individually") + + except Exception as fallback_error: + logger.error(f"LoRA merging fallback also failed: {fallback_error}") + logger.warning("Continuing without LoRA merging - LoRAs may not be applied correctly") + + # Clean up any partial state + try: + stream.pipe.unload_lora_weights() + except: + pass if use_tiny_vae: if vae_id is not None: @@ -1034,7 +1092,6 @@ def _load_model( # Use TAESD XL for SDXL models, regular TAESD for SD 1.5 taesd_model = "madebyollin/taesdxl" if is_sdxl else "madebyollin/taesd" stream.vae = AutoencoderTiny.from_pretrained(taesd_model).to(dtype=pipe.dtype) - try: if acceleration == "xformers": @@ -1243,10 +1300,15 @@ def _load_model( except Exception: pass - # If using TensorRT with IP-Adapter, ensure processors and weights are installed BEFORE export - if use_ipadapter_trt and has_ipadapter and ipadapter_config and not hasattr(stream, '_ipadapter_module'): + # Note: LoRA weights have already been merged permanently during model loading + + # CRITICAL: Install IPAdapter module BEFORE TensorRT compilation to ensure processors are baked into engines + if use_ipadapter and ipadapter_config and not hasattr(stream, '_ipadapter_module'): try: from streamdiffusion.modules.ipadapter_module import IPAdapterModule, IPAdapterConfig + logger.info("Installing IPAdapter module before TensorRT compilation...") + + # Use first config if list provided cfg = ipadapter_config[0] if isinstance(ipadapter_config, list) else ipadapter_config ip_cfg = IPAdapterConfig( style_image_key=cfg.get('style_image_key') or 'ipadapter_main', @@ -1258,17 +1320,28 @@ def _load_model( is_faceid=(cfg.get('type') == 'faceid' or bool(cfg.get('is_faceid', False))), insightface_model_name=cfg.get('insightface_model_name'), ) - ip_module_for_export = IPAdapterModule(ip_cfg) - ip_module_for_export.install(stream) - setattr(stream, '_ipadapter_module', ip_module_for_export) - try: - logger.info("Installed IP-Adapter processors prior to TensorRT export") - except Exception: - pass + ip_module = IPAdapterModule(ip_cfg) + ip_module.install(stream) + # Expose for later updates + stream._ipadapter_module = ip_module + logger.info("IPAdapter module installed successfully before TensorRT compilation") + + # Cleanup after IPAdapter installation + import gc + gc.collect() + torch.cuda.empty_cache() + torch.cuda.synchronize() + + except torch.cuda.OutOfMemoryError as oom_error: + logger.error(f"CUDA Out of Memory during early IPAdapter installation: {oom_error}") + logger.error("Try reducing batch size, using smaller models, or increasing GPU memory") + raise RuntimeError("Insufficient VRAM for IPAdapter installation. Consider using a GPU with more memory or reducing model complexity.") + except Exception: import traceback traceback.print_exc() - logger.error("Failed to pre-install IP-Adapter prior to TensorRT export") + logger.error("Failed to install IPAdapterModule before TensorRT compilation") + raise # NOTE: When IPAdapter is enabled, we must pass num_ip_layers. We cannot know it until after # installing processors in the export wrapper. 
We construct the wrapper first to discover it, @@ -1492,46 +1565,47 @@ def _load_model( logger.error(f"TensorRT VAE engine loading failed (non-OOM): {e}") raise e - safety_checker_path = engine_manager.get_engine_path( - EngineType.SAFETY_CHECKER, - model_id_or_path=safety_checker_model_id, - max_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, - min_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, - mode=self.mode, - use_lcm_lora=use_lcm_lora, - use_tiny_vae=use_tiny_vae, - ) - safety_checker_engine_exists = os.path.exists(safety_checker_path) - - # Always load the safety checker if the engine exists. The model is really small and may be toggled later. - if self.use_safety_checker or safety_checker_engine_exists: - if not safety_checker_engine_exists: - from transformers import AutoModelForImageClassification - self.safety_checker = AutoModelForImageClassification.from_pretrained(safety_checker_model_id).to("cuda") - - safety_checker_model = NSFWDetector( - device=self.device, - max_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, - min_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, - ) + # Safety checker engine (TensorRT-specific) + safety_checker_path = engine_manager.get_engine_path( + EngineType.SAFETY_CHECKER, + model_id_or_path=safety_checker_model_id, + max_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, + min_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, + mode=self.mode, + use_lcm_lora=use_lcm_lora, + use_tiny_vae=use_tiny_vae, + ) + safety_checker_engine_exists = os.path.exists(safety_checker_path) + + # Always load the safety checker if the engine exists. The model is really small and may be toggled later. 
+ if self.use_safety_checker or safety_checker_engine_exists: + if not safety_checker_engine_exists: + from transformers import AutoModelForImageClassification + self.safety_checker = AutoModelForImageClassification.from_pretrained(safety_checker_model_id).to("cuda") + + safety_checker_model = NSFWDetector( + device=self.device, + max_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, + min_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, + ) - engine_manager.compile_and_load_engine( - EngineType.SAFETY_CHECKER, - safety_checker_path, - model=self.safety_checker, - model_config=safety_checker_model, - batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, - cuda_stream=None, - load_engine=load_engine, - ) - - if load_engine: - self.safety_checker = NSFWDetectorEngine( - safety_checker_path, - cuda_stream, - use_cuda_graph=True, - ) + engine_manager.compile_and_load_engine( + EngineType.SAFETY_CHECKER, + safety_checker_path, + model=self.safety_checker, + model_config=safety_checker_model, + batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, + cuda_stream=None, + load_engine=load_engine, + ) + if load_engine: + self.safety_checker = NSFWDetectorEngine( + safety_checker_path, + cuda_stream, + use_cuda_graph=True, + ) + if acceleration == "sfast": from streamdiffusion.acceleration.sfast import ( accelerate_with_stable_fast, @@ -1613,33 +1687,17 @@ def _load_model( logger.error("Failed to install ControlNetModule") raise + # IPAdapter module installation has been moved to before TensorRT compilation (see lines 1307-1345) + # This ensures processors are properly baked into the TensorRT engines if use_ipadapter and ipadapter_config and not hasattr(stream, '_ipadapter_module'): - try: - from streamdiffusion.modules.ipadapter_module import IPAdapterModule, IPAdapterConfig - # Use first config if list provided - cfg = ipadapter_config[0] if isinstance(ipadapter_config, list) else ipadapter_config - ip_cfg = IPAdapterConfig( - style_image_key=cfg.get('style_image_key') or 'ipadapter_main', - num_image_tokens=cfg.get('num_image_tokens', 4), - ipadapter_model_path=cfg['ipadapter_model_path'], - image_encoder_path=cfg['image_encoder_path'], - style_image=cfg.get('style_image'), - scale=cfg.get('scale', 1.0), - is_faceid=(cfg.get('type') == 'faceid' or bool(cfg.get('is_faceid', False))), - insightface_model_name=cfg.get('insightface_model_name'), - ) - ip_module = IPAdapterModule(ip_cfg) - ip_module.install(stream) - # Expose for later updates - stream._ipadapter_module = ip_module - except Exception: - import traceback - traceback.print_exc() - logger.error("Failed to install IPAdapterModule") - raise + logger.warning("IPAdapter was not installed during TensorRT compilation phase - this may cause runtime issues") + logger.warning("IPAdapter should have been installed before engine compilation for proper TensorRT integration") + + # Note: LoRA weights have already been merged permanently during model loading return stream + def get_last_processed_image(self, index: int) -> Optional[Image.Image]: """Forward get_last_processed_image call to the underlying ControlNet pipeline""" if not self.use_controlnet: From e2778b601ecdd6596c4a596a9842f7f00bc4e9fe Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Mon, 15 Sep 2025 19:56:29 -0400 Subject: [PATCH 08/16] Deprecation of use_lcm_lora. 
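
The use_lcm_lora / lcm_lora_id pair is deprecated in favor of a single lora_dict entry point: the LCM LoRA becomes just another LoRA with a scale and is loaded and fused through the same path as custom LoRAs. Minimal migration sketch below; the wrapper class and import path are assumed from the demos, the model id is the txt2img demo default, and the t_index_list values are purely illustrative.

    from streamdiffusion.wrapper import StreamDiffusionWrapper  # class name assumed

    # Before (deprecated):
    #   wrapper = StreamDiffusionWrapper(..., use_lcm_lora=True, lcm_lora_id=None)
    # After: declare the LCM LoRA explicitly alongside any custom LoRAs.
    wrapper = StreamDiffusionWrapper(
        model_id_or_path="KBlueLeaf/kohaku-v2.1",
        t_index_list=[32, 45],  # illustrative values
        lora_dict={
            "latent-consistency/lcm-lora-sdv1-5": 1.0,  # LCM LoRA for fast inference
            # "your_custom_lora": 0.7,                  # optional extra LoRAs
        },
    )
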
--- configs/sd15_multicontrol.yaml.example | 7 +- configs/sdturbo_multicontrol.yaml.example | 7 +- configs/sdxl_multicontrol.yaml.example | 7 +- .../lib/components/PreprocessorDocs.svelte | 1 - demo/realtime-img2img/img2img.py | 1 - demo/realtime-img2img/main.py | 2 - demo/realtime-txt2img/config.py | 3 +- demo/realtime-txt2img/main.py | 1 - examples/benchmark/multi.py | 6 +- examples/benchmark/single.py | 6 +- examples/optimal-performance/multi.py | 1 - examples/optimal-performance/single.py | 1 - .../acceleration/tensorrt/engine_manager.py | 4 +- src/streamdiffusion/config.py | 2 - src/streamdiffusion/pipeline.py | 15 ---- src/streamdiffusion/wrapper.py | 68 +++++++++---------- 16 files changed, 59 insertions(+), 73 deletions(-) diff --git a/configs/sd15_multicontrol.yaml.example b/configs/sd15_multicontrol.yaml.example index e95e55ef..6aa4b93d 100644 --- a/configs/sd15_multicontrol.yaml.example +++ b/configs/sd15_multicontrol.yaml.example @@ -32,7 +32,12 @@ seed: 789 frame_buffer_size: 1 delta: 0.7 use_denoising_batch: true -use_lcm_lora: true +# LoRA configuration - use lora_dict to load LCM LoRA and other LoRAs +lora_dict: + "latent-consistency/lcm-lora-sdv1-5": 1.0 # LCM LoRA for faster inference + # Add other LoRAs here: + # "your_custom_lora": 0.7 + use_tiny_vae: true acceleration: "tensorrt" # "xformers" for non-TensorRT setups cfg_type: "self" diff --git a/configs/sdturbo_multicontrol.yaml.example b/configs/sdturbo_multicontrol.yaml.example index 5f7b8561..54a7b8a6 100644 --- a/configs/sdturbo_multicontrol.yaml.example +++ b/configs/sdturbo_multicontrol.yaml.example @@ -22,7 +22,12 @@ seed: 789 frame_buffer_size: 1 delta: 0.7 use_denoising_batch: true -use_lcm_lora: true # SD-Turbo benefits from LCM LoRA +# LoRA configuration - SD-Turbo can benefit from LCM LoRA +lora_dict: + "latent-consistency/lcm-lora-sdv1-5": 1.0 # LCM LoRA for faster inference + # Add other LoRAs here: + # "your_custom_lora": 0.7 + use_tiny_vae: true acceleration: "tensorrt" # "xformers" for non-TensorRT setups cfg_type: "self" diff --git a/configs/sdxl_multicontrol.yaml.example b/configs/sdxl_multicontrol.yaml.example index 441acce4..61c482de 100644 --- a/configs/sdxl_multicontrol.yaml.example +++ b/configs/sdxl_multicontrol.yaml.example @@ -31,7 +31,12 @@ seed: 42 # Base seed (used with seed_blending above) frame_buffer_size: 1 delta: 0.7 use_denoising_batch: true -use_lcm_lora: false # SDXL has built-in optimizations +# LoRA configuration - SDXL can use LCM LoRA for faster inference +# lora_dict: +# "latent-consistency/lcm-lora-sdxl": 1.0 # Uncomment to enable LCM LoRA for SDXL +# # Add other LoRAs here: +# # "your_custom_lora": 0.7 + use_taesd: true # Use Tiny AutoEncoder for SDXL use_tiny_vae: true acceleration: "tensorrt" # "xformers" for non-TensorRT setups diff --git a/demo/realtime-img2img/frontend/src/lib/components/PreprocessorDocs.svelte b/demo/realtime-img2img/frontend/src/lib/components/PreprocessorDocs.svelte index 830666d7..0c59002b 100644 --- a/demo/realtime-img2img/frontend/src/lib/components/PreprocessorDocs.svelte +++ b/demo/realtime-img2img/frontend/src/lib/components/PreprocessorDocs.svelte @@ -40,7 +40,6 @@ use_denoising_batch: true, delta: 0.7, frame_buffer_size: 1, - use_lcm_lora: true, use_tiny_vae: true, acceleration: "xformers", cfg_type: "self", diff --git a/demo/realtime-img2img/img2img.py b/demo/realtime-img2img/img2img.py index 54b802f1..8308d6b3 100644 --- a/demo/realtime-img2img/img2img.py +++ b/demo/realtime-img2img/img2img.py @@ -182,7 +182,6 @@ def __init__(self, args: 
Args, device: torch.device, torch_dtype: torch.dtype, w frame_buffer_size=1, width=params.width, height=params.height, - use_lcm_lora=False, output_type="pt", warmup=10, vae_id=None, diff --git a/demo/realtime-img2img/main.py b/demo/realtime-img2img/main.py index 7ac437d5..a3fb6368 100644 --- a/demo/realtime-img2img/main.py +++ b/demo/realtime-img2img/main.py @@ -625,7 +625,6 @@ async def settings(): 'seed', 'frame_buffer_size', 'use_denoising_batch', - 'use_lcm_lora', 'use_tiny_vae', 'use_taesd', 'cfg_type', @@ -761,7 +760,6 @@ async def upload_controlnet_config(file: UploadFile = File(...)): 'seed', 'frame_buffer_size', 'use_denoising_batch', - 'use_lcm_lora', 'use_tiny_vae', 'use_taesd', 'cfg_type', diff --git a/demo/realtime-txt2img/config.py b/demo/realtime-txt2img/config.py index c0a14ba4..35494148 100644 --- a/demo/realtime-txt2img/config.py +++ b/demo/realtime-txt2img/config.py @@ -29,8 +29,7 @@ class Config: model_id_or_path: str = os.environ.get("MODEL", "KBlueLeaf/kohaku-v2.1") # LoRA dictionary write like field(default_factory=lambda: {'E:/stable-diffusion-webui/models/Lora_1.safetensors' : 1.0 , 'E:/stable-diffusion-webui/models/Lora_2.safetensors' : 0.2}) lora_dict: dict = None - # LCM-LORA model - lcm_lora_id: str = os.environ.get("LORA", "latent-consistency/lcm-lora-sdv1-5") + # LCM-LORA model (use lora_dict instead of lcm_lora_id) # TinyVAE model vae_id: str = os.environ.get("VAE", "madebyollin/taesd") # Device to use diff --git a/demo/realtime-txt2img/main.py b/demo/realtime-txt2img/main.py index 88967c4c..18931f9e 100644 --- a/demo/realtime-txt2img/main.py +++ b/demo/realtime-txt2img/main.py @@ -63,7 +63,6 @@ def __init__(self, config: Config) -> None: mode=config.mode, model_id_or_path=config.model_id_or_path, lora_dict=config.lora_dict, - lcm_lora_id=config.lcm_lora_id, vae_id=config.vae_id, device=config.device, dtype=config.dtype, diff --git a/examples/benchmark/multi.py b/examples/benchmark/multi.py index bfa971cf..f3f879f2 100644 --- a/examples/benchmark/multi.py +++ b/examples/benchmark/multi.py @@ -40,7 +40,7 @@ def run( lora_dict: Optional[Dict[str, float]] = None, prompt: str = "1girl with brown dog hair, thick glasses, smiling", negative_prompt: str = "bad image , bad quality", - use_lcm_lora: bool = True, + lcm_lora: bool = True, use_tiny_vae: bool = True, width: int = 512, height: int = 512, @@ -67,7 +67,7 @@ def run( The prompt to use, by default "1girl with brown dog hair, thick glasses, smiling". negative_prompt : str, optional The negative prompt to use, by default "bad image , bad quality". - use_lcm_lora : bool, optional + lcm_lora : bool, optional Whether to use LCM-LoRA or not, by default True. use_tiny_vae : bool, optional Whether to use TinyVAE or not, by default True. 
@@ -97,7 +97,7 @@ def run( warmup=warmup, acceleration=acceleration, device_ids=device_ids, - use_lcm_lora=use_lcm_lora, + lora_dict={"latent-consistency/lcm-lora-sdv1-5": 1.0} if lcm_lora else lora_dict, use_tiny_vae=use_tiny_vae, enable_similar_image_filter=False, similar_image_filter_threshold=0.98, diff --git a/examples/benchmark/single.py b/examples/benchmark/single.py index 5e55fb63..eb868bdd 100644 --- a/examples/benchmark/single.py +++ b/examples/benchmark/single.py @@ -27,7 +27,7 @@ def run( lora_dict: Optional[Dict[str, float]] = None, prompt: str = "1girl with brown dog hair, thick glasses, smiling", negative_prompt: str = "bad image , bad quality", - use_lcm_lora: bool = True, + lcm_lora: bool = True, use_tiny_vae: bool = True, width: int = 512, height: int = 512, @@ -54,7 +54,7 @@ def run( The prompt to use, by default "1girl with brown dog hair, thick glasses, smiling". negative_prompt : str, optional The negative prompt to use, by default "bad image , bad quality". - use_lcm_lora : bool, optional + lcm_lora : bool, optional Whether to use LCM-LoRA or not, by default True. use_tiny_vae : bool, optional Whether to use TinyVAE or not, by default True. @@ -84,7 +84,7 @@ def run( warmup=warmup, acceleration=acceleration, device_ids=device_ids, - use_lcm_lora=use_lcm_lora, + lora_dict={"latent-consistency/lcm-lora-sdv1-5": 1.0} if lcm_lora else lora_dict, use_tiny_vae=use_tiny_vae, enable_similar_image_filter=False, similar_image_filter_threshold=0.98, diff --git a/examples/optimal-performance/multi.py b/examples/optimal-performance/multi.py index ac2c2a53..791d88b1 100644 --- a/examples/optimal-performance/multi.py +++ b/examples/optimal-performance/multi.py @@ -74,7 +74,6 @@ def image_generation_process( frame_buffer_size=batch_size, warmup=10, acceleration=acceleration, - use_lcm_lora=False, mode="txt2img", cfg_type="none", use_denoising_batch=True, diff --git a/examples/optimal-performance/single.py b/examples/optimal-performance/single.py index 4bc08b3f..a8020bb8 100644 --- a/examples/optimal-performance/single.py +++ b/examples/optimal-performance/single.py @@ -40,7 +40,6 @@ def image_generation_process( frame_buffer_size=1, warmup=10, acceleration=acceleration, - use_lcm_lora=False, mode="txt2img", cfg_type="none", use_denoising_batch=True, diff --git a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py index f34e6d34..4f49bd43 100644 --- a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py +++ b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py @@ -99,7 +99,6 @@ def get_engine_path(self, max_batch_size: int, min_batch_size: int, mode: str, - use_lcm_lora: bool, use_tiny_vae: bool, lora_dict: Optional[Dict[str, float]] = None, ipadapter_scale: Optional[float] = None, @@ -132,7 +131,7 @@ def get_engine_path(self, base_name = maybe_path.stem if maybe_path.exists() else model_id_or_path # Create prefix (from wrapper.py lines 1005-1013) - prefix = f"{base_name}--lcm_lora-{use_lcm_lora}--tiny_vae-{use_tiny_vae}--min_batch-{min_batch_size}--max_batch-{max_batch_size}" + prefix = f"{base_name}--tiny_vae-{use_tiny_vae}--min_batch-{min_batch_size}--max_batch-{max_batch_size}" # IP-Adapter differentiation: add type and (optionally) tokens # Keep scale out of identity for runtime control, but include a type flag to separate caches @@ -309,7 +308,6 @@ def get_or_load_controlnet_engine(self, max_batch_size=max_batch_size, min_batch_size=min_batch_size, mode="", # Not used for ControlNet - 
use_lcm_lora=False, # Not used for ControlNet use_tiny_vae=False, # Not used for ControlNet controlnet_model_id=model_id ) diff --git a/src/streamdiffusion/config.py b/src/streamdiffusion/config.py index 979a21cd..5c26a949 100644 --- a/src/streamdiffusion/config.py +++ b/src/streamdiffusion/config.py @@ -100,7 +100,6 @@ def _extract_wrapper_params(config: Dict[str, Any]) -> Dict[str, Any]: 'lora_dict': config.get('lora_dict'), 'mode': config.get('mode', 'img2img'), 'output_type': config.get('output_type', 'pil'), - 'lcm_lora_id': config.get('lcm_lora_id'), 'vae_id': config.get('vae_id'), 'device': config.get('device', 'cuda'), 'dtype': _parse_dtype(config.get('dtype', 'float16')), @@ -111,7 +110,6 @@ def _extract_wrapper_params(config: Dict[str, Any]) -> Dict[str, Any]: 'acceleration': config.get('acceleration', 'tensorrt'), 'do_add_noise': config.get('do_add_noise', True), 'device_ids': config.get('device_ids'), - 'use_lcm_lora': config.get('use_lcm_lora', True), 'use_tiny_vae': config.get('use_tiny_vae', True), 'enable_similar_image_filter': config.get('enable_similar_image_filter', False), 'similar_image_filter_threshold': config.get('similar_image_filter_threshold', 0.98), diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py index bdde5998..26b12f71 100644 --- a/src/streamdiffusion/pipeline.py +++ b/src/streamdiffusion/pipeline.py @@ -231,21 +231,6 @@ def _build_sdxl_conditioning(self, batch_size: int) -> Dict[str, torch.Tensor]: - def load_lcm_lora( - self, - pretrained_model_name_or_path_or_dict: Union[ - str, Dict[str, torch.Tensor] - ] = "latent-consistency/lcm-lora-sdv1-5", - adapter_name: Optional[Any] = None, - **kwargs, - ) -> None: - # Check for SDXL compatibility - if self.is_sdxl: - return - - self._load_lora_with_offline_fallback( - pretrained_model_name_or_path_or_dict, adapter_name, **kwargs - ) def load_lora( self, diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index ae02c90e..d7c553dd 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -75,7 +75,6 @@ def __init__( lora_dict: Optional[Dict[str, float]] = None, mode: Literal["img2img", "txt2img"] = "img2img", output_type: Literal["pil", "pt", "np", "latent"] = "pil", - lcm_lora_id: Optional[str] = None, vae_id: Optional[str] = None, device: Literal["cpu", "cuda"] = "cuda", dtype: torch.dtype = torch.float16, @@ -86,7 +85,7 @@ def __init__( acceleration: Literal["none", "xformers", "tensorrt"] = "tensorrt", do_add_noise: bool = True, device_ids: Optional[List[int]] = None, - use_lcm_lora: bool = True, + use_lcm_lora: Optional[bool] = None, # Backwards compatibility parameter use_tiny_vae: bool = True, enable_similar_image_filter: bool = False, similar_image_filter_threshold: float = 0.98, @@ -135,10 +134,6 @@ def __init__( txt2img or img2img, by default "img2img". output_type : Literal["pil", "pt", "np", "latent"], optional The output type of image, by default "pil". - lcm_lora_id : Optional[str], optional - The lcm_lora_id to load, by default None. - If None, the default LCM-LoRA - ("latent-consistency/lcm-lora-sdv1-5") will be used. vae_id : Optional[str], optional The vae_id to load, by default None. If None, the default TinyVAE @@ -162,8 +157,6 @@ def __init__( by default True. device_ids : Optional[List[int]], optional The device ids to use for DataParallel, by default None. - use_lcm_lora : bool, optional - Whether to use LCM-LoRA or not, by default True. 
use_tiny_vae : bool, optional Whether to use TinyVAE or not, by default True. enable_similar_image_filter : bool, optional @@ -208,6 +201,35 @@ def __init__( """ if compile_engines_only: logger.info("compile_engines_only is True, will only compile engines and not load the model") + + # Handle backwards compatibility for use_lcm_lora parameter + if use_lcm_lora is not None: + logger.warning("use_lcm_lora parameter is deprecated. Use lora_dict instead.") + logger.warning("Automatically converting use_lcm_lora to lora_dict for backwards compatibility.") + + if use_lcm_lora and not self.sd_turbo: + # Initialize lora_dict if it doesn't exist + if lora_dict is None: + lora_dict = {} + else: + # Make a copy to avoid modifying the original + lora_dict = lora_dict.copy() + + # Determine which LCM LoRA to use based on model path + model_path_lower = model_id_or_path.lower() + if any(indicator in model_path_lower for indicator in ['sdxl', 'xl', '1024']): + lcm_lora_id = "latent-consistency/lcm-lora-sdxl" + logger.info(f"Detected SDXL model, adding LCM LoRA: {lcm_lora_id}") + else: + lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5" + logger.info(f"Detected SD1.5 model, adding LCM LoRA: {lcm_lora_id}") + + # Add LCM LoRA to lora_dict if not already present + if lcm_lora_id not in lora_dict: + lora_dict[lcm_lora_id] = 1.0 + logger.info(f"Added {lcm_lora_id} with scale 1.0 to lora_dict") + else: + logger.info(f"LCM LoRA {lcm_lora_id} already present in lora_dict with scale {lora_dict[lcm_lora_id]}") self.sd_turbo = "turbo" in model_id_or_path self.use_controlnet = use_controlnet @@ -258,12 +280,10 @@ def __init__( self.stream: StreamDiffusion = self._load_model( model_id_or_path=model_id_or_path, lora_dict=lora_dict, - lcm_lora_id=lcm_lora_id, vae_id=vae_id, t_index_list=t_index_list, acceleration=acceleration, do_add_noise=do_add_noise, - use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, cfg_type=cfg_type, engine_dir=engine_dir, @@ -884,11 +904,9 @@ def _load_model( model_id_or_path: str, t_index_list: List[int], lora_dict: Optional[Dict[str, float]] = None, - lcm_lora_id: Optional[str] = None, vae_id: Optional[str] = None, acceleration: Literal["none", "xformers", "tensorrt"] = "tensorrt", do_add_noise: bool = True, - use_lcm_lora: bool = True, use_tiny_vae: bool = True, cfg_type: Literal["none", "full", "self", "initialize"] = "self", engine_dir: Optional[Union[str, Path]] = "engines", @@ -915,7 +933,7 @@ def _load_model( This method does the following: 1. Loads the model from the model_id_or_path. - 2. Loads and fuses the LCM-LoRA model from the lcm_lora_id if needed. + 2. Loads and fuses LoRA models from lora_dict if provided. 3. Loads the VAE model from the vae_id if needed. 4. Enables acceleration if needed. 5. Prepares the model for inference. @@ -932,8 +950,7 @@ def _load_model( The lora_dict to load, by default None. Keys are the LoRA names and values are the LoRA scales. Example: {'LoRA_1' : 0.5 , 'LoRA_2' : 0.7 ,...} - lcm_lora_id : Optional[str], optional - The lcm_lora_id to load, by default None. + Use this to load LCM LoRA: {'latent-consistency/lcm-lora-sdv1-5': 1.0} vae_id : Optional[str], optional The vae_id to load, by default None. acceleration : Literal["none", "xfomers", "sfast", "tensorrt"], optional @@ -943,8 +960,6 @@ def _load_model( do_add_noise : bool, optional Whether to add noise for following denoising steps or not, by default True. - use_lcm_lora : bool, optional - Whether to use LCM-LoRA or not, by default True. 
use_tiny_vae : bool, optional Whether to use TinyVAE or not, by default True. cfg_type : Literal["none", "full", "self", "initialize"], @@ -1095,20 +1110,7 @@ def _load_model( lora_adapters_to_merge = [] lora_scales_to_merge = [] - # Collect all LoRA adapters and their scales - if use_lcm_lora: - if lcm_lora_id is not None: - logger.info(f"_load_model: Loading LCM LoRA from {lcm_lora_id}") - stream.pipe.load_lora_weights(lcm_lora_id, adapter_name="lcm_lora") - else: - logger.info("_load_model: Loading default LCM LoRA") - # Use appropriate default LCM LoRA based on model type - default_lcm_lora = "latent-consistency/lcm-lora-sdxl" if is_sdxl else "latent-consistency/lcm-lora-sdv1-5" - stream.pipe.load_lora_weights(default_lcm_lora, adapter_name="lcm_lora") - - lora_adapters_to_merge.append("lcm_lora") - lora_scales_to_merge.append(1.0) - + # Collect all LoRA adapters and their scales from lora_dict if lora_dict is not None: for i, (lora_name, lora_scale) in enumerate(lora_dict.items()): adapter_name = f"custom_lora_{i}" @@ -1298,7 +1300,6 @@ def _load_model( max_batch_size=self.max_batch_size, min_batch_size=self.min_batch_size, mode=self.mode, - use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, lora_dict=lora_dict, ipadapter_scale=ipadapter_scale, @@ -1311,7 +1312,6 @@ def _load_model( max_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, min_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, mode=self.mode, - use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, lora_dict=lora_dict, ipadapter_scale=ipadapter_scale, @@ -1324,7 +1324,6 @@ def _load_model( max_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, min_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, mode=self.mode, - use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, lora_dict=lora_dict, ipadapter_scale=ipadapter_scale, @@ -1650,7 +1649,6 @@ def _load_model( max_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, min_batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size, mode=self.mode, - use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, ) safety_checker_engine_exists = os.path.exists(safety_checker_path) From 53f7d92681097d36ab0c172b4f742375104f25ac Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Mon, 15 Sep 2025 20:07:19 -0400 Subject: [PATCH 09/16] Added backwards compatibility for use_lcm_lora. 
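
The deprecated flag is still accepted so existing configs keep working: when use_lcm_lora is truthy, the model is not SD-Turbo, and a lora_dict was provided, the matching LCM LoRA is injected into lora_dict after model detection, and the flag is then cleared so nothing downstream depends on it. Roughly, the shim reduces to the sketch below (the real code also logs each decision and leaves an existing entry untouched):

    # Hedged sketch of the compatibility mapping applied inside _load_model.
    def _default_lcm_lora(is_sdxl: bool) -> str:
        return ("latent-consistency/lcm-lora-sdxl" if is_sdxl
                else "latent-consistency/lcm-lora-sdv1-5")

    if use_lcm_lora and not self.sd_turbo and lora_dict is not None:
        lora_dict.setdefault(_default_lcm_lora(is_sdxl), 1.0)
    self.use_lcm_lora = None  # consumed; later code only sees lora_dict
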
--- src/streamdiffusion/config.py | 1 + src/streamdiffusion/wrapper.py | 61 ++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/streamdiffusion/config.py b/src/streamdiffusion/config.py index 5c26a949..ac8b6f20 100644 --- a/src/streamdiffusion/config.py +++ b/src/streamdiffusion/config.py @@ -110,6 +110,7 @@ def _extract_wrapper_params(config: Dict[str, Any]) -> Dict[str, Any]: 'acceleration': config.get('acceleration', 'tensorrt'), 'do_add_noise': config.get('do_add_noise', True), 'device_ids': config.get('device_ids'), + 'use_lcm_lora': config.get('use_lcm_lora'), # Backwards compatibility 'use_tiny_vae': config.get('use_tiny_vae', True), 'enable_similar_image_filter': config.get('enable_similar_image_filter', False), 'similar_image_filter_threshold': config.get('similar_image_filter_threshold', 0.98), diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index d7c553dd..3d403097 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -157,6 +157,11 @@ def __init__( by default True. device_ids : Optional[List[int]], optional The device ids to use for DataParallel, by default None. + use_lcm_lora : Optional[bool], optional + DEPRECATED: Use lora_dict instead. For backwards compatibility only. + If True, automatically adds appropriate LCM LoRA to lora_dict based on model type. + SDXL models get "latent-consistency/lcm-lora-sdxl", others get "latent-consistency/lcm-lora-sdv1-5". + By default None (ignored). use_tiny_vae : bool, optional Whether to use TinyVAE or not, by default True. enable_similar_image_filter : bool, optional @@ -202,35 +207,9 @@ def __init__( if compile_engines_only: logger.info("compile_engines_only is True, will only compile engines and not load the model") - # Handle backwards compatibility for use_lcm_lora parameter - if use_lcm_lora is not None: - logger.warning("use_lcm_lora parameter is deprecated. 
Use lora_dict instead.") - logger.warning("Automatically converting use_lcm_lora to lora_dict for backwards compatibility.") - - if use_lcm_lora and not self.sd_turbo: - # Initialize lora_dict if it doesn't exist - if lora_dict is None: - lora_dict = {} - else: - # Make a copy to avoid modifying the original - lora_dict = lora_dict.copy() - - # Determine which LCM LoRA to use based on model path - model_path_lower = model_id_or_path.lower() - if any(indicator in model_path_lower for indicator in ['sdxl', 'xl', '1024']): - lcm_lora_id = "latent-consistency/lcm-lora-sdxl" - logger.info(f"Detected SDXL model, adding LCM LoRA: {lcm_lora_id}") - else: - lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5" - logger.info(f"Detected SD1.5 model, adding LCM LoRA: {lcm_lora_id}") - - # Add LCM LoRA to lora_dict if not already present - if lcm_lora_id not in lora_dict: - lora_dict[lcm_lora_id] = 1.0 - logger.info(f"Added {lcm_lora_id} with scale 1.0 to lora_dict") - else: - logger.info(f"LCM LoRA {lcm_lora_id} already present in lora_dict with scale {lora_dict[lcm_lora_id]}") - + # Store use_lcm_lora for backwards compatibility processing in _load_model + self.use_lcm_lora = use_lcm_lora + self.sd_turbo = "turbo" in model_id_or_path self.use_controlnet = use_controlnet self.use_ipadapter = use_ipadapter @@ -284,6 +263,7 @@ def __init__( t_index_list=t_index_list, acceleration=acceleration, do_add_noise=do_add_noise, + use_lcm_lora=use_lcm_lora, # Deprecated:Backwards compatibility use_tiny_vae=use_tiny_vae, cfg_type=cfg_type, engine_dir=engine_dir, @@ -907,6 +887,7 @@ def _load_model( vae_id: Optional[str] = None, acceleration: Literal["none", "xformers", "tensorrt"] = "tensorrt", do_add_noise: bool = True, + use_lcm_lora: bool = True, use_tiny_vae: bool = True, cfg_type: Literal["none", "full", "self", "initialize"] = "self", engine_dir: Optional[Union[str, Path]] = "engines", @@ -960,6 +941,8 @@ def _load_model( do_add_noise : bool, optional Whether to add noise for following denoising steps or not, by default True. + use_lcm_lora : bool, optional + Whether to use LCM-LoRA or not, by default True. # DEPRECATED: Backwards compatibility use_tiny_vae : bool, optional Whether to use TinyVAE or not, by default True. 
cfg_type : Literal["none", "full", "self", "initialize"], @@ -1087,6 +1070,26 @@ def _load_model( self._is_sdxl = is_sdxl logger.info(f"_load_model: Detected model type: {model_type} (confidence: {confidence:.2f})") + + # DEPRECATED: THIS WILL LOAD LCM_LORA IF USE_LCM_LORA IS TRUE + # Validate backwards compatibility LCM LoRA selection using proper model detection + if hasattr(self, 'use_lcm_lora') and self.use_lcm_lora is not None: + if self.use_lcm_lora and not self.sd_turbo and lora_dict is not None: + # Determine correct LCM LoRA based on actual model detection + lcm_lora = "latent-consistency/lcm-lora-sdxl" if is_sdxl else "latent-consistency/lcm-lora-sdv1-5" + + # Add to lora_dict if not already present + if lcm_lora not in lora_dict: + lora_dict[lcm_lora] = 1.0 + logger.info(f"Added {lcm_lora} with scale 1.0 to lora_dict") + else: + logger.info(f"LCM LoRA {lcm_lora} already present in lora_dict with scale {lora_dict[lcm_lora]}") + else: + logger.info(f"LCM LoRA will not be loaded because use_lcm_lora is {self.use_lcm_lora} and sd_turbo is {self.sd_turbo}") + + # Remove use_lcm_lora from self + self.use_lcm_lora = None + logger.info(f"use_lcm_lora has been removed from self") stream = StreamDiffusion( pipe=pipe, From 55a20c924e8f5f1776d84b112e119d0f331d5da7 Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Mon, 15 Sep 2025 20:14:39 -0400 Subject: [PATCH 10/16] Reverted single/multi scripts for simplicity. --- examples/benchmark/multi.py | 6 +++--- examples/benchmark/single.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/benchmark/multi.py b/examples/benchmark/multi.py index f3f879f2..bfa971cf 100644 --- a/examples/benchmark/multi.py +++ b/examples/benchmark/multi.py @@ -40,7 +40,7 @@ def run( lora_dict: Optional[Dict[str, float]] = None, prompt: str = "1girl with brown dog hair, thick glasses, smiling", negative_prompt: str = "bad image , bad quality", - lcm_lora: bool = True, + use_lcm_lora: bool = True, use_tiny_vae: bool = True, width: int = 512, height: int = 512, @@ -67,7 +67,7 @@ def run( The prompt to use, by default "1girl with brown dog hair, thick glasses, smiling". negative_prompt : str, optional The negative prompt to use, by default "bad image , bad quality". - lcm_lora : bool, optional + use_lcm_lora : bool, optional Whether to use LCM-LoRA or not, by default True. use_tiny_vae : bool, optional Whether to use TinyVAE or not, by default True. @@ -97,7 +97,7 @@ def run( warmup=warmup, acceleration=acceleration, device_ids=device_ids, - lora_dict={"latent-consistency/lcm-lora-sdv1-5": 1.0} if lcm_lora else lora_dict, + use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, enable_similar_image_filter=False, similar_image_filter_threshold=0.98, diff --git a/examples/benchmark/single.py b/examples/benchmark/single.py index eb868bdd..5e55fb63 100644 --- a/examples/benchmark/single.py +++ b/examples/benchmark/single.py @@ -27,7 +27,7 @@ def run( lora_dict: Optional[Dict[str, float]] = None, prompt: str = "1girl with brown dog hair, thick glasses, smiling", negative_prompt: str = "bad image , bad quality", - lcm_lora: bool = True, + use_lcm_lora: bool = True, use_tiny_vae: bool = True, width: int = 512, height: int = 512, @@ -54,7 +54,7 @@ def run( The prompt to use, by default "1girl with brown dog hair, thick glasses, smiling". negative_prompt : str, optional The negative prompt to use, by default "bad image , bad quality". - lcm_lora : bool, optional + use_lcm_lora : bool, optional Whether to use LCM-LoRA or not, by default True. 
use_tiny_vae : bool, optional Whether to use TinyVAE or not, by default True. @@ -84,7 +84,7 @@ def run( warmup=warmup, acceleration=acceleration, device_ids=device_ids, - lora_dict={"latent-consistency/lcm-lora-sdv1-5": 1.0} if lcm_lora else lora_dict, + use_lcm_lora=use_lcm_lora, use_tiny_vae=use_tiny_vae, enable_similar_image_filter=False, similar_image_filter_threshold=0.98, From 123ba695e24e60c00cf244d9b051800c778e3660 Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Mon, 15 Sep 2025 20:35:32 -0400 Subject: [PATCH 11/16] Updated descriptive comments, added tcd support, small cleanup/fixes. --- configs/sd15_multicontrol.yaml.example | 4 + configs/sdturbo_multicontrol.yaml.example | 3 + configs/sdxl_multicontrol.yaml.example | 4 + src/streamdiffusion/pipeline.py | 2 + src/streamdiffusion/wrapper.py | 165 +++++++++++++++++----- 5 files changed, 144 insertions(+), 34 deletions(-) diff --git a/configs/sd15_multicontrol.yaml.example b/configs/sd15_multicontrol.yaml.example index 6aa4b93d..a5e865e1 100644 --- a/configs/sd15_multicontrol.yaml.example +++ b/configs/sd15_multicontrol.yaml.example @@ -42,12 +42,16 @@ use_tiny_vae: true acceleration: "tensorrt" # "xformers" for non-TensorRT setups cfg_type: "self" +scheduler: "lcm" # Supports "lcm" or "tcd" +sampler: "normal" + # Engine directory for TensorRT (engines will be built here if not found) engine_dir: "./engines/sd15" # Enable multi-modal conditioning use_controlnet: true use_ipadapter: true +use_ipadapter: false # IPAdapter configuration for style conditioning ipadapters: diff --git a/configs/sdturbo_multicontrol.yaml.example b/configs/sdturbo_multicontrol.yaml.example index 54a7b8a6..a0fe0ce0 100644 --- a/configs/sdturbo_multicontrol.yaml.example +++ b/configs/sdturbo_multicontrol.yaml.example @@ -32,6 +32,9 @@ use_tiny_vae: true acceleration: "tensorrt" # "xformers" for non-TensorRT setups cfg_type: "self" +scheduler: "lcm" # Supports "lcm" or "tcd" +sampler: "normal" + # Engine directory for TensorRT engine_dir: "./engines/sdturbo" diff --git a/configs/sdxl_multicontrol.yaml.example b/configs/sdxl_multicontrol.yaml.example index 61c482de..0a39c415 100644 --- a/configs/sdxl_multicontrol.yaml.example +++ b/configs/sdxl_multicontrol.yaml.example @@ -41,6 +41,10 @@ use_taesd: true # Use Tiny AutoEncoder for SDXL use_tiny_vae: true acceleration: "tensorrt" # "xformers" for non-TensorRT setups cfg_type: "self" + +scheduler: "lcm" # Supports "lcm" or "tcd" +sampler: "normal" + safety_checker: false # Engine directory for TensorRT diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py index 26b12f71..ed38f9be 100644 --- a/src/streamdiffusion/pipeline.py +++ b/src/streamdiffusion/pipeline.py @@ -145,6 +145,8 @@ def __init__( def _initialize_scheduler(self, scheduler_type: str, sampler_type: str, config): """Initialize scheduler based on type and sampler configuration.""" + + # TODO: More testing and validation required on samplers. 
# Map sampler types to configuration parameters sampler_config = { "simple": {"timestep_spacing": "linspace"}, diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 3d403097..1c915825 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -85,7 +85,7 @@ def __init__( acceleration: Literal["none", "xformers", "tensorrt"] = "tensorrt", do_add_noise: bool = True, device_ids: Optional[List[int]] = None, - use_lcm_lora: Optional[bool] = None, # Backwards compatibility parameter + use_lcm_lora: Optional[bool] = None, # DEPRECATED: Backwards compatibility parameter use_tiny_vae: bool = True, enable_similar_image_filter: bool = False, similar_image_filter_threshold: float = 0.98, @@ -101,7 +101,7 @@ def __init__( normalize_prompt_weights: bool = True, normalize_seed_weights: bool = True, # Scheduler and sampler options - scheduler: Literal["lcm", "dpm++ 2m", "uni_pc", "ddim", "euler"] = "lcm", + scheduler: Literal["lcm", "tcd"] = "lcm", sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = "normal", # ControlNet options use_controlnet: bool = False, @@ -126,6 +126,10 @@ def __init__( The model id or path to load. t_index_list : List[int] The t_index_list to use for inference. + min_batch_size : int, optional + The minimum batch size for inference, by default 1. + max_batch_size : int, optional + The maximum batch size for inference, by default 4. lora_dict : Optional[Dict[str, float]], optional The lora_dict to load, by default None. Keys are the LoRA names and values are the LoRA scales. @@ -140,6 +144,8 @@ def __init__( ("madebyollin/taesd") will be used. device : Literal["cpu", "cuda"], optional The device to use for inference, by default "cuda". + device_ids : Optional[List[int]], optional + The device ids to use for DataParallel, by default None. dtype : torch.dtype, optional The dtype for inference, by default torch.float16. frame_buffer_size : int, optional @@ -181,13 +187,19 @@ def __init__( The seed, by default 2. use_safety_checker : bool, optional Whether to use safety checker or not, by default False. + skip_diffusion : bool, optional + Whether to skip diffusion and apply only preprocessing/postprocessing hooks, by default False. + engine_dir : Optional[Union[str, Path]], optional + Directory path for storing/loading TensorRT engines, by default "engines". + build_engines_if_missing : bool, optional + Whether to build TensorRT engines if they don't exist, by default True. normalize_prompt_weights : bool, optional Whether to normalize prompt weights in blending to sum to 1, by default True. When False, weights > 1 will amplify embeddings. normalize_seed_weights : bool, optional Whether to normalize seed weights in blending to sum to 1, by default True. When False, weights > 1 will amplify noise. - scheduler : Literal["lcm", "dpm++ 2m", "uni_pc", "ddim", "euler"], optional + scheduler : Literal["lcm", "tcd"], optional The scheduler type to use for denoising, by default "lcm". sampler : Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"], optional The sampler type to use for noise scheduling, by default "normal". @@ -197,6 +209,19 @@ def __init__( ControlNet configuration(s), by default None. Can be a single config dict or list of config dicts for multiple ControlNets. Each config should contain: model_id, preprocessor (optional), conditioning_scale, etc. + use_ipadapter : bool, optional + Whether to enable IPAdapter support, by default False. 
+ ipadapter_config : Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional + IPAdapter configuration(s), by default None. Can be a single config dict + or list of config dicts for multiple IPAdapters. + image_preprocessing_config : Optional[Dict[str, Any]], optional + Configuration for image preprocessing hooks, by default None. + image_postprocessing_config : Optional[Dict[str, Any]], optional + Configuration for image postprocessing hooks, by default None. + latent_preprocessing_config : Optional[Dict[str, Any]], optional + Configuration for latent preprocessing hooks, by default None. + latent_postprocessing_config : Optional[Dict[str, Any]], optional + Configuration for latent postprocessing hooks, by default None. safety_checker_fallback_type : Literal["blank", "previous"], optional Whether to use a blank image or the previous image as a fallback, by default "previous". safety_checker_threshold: float, optional @@ -809,33 +834,57 @@ def postprocess_image( def _denormalize_on_gpu(self, image_tensor: torch.Tensor) -> torch.Tensor: """ - Denormalize image tensor on GPU for efficiency + Denormalize image tensor on GPU for efficiency. + Converts image tensor from diffusion range [-1, 1] to standard image range [0, 1]. - Args: - image_tensor: Input tensor on GPU - + Parameters + ---------- + image_tensor : torch.Tensor + Input tensor in diffusion range [-1, 1], expected to be on GPU. - Returns: - Denormalized tensor on GPU, clamped to [0,1] + Returns + ------- + torch.Tensor + Denormalized tensor in range [0, 1], clamped and on GPU. """ return (image_tensor / 2 + 0.5).clamp(0, 1) def _normalize_on_gpu(self, image_tensor: torch.Tensor) -> torch.Tensor: - """Convert tensor from [0,1] (processor range) back to [-1,1] (diffusion range)""" + """ + Normalize tensor from processor range to diffusion range. + + Converts image tensor from standard image range [0, 1] to diffusion range [-1, 1]. + + Parameters + ---------- + image_tensor : torch.Tensor + Input tensor in standard image range [0, 1], expected to be on GPU. + + Returns + ------- + torch.Tensor + Normalized tensor in diffusion range [-1, 1], clamped and on GPU. + """ return (image_tensor * 2 - 1).clamp(-1, 1) def _tensor_to_pil_optimized(self, image_tensor: torch.Tensor) -> List[Image.Image]: """ - Optimized tensor to PIL conversion with minimal CPU transfers + Optimized tensor to PIL conversion with minimal CPU transfers. + Efficiently converts a batch of GPU tensors to PIL Images with minimal + CPU-GPU transfers and memory allocations. - Args: - image_tensor: Input tensor on GPU - + Parameters + ---------- + image_tensor : torch.Tensor + Input tensor in diffusion range [-1, 1], expected to be on GPU. + Shape should be (batch_size, channels, height, width). - Returns: - List of PIL Images + Returns + ------- + List[Image.Image] + List of PIL RGB images, one for each item in the batch. """ # Denormalize on GPU first denormalized = self._denormalize_on_gpu(image_tensor) @@ -873,6 +922,23 @@ def _tensor_to_pil_optimized(self, image_tensor: torch.Tensor) -> List[Image.Ima return pil_images def set_nsfw_fallback_img(self, height: int, width: int) -> None: + """ + Set the NSFW fallback image used when safety checker blocks content. + + Creates a black RGB image of the specified dimensions that will be returned + when the safety checker determines content should be blocked. + + Parameters + ---------- + height : int + Height of the fallback image in pixels. + width : int + Width of the fallback image in pixels. 
+ + Returns + ------- + None + """ self.nsfw_fallback_img = Image.new("RGB", (height, width), (0, 0, 0)) if self.output_type == "pt": self.nsfw_fallback_img = torch.from_numpy(np.array(self.nsfw_fallback_img)).unsqueeze(0) @@ -894,7 +960,7 @@ def _load_model( build_engines_if_missing: bool = True, normalize_prompt_weights: bool = True, normalize_seed_weights: bool = True, - scheduler: Literal["lcm", "dpm++ 2m", "uni_pc", "ddim", "euler"] = "lcm", + scheduler: Literal["lcm", "tcd"] = "lcm", sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = "normal", use_controlnet: bool = False, controlnet_config: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, @@ -924,41 +990,72 @@ def _load_model( Parameters ---------- model_id_or_path : str - The model id or path to load. + The model id or path to load. Can be a Hugging Face model ID, local path to + safetensors/ckpt file, or directory containing model files. t_index_list : List[int] - The t_index_list to use for inference. + The t_index_list to use for inference. Specifies which denoising timesteps + to use from the diffusion schedule. lora_dict : Optional[Dict[str, float]], optional The lora_dict to load, by default None. Keys are the LoRA names and values are the LoRA scales. Example: {'LoRA_1' : 0.5 , 'LoRA_2' : 0.7 ,...} Use this to load LCM LoRA: {'latent-consistency/lcm-lora-sdv1-5': 1.0} vae_id : Optional[str], optional - The vae_id to load, by default None. - acceleration : Literal["none", "xfomers", "sfast", "tensorrt"], optional - The acceleration method, by default "tensorrt". - warmup : int, optional - The number of warmup steps to perform, by default 10. + The vae_id to load, by default None. If None, uses default TinyVAE + ("madebyollin/taesd" for SD1.5, "madebyollin/taesdxl" for SDXL). + acceleration : Literal["none", "xformers", "tensorrt"], optional + The acceleration method, by default "tensorrt". Note: docstring shows + "xfomers" and "sfast" but code uses "xformers". do_add_noise : bool, optional Whether to add noise for following denoising steps or not, by default True. use_lcm_lora : bool, optional - Whether to use LCM-LoRA or not, by default True. # DEPRECATED: Backwards compatibility + DEPRECATED: Use lora_dict instead. For backwards compatibility only. + If True, automatically adds appropriate LCM LoRA to lora_dict based on model type. + SDXL models get "latent-consistency/lcm-lora-sdxl", others get "latent-consistency/lcm-lora-sdv1-5". + By default None (ignored). use_tiny_vae : bool, optional - Whether to use TinyVAE or not, by default True. - cfg_type : Literal["none", "full", "self", "initialize"], - optional + Whether to use TinyVAE or not, by default True. TinyVAE is a distilled, + smaller VAE model that provides faster encoding/decoding with minimal quality loss. + cfg_type : Literal["none", "full", "self", "initialize"], optional The cfg_type for img2img mode, by default "self". You cannot use anything other than "none" for txt2img mode. - seed : int, optional - The seed, by default 2. + engine_dir : Optional[Union[str, Path]], optional + Directory path for storing/loading TensorRT engines, by default "engines". + build_engines_if_missing : bool, optional + Whether to build TensorRT engines if they don't exist, by default True. + normalize_prompt_weights : bool, optional + Whether to normalize prompt weights in blending to sum to 1, by default True. + When False, weights > 1 will amplify embeddings. 
+ normalize_seed_weights : bool, optional + Whether to normalize seed weights in blending to sum to 1, by default True. + When False, weights > 1 will amplify noise. + scheduler : Literal["lcm", "tcd"], optional + The scheduler type to use for denoising, by default "lcm". + sampler : Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"], optional + The sampler type to use for noise scheduling, by default "normal". use_controlnet : bool, optional - Whether to apply ControlNet patch, by default False. + Whether to enable ControlNet support, by default False. controlnet_config : Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional - ControlNet configuration(s), by default None. + ControlNet configuration(s), by default None. Can be a single config dict + or list of config dicts for multiple ControlNets. use_ipadapter : bool, optional - Whether to apply IPAdapter patch, by default False. + Whether to enable IPAdapter support, by default False. ipadapter_config : Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional - IPAdapter configuration(s), by default None. + IPAdapter configuration(s), by default None. Can be a single config dict + or list of config dicts for multiple IPAdapters. + image_preprocessing_config : Optional[Dict[str, Any]], optional + Configuration for image preprocessing hooks, by default None. + image_postprocessing_config : Optional[Dict[str, Any]], optional + Configuration for image postprocessing hooks, by default None. + latent_preprocessing_config : Optional[Dict[str, Any]], optional + Configuration for latent preprocessing hooks, by default None. + latent_postprocessing_config : Optional[Dict[str, Any]], optional + Configuration for latent postprocessing hooks, by default None. + safety_checker_model_id : Optional[str], optional + Model ID for the safety checker, by default "Freepik/nsfw_image_detector". + compile_engines_only : bool, optional + Whether to only compile engines and not load the model, by default False. Returns ------- From 312811c5aead0ccddadb796f7d3091f334096183 Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Mon, 15 Sep 2025 20:42:41 -0400 Subject: [PATCH 12/16] Oops. --- configs/sd15_multicontrol.yaml.example | 1 - 1 file changed, 1 deletion(-) diff --git a/configs/sd15_multicontrol.yaml.example b/configs/sd15_multicontrol.yaml.example index a5e865e1..65f36748 100644 --- a/configs/sd15_multicontrol.yaml.example +++ b/configs/sd15_multicontrol.yaml.example @@ -51,7 +51,6 @@ engine_dir: "./engines/sd15" # Enable multi-modal conditioning use_controlnet: true use_ipadapter: true -use_ipadapter: false # IPAdapter configuration for style conditioning ipadapters: From 54f054642a76942010c467520d58d5e9757fc2cf Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Mon, 15 Sep 2025 21:20:03 -0400 Subject: [PATCH 13/16] Fix for potential xformers issue. 
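
The xformers issue appears to stem from modules that were never moved to the GPU, since only the text encoders were placed explicitly during loading. This patch makes the placement rule explicit: under non-TensorRT acceleration the UNet and VAE are moved to the target device, the TensorRT path leaves them alone because engines replace them, and TinyVAE is created directly on the device. Condensed sketch using the names already in scope in _load_model:

    # Device placement after the pipeline is constructed (sketch).
    if acceleration != "tensorrt":
        pipe.unet = pipe.unet.to(device=self.device)  # engines replace these under TensorRT
        pipe.vae = pipe.vae.to(device=self.device)
    if use_tiny_vae:
        taesd_model = "madebyollin/taesdxl" if is_sdxl else "madebyollin/taesd"
        stream.vae = AutoencoderTiny.from_pretrained(taesd_model).to(
            dtype=pipe.dtype, device=self.device
        )
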
--- src/streamdiffusion/wrapper.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 1c915825..973a5854 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -1149,6 +1149,11 @@ def _load_model( pipe.text_encoder = pipe.text_encoder.to(device=self.device) if hasattr(pipe, "text_encoder_2") and pipe.text_encoder_2 is not None: pipe.text_encoder_2 = pipe.text_encoder_2.to(device=self.device) + # Move main pipeline components to device, but skip UNet for TensorRT + if hasattr(pipe, "unet") and pipe.unet is not None and acceleration != "tensorrt": + pipe.unet = pipe.unet.to(device=self.device) + if hasattr(pipe, "vae") and pipe.vae is not None and acceleration != "tensorrt": + pipe.vae = pipe.vae.to(device=self.device) # If we get here, the model loaded successfully - break out of retry loop logger.info(f"Model loading succeeded") @@ -1267,11 +1272,15 @@ def _load_model( if use_tiny_vae: if vae_id is not None: - stream.vae = AutoencoderTiny.from_pretrained(vae_id).to(dtype=pipe.dtype) + stream.vae = AutoencoderTiny.from_pretrained(vae_id).to(dtype=pipe.dtype, device=self.device) else: # Use TAESD XL for SDXL models, regular TAESD for SD 1.5 taesd_model = "madebyollin/taesdxl" if is_sdxl else "madebyollin/taesd" - stream.vae = AutoencoderTiny.from_pretrained(taesd_model).to(dtype=pipe.dtype) + stream.vae = AutoencoderTiny.from_pretrained(taesd_model).to(dtype=pipe.dtype, device=self.device) + elif acceleration != "tensorrt": + # For non-TensorRT acceleration, ensure VAE is on device if it wasn't moved earlier + if hasattr(pipe, "vae") and pipe.vae is not None: + pipe.vae = pipe.vae.to(device=self.device) try: if acceleration == "xformers": @@ -1920,7 +1929,6 @@ def _load_model( return stream - def get_last_processed_image(self, index: int) -> Optional[Image.Image]: """Forward get_last_processed_image call to the underlying ControlNet pipeline""" if not self.use_controlnet: @@ -1945,14 +1953,12 @@ def update_control_image(self, index: int, image: Union[str, Image.Image, torch. else: logger.debug("update_control_image: Skipping ControlNet update in skip diffusion mode") - def update_style_image(self, image: Union[str, Image.Image, torch.Tensor]) -> None: """Update IPAdapter style image""" if not self.use_ipadapter: raise RuntimeError("update_style_image: IPAdapter support not enabled. Set use_ipadapter=True in constructor.") self.stream.update_style_image(image) - def clear_caches(self) -> None: """Clear all cached prompt embeddings and seed noise tensors.""" self.stream._param_updater.clear_caches() From 7e210eadd91143bcc14891909c802707eda99fbd Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Tue, 16 Sep 2025 16:31:58 -0400 Subject: [PATCH 14/16] Fix to TCD update params. 
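
_update_timestep_calculations previously called get_scalings_for_boundary_condition_discrete on whatever scheduler was active, which TCDScheduler does not provide, so runtime parameter updates could fail under TCD. The new _get_scheduler_scalings helper keeps the LCM behavior and returns neutral unit tensors for TCD, which applies its own scaling inside step(); the placeholders exist only so the per-timestep torch.stack of c_skip / c_out stays well-formed. For reference, the role of those scalings is roughly as follows (hedged, not the exact pipeline code):

    # LCM: denoised = c_out * F(x_t, t) + c_skip * x_t, where F is the x0
    # estimate derived from the noise prediction at the cached timestep.
    # TCD: scheduler.step() performs the equivalent update internally, so the
    # unit c_skip / c_out placeholders are never used as real scalings.
    c_skip, c_out = self._get_scheduler_scalings(timestep)
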
--- src/streamdiffusion/stream_parameter_updater.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/streamdiffusion/stream_parameter_updater.py b/src/streamdiffusion/stream_parameter_updater.py index 81acc89f..0a0bade3 100644 --- a/src/streamdiffusion/stream_parameter_updater.py +++ b/src/streamdiffusion/stream_parameter_updater.py @@ -674,6 +674,20 @@ def _update_seed(self, seed: int) -> None: # Reset stock_noise to match the new init_noise self.stream.stock_noise = torch.zeros_like(self.stream.init_noise) + def _get_scheduler_scalings(self, timestep): + """Get LCM/TCD-specific scaling factors for boundary conditions.""" + from diffusers import LCMScheduler + if isinstance(self.stream.scheduler, LCMScheduler): + c_skip, c_out = self.stream.scheduler.get_scalings_for_boundary_condition_discrete(timestep) + return c_skip, c_out + else: + # TCD and other schedulers don't use boundary condition scaling like LCM + # They handle scaling internally in their step() method + # Return tensors that are compatible with torch.stack() + c_skip = torch.tensor(1.0, device=self.stream.device, dtype=self.stream.dtype) + c_out = torch.tensor(1.0, device=self.stream.device, dtype=self.stream.dtype) + return c_skip, c_out + def _update_timestep_calculations(self) -> None: """Update timestep-dependent calculations based on current t_list.""" self.stream.sub_timesteps = [] @@ -692,7 +706,7 @@ def _update_timestep_calculations(self) -> None: c_skip_list = [] c_out_list = [] for timestep in self.stream.sub_timesteps: - c_skip, c_out = self.stream.scheduler.get_scalings_for_boundary_condition_discrete(timestep) + c_skip, c_out = self._get_scheduler_scalings(timestep) c_skip_list.append(c_skip) c_out_list.append(c_out) From a0779f4a5ce18ada8373364c396f00ddf7c9eb29 Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Wed, 17 Sep 2025 13:45:05 -0400 Subject: [PATCH 15/16] Removal of old fuse method. 
--- src/streamdiffusion/wrapper.py | 96 ++++++++++++++-------------------- 1 file changed, 40 insertions(+), 56 deletions(-) diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 973a5854..2eb83a9f 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -1210,65 +1210,49 @@ def _load_model( scheduler=scheduler, sampler=sampler, ) + + # Load and properly merge LoRA weights using the standard diffusers approach - if not self.sd_turbo: - lora_adapters_to_merge = [] - lora_scales_to_merge = [] - - # Collect all LoRA adapters and their scales from lora_dict - if lora_dict is not None: - for i, (lora_name, lora_scale) in enumerate(lora_dict.items()): - adapter_name = f"custom_lora_{i}" - logger.info(f"_load_model: Loading LoRA '{lora_name}' with scale {lora_scale}") - - try: - # Load LoRA weights with unique adapter name - stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name) - lora_adapters_to_merge.append(adapter_name) - lora_scales_to_merge.append(lora_scale) - logger.info(f"Successfully loaded LoRA adapter: {adapter_name}") - except Exception as e: - logger.error(f"Failed to load LoRA {lora_name}: {e}") - # Continue with other LoRAs even if one fails - continue - - # Merge all LoRA adapters using the proper diffusers method - if lora_adapters_to_merge: + lora_adapters_to_merge = [] + lora_scales_to_merge = [] + + # Collect all LoRA adapters and their scales from lora_dict + if lora_dict is not None: + for i, (lora_name, lora_scale) in enumerate(lora_dict.items()): + adapter_name = f"custom_lora_{i}" + logger.info(f"_load_model: Loading LoRA '{lora_name}' with scale {lora_scale}") + try: - logger.info(f"Merging {len(lora_adapters_to_merge)} LoRA adapter(s) with scales: {lora_scales_to_merge}") - - # Use the proper merge_and_unload method from diffusers - # This permanently merges LoRA weights into the base model parameters - stream.pipe.fuse_lora(lora_scale=lora_scales_to_merge, adapter_names=lora_adapters_to_merge) - - # After fusing, unload the LoRA weights to clean up memory and avoid conflicts - stream.pipe.unload_lora_weights() - - logger.info("Successfully merged and unloaded LoRA weights using diffusers merge_and_unload") - + # Load LoRA weights with unique adapter name + stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name) + lora_adapters_to_merge.append(adapter_name) + lora_scales_to_merge.append(lora_scale) + logger.info(f"Successfully loaded LoRA adapter: {adapter_name}") except Exception as e: - logger.error(f"Failed to merge LoRA weights: {e}") - logger.info("Attempting fallback: individual LoRA merging...") - - # Fallback: merge LoRAs individually - try: - for adapter_name, scale in zip(lora_adapters_to_merge, lora_scales_to_merge): - logger.info(f"Merging individual LoRA: {adapter_name} with scale {scale}") - stream.pipe.fuse_lora(lora_scale=scale, adapter_names=[adapter_name]) - - # Clean up after individual merging - stream.pipe.unload_lora_weights() - logger.info("Successfully merged LoRAs individually") - - except Exception as fallback_error: - logger.error(f"LoRA merging fallback also failed: {fallback_error}") - logger.warning("Continuing without LoRA merging - LoRAs may not be applied correctly") - - # Clean up any partial state - try: - stream.pipe.unload_lora_weights() - except: - pass + logger.error(f"Failed to load LoRA {lora_name}: {e}") + # Continue with other LoRAs even if one fails + continue + + # Merge all LoRA adapters using the proper diffusers method + if 
lora_adapters_to_merge: + try: + for adapter_name, scale in zip(lora_adapters_to_merge, lora_scales_to_merge): + logger.info(f"Merging individual LoRA: {adapter_name} with scale {scale}") + stream.pipe.fuse_lora(lora_scale=scale, adapter_names=[adapter_name]) + + # Clean up after individual merging + stream.pipe.unload_lora_weights() + logger.info("Successfully merged LoRAs individually") + + except Exception as fallback_error: + logger.error(f"LoRA merging fallback also failed: {fallback_error}") + logger.warning("Continuing without LoRA merging - LoRAs may not be applied correctly") + + # Clean up any partial state + try: + stream.pipe.unload_lora_weights() + except: + pass if use_tiny_vae: if vae_id is not None: From 45c79d124eb2d5b2cb5b358c38813eec0d60a607 Mon Sep 17 00:00:00 2001 From: BuffMcBigHuge Date: Tue, 23 Sep 2025 10:40:41 -0400 Subject: [PATCH 16/16] Pushed multi-test scripts. --- multi_test/README.md | 716 ++++++++++ multi_test/enhanced_video_wall.py | 515 +++++++ multi_test/multi_test.py | 2076 +++++++++++++++++++++++++++++ multi_test/prompts.txt | 6 + 4 files changed, 3313 insertions(+) create mode 100644 multi_test/README.md create mode 100644 multi_test/enhanced_video_wall.py create mode 100644 multi_test/multi_test.py create mode 100644 multi_test/prompts.txt diff --git a/multi_test/README.md b/multi_test/README.md new file mode 100644 index 00000000..cbc75590 --- /dev/null +++ b/multi_test/README.md @@ -0,0 +1,716 @@ +# StreamDiffusion Multi-Config Test Suite + +This testing suite allows you to benchmark multiple StreamDiffusion configurations against multiple video files, providing comprehensive performance analysis and reports. + +## Features + +- **Multi-Config Testing**: Test multiple YAML configuration files against multiple video files +- **Resume Functionality**: Continue processing from where you left off after interruptions +- **Individual Prompt Processing**: Process each prompt individually and merge results into combined videos +- **RAM-Based Frame Processing**: Loads all frames into RAM for maximum speed (similar to main.py) +- **RAM-Based Video Creation**: Creates MP4 videos directly from frames in memory (no disk I/O) +- **Automatic Frame Extraction**: Uses ffmpeg to extract frames from videos for processing +- **Framerate Matching**: Output videos maintain the same framerate and timing as input videos +- **Full ControlNet Support**: Automatically loads and configures ControlNets from YAML configs +- **Performance Metrics**: Measures FPS, frame processing times, and success rates +- **Comprehensive Reporting**: Generates multiple report formats (TXT, CSV, JSON) +- **Error Handling**: Gracefully handles failures and continues with remaining tests +- **Resource Management**: Automatically cleans up temporary files and RAM cache +- **Video Merging**: Combines output from multiple prompts into single merged videos +- **JSON Metadata Generation**: Creates detailed JSON files alongside output videos containing configuration details, performance metrics, and processing information for comprehensive analysis and resume support +- **Enhanced Video Wall**: Automatically generates a video wall with rich metadata overlays (config, model, FPS, etc.) 
using ffmpeg, supporting flipped layouts (videos as rows, configs as columns) with fallback to basic wall +- **Retry Failed Combinations**: Supports retrying previously failed config+video pairs during resume without reprocessing successful ones, preserving all prior results +- **Advanced Performance Metrics**: Includes coefficient of variation (CV) for FPS stability, segment-level FPS analysis, and detailed rankings/recommendations in reports for better optimization insights + +## Performance Optimizations + +The test suite is optimized for maximum speed by: + +1. **RAM-Based Processing**: All video frames are loaded into RAM once and reused across multiple prompts/configs +2. **RAM-Based Video Creation**: Creates MP4 videos directly from frames in memory (no temporary files) +3. **Frame Caching**: Frames are cached in memory to avoid reloading from disk +4. **Minimal Disk I/O**: Processing happens entirely in memory, with disk writes only for final output +5. **Efficient Memory Management**: Automatic cleanup of frame cache to prevent memory issues +6. **Batch Processing**: Multiple prompts are processed against the same frames without reloading +7. **Framerate Preservation**: Output videos maintain input video timing for seamless playback +8. **Dual Video Encoding**: Uses imageio (primary) + OpenCV (fallback) for maximum compatibility + +This approach makes the test suite run significantly faster than disk-based alternatives, similar to the real-time processing in `main.py`. + +## Requirements + +- Python 3.7+ +- StreamDiffusion installed +- ffmpeg-python (for enhanced video wall creation with metadata overlays; install with `pip install ffmpeg-python`) +- ffmpeg (for video frame extraction only) +- PyYAML +- PIL/Pillow +- **Video Creation Dependencies**: + - `imageio` (primary video creation) + - `imageio-ffmpeg` (for H.264 encoding) + - `opencv-python` (fallback video creation) +- Sufficient RAM to hold all video frames (typically 2-4GB per video depending on resolution) + +## Installation + +1. Install the core dependencies: +```bash +pip install pyyaml pillow +``` + +2. Install ffmpeg (for frame extraction only): + - **Windows**: Download from https://ffmpeg.org/download.html + - **macOS**: `brew install ffmpeg` + - **Linux**: `sudo apt install ffmpeg` or equivalent + +3. 
Make sure StreamDiffusion is properly installed and accessible + +## Usage + +### Basic Usage + +```bash +# Test with config prompts (original behavior) +python multi_test.py --configs ./myconfigdir --videos ./myinputvideos + +# Test with individual prompts from file +python multi_test.py --configs ./myconfigdir --videos ./myinputvideos --prompts ./my_prompts.txt +``` + +### Command Line Options + +- `--configs`: Directory containing YAML configuration files +- `--videos`: Directory containing video files +- `--output`: Output directory for results (default: `./output-test`) +- `--prompts`: Text file containing individual prompts (one per line, optional) +- `--timeout_seconds`: Maximum time to spend processing each video (default: 300) +- `--resume`: Resume from existing output directory (full path to directory) +- `--retry_failed`: Retry previously failed combinations during resume (default: false) + +### Memory Management Options + +For videos that cause CUDA out-of-memory errors, use these options: + +```bash +# Process fewer frames per batch to reduce memory usage +python multi_test.py --configs ./configs --videos ./videos --batch-size 5 + +# Lower memory threshold for more aggressive cleanup +python multi_test.py --configs ./configs --videos ./videos --memory-threshold 1.0 + +# Process every 2nd frame for very long videos (reduces processing time and memory) +python multi_test.py --configs ./configs --videos ./videos --frame-skip 2 + +# Combine all memory management options +python multi_test.py --configs ./configs --videos ./videos --batch-size 5 --memory-threshold 1.0 --frame-skip 2 +``` + +### Example + +```bash +# Test all configs in ./configs against all videos in ./videos +python multi_test.py --configs ./configs --videos ./videos --output ./benchmark_results + +# Test with individual prompts +python multi_test.py --configs ./configs --videos ./videos --prompts ./prompts.txt --output ./prompt_results + +# Test with custom output directory +python multi_test.py --configs ./my_configs --videos ./my_videos --output ./my_results +``` + +## Resume Functionality + +The test suite now supports resuming interrupted runs, allowing you to continue processing from where you left off without losing previous work. + +### How Resume Works + +1. **Automatic Detection**: Scans existing output directory for completed videos and JSON metadata files +2. **Smart Parsing**: Extracts config+video combinations from existing filenames and JSON metadata +3. **CSV Integration**: Loads existing results from CSV files, including both successful and failed tests +4. **JSON Enrichment**: Loads detailed performance data from JSON metadata files for enhanced analysis +5. **Skip Completed**: Only processes remaining config+video combinations, with option to retry failed ones +6. 
**Seamless Integration**: Updates existing reports and maintains all output files + +### Resume Usage + +```bash +# Start a new test run +python multi_test.py --configs ./configs --videos ./videos --output ./output-multi + +# Resume from existing directory (if interrupted) +python multi_test.py --configs ./configs --videos ./videos --resume "C:\sd\StreamDiffusion\multi_test\20250903_192109" + +# Resume with different prompts (will use same output directory) +python multi_test.py --configs ./configs --videos ./videos --prompts ./prompts.txt --resume "./output-multi/20250903_192109" + +# Resume with different timeout for remaining work +python multi_test.py --configs ./configs --videos ./videos --timeout_seconds 600 --resume "./output-multi/20250903_192109" +``` + +### Resume Benefits + +- **Time Saving**: No need to reprocess completed combinations +- **Memory Efficient**: Continues with existing memory management +- **Progress Preservation**: Maintains all existing output files and reports +- **Flexible**: Can change prompts or timeout for remaining work +- **Robust**: Handles various filename formats and edge cases + +### Resume Output + +When resuming, the test suite will show: + +``` +🔄 Resuming from existing directory: C:\sd\StreamDiffusion\multi_test\20250903_192109 + +🔍 Scanning for completed work in: C:\sd\StreamDiffusion\multi_test\20250903_192109 +📊 Loading existing results from CSV: detailed_results.csv +✅ Loaded 8 successful results from CSV +🎬 Scanning video files in directory... + 📹 Analyzing: sdxl_depth_trt_ta86_cn_lcm_20250903-2317-28.5538735_ta86AllrounderXL_sdxlV1_merged_7prompts + ✅ Found completed: sdxl_depth_trt_ta86_cn_lcm + 20250903-2317-28.5538735 + +📋 Resume Summary: + Found 8 completed config+video combinations + Found 8 results with performance data + Completed combinations: + ✅ sdxl_depth_trt_ta86_cn_lcm + 20250903-2317-28.5538735 + ✅ sdxl_depth_trt_ta86_cn_lcm + 20250903-2319-02.8209091 + ... + +📊 Work Summary: + Total combinations: 12 + Already completed: 8 + Remaining to process: 4 + ⏭️ Skipping 8 completed combinations +🚀 Starting processing of 4 remaining combinations... +``` + +### Important Notes + +- **Directory Path**: Resume directory must exist and contain previous results +- **Config/Video Consistency**: Use the same config and video directories as the original run +- **Flexible Parameters**: Prompts file and timeout can be different for remaining work +- **Safety Checks**: Double-checks combinations to prevent duplicate processing +- **Progress Tracking**: Shows clear distinction between resumed and new work + +## Prompt Processing Modes + +### Mode 1: Config Prompts (Default) +When no `--prompts` file is provided, the test suite uses the prompts defined in your YAML config files: +```yaml +prompt: "A beautiful landscape" +negative_prompt: "low quality, bad quality, blurry" +``` + +### Mode 2: Individual Prompts (Temporal Splitting) +When `--prompts` file is provided, the test suite: +1. **Ignores** the `prompt` field in your YAML configs +2. **Splits the video temporally** across prompts (e.g., 30s video with 3 prompts = 10s each) +3. **Processes each prompt against its time segment** (much more efficient than full video processing) +4. **Merges all prompt outputs** into a single combined video +5. **Reports performance** for each prompt separately +6. 
**No pipeline restart** - uses StreamDiffusion's dynamic prompt updating + +## Prompts File Format + +Create a text file with one prompt per line: + +```txt +A hyperrealistic close-up of a man in a crimson silk, windswept auburn hair framing a freckled face, standing on a sun-drenched beach; fine sand clinging to her bare feet, the vast ocean a turquoise expanse behind her, conveying a sense of serene solitude. +A cinematic portrait of a woman with flowing golden hair, wearing an elegant emerald dress, standing in a moonlit garden surrounded by blooming roses and twinkling fairy lights. +A dramatic close-up of a warrior with battle-scarred armor, steely blue eyes reflecting determination, standing against a stormy sky with lightning illuminating ancient castle ruins in the background. +``` + +## Temporal Prompt Splitting + +The test suite now uses **temporal splitting** for maximum efficiency when processing multiple prompts: + +### How It Works + +1. **Video Segmentation**: The input video is divided into equal time segments based on the number of prompts +2. **Frame Distribution**: Each prompt processes only its assigned frames (e.g., frames 1-100 for prompt 1, frames 101-200 for prompt 2) +3. **Dynamic Prompt Updates**: Uses `stream.update_prompt()` to change prompts without restarting the pipeline +4. **Efficient Processing**: Each frame is processed only once with its corresponding prompt + +### Example: 30-Second Video with 3 Prompts + +- **Total Frames**: 900 frames (30fps × 30 seconds) +- **Prompt 1**: Frames 1-300 (0-10 seconds) → "Stained glass style..." +- **Prompt 2**: Frames 301-600 (10-20 seconds) → "Cinematic portrait..." +- **Prompt 3**: Frames 601-900 (20-30 seconds) → "Dramatic warrior..." + +### Benefits + +- **3x Faster**: Video processed once instead of three times +- **Memory Efficient**: No duplicate frame storage +- **Seamless Transitions**: Smooth prompt changes between segments +- **Professional Quality**: Each time segment gets dedicated prompt processing +- **Pipeline Optimization**: Leverages StreamDiffusion's dynamic prompt updating + +## Directory Structure + +``` +project/ +├── configs/ # Your YAML config files +│ ├── config1.yaml +│ ├── config2.yaml +│ └── ... +├── videos/ # Your video files +│ ├── video1.mp4 +│ ├── video2.avi +│ └── ... +├── prompts.txt # Individual prompts (optional) +├── test_results/ # Output directory (created automatically) +│ ├── test_summary.txt +│ ├── detailed_results.csv +│ ├── performance_comparison.txt +│ ├── config1_video1_merged.mp4 # Merged video (when using prompts) +│ └── individual_results/ +└── multi_test.py # The test suite script +``` + +## Configuration File Format + +Your YAML config files should follow the StreamDiffusion format. 
When using `--prompts`, the `prompt` field is ignored: + +```yaml +model_id: "runwayml/stable-diffusion-v1-5" +width: 512 +height: 512 +t_index_list: [32, 40, 45] +acceleration: "xformers" +guidance_scale: 1.2 +num_inference_steps: 50 +# prompt: "This is ignored when using --prompts" +negative_prompt: "low quality, bad quality, blurry" +use_denoising_batch: true +cfg_type: "self" +seed: 42 +``` + +### Required Fields + +- `model_id`: Path to the model checkpoint +- `width`: Image width (must be multiple of 64) +- `height`: Image height (must be multiple of 64) + +### Optional Fields + +- `t_index_list`: Denoising timesteps (default: [32, 40, 45]) +- `acceleration`: Acceleration method (default: "xformers") +- `guidance_scale`: CFG scale (default: 1.2) +- `num_inference_steps`: Number of inference steps (default: 50) +- `negative_prompt`: Negative prompt (default: "low quality, bad quality, blurry") +- `use_denoising_batch`: Use denoising batch (default: true) +- `cfg_type`: CFG type (default: "self") +- `seed`: Random seed (default: 42) + +**Note**: When using `--prompts`, the `prompt` field in your config is ignored. + +## ControlNet Support + +The test suite automatically detects and configures ControlNets from your YAML configuration files: + +### ControlNet Configuration Format + +```yaml +model_id: "runwayml/stable-diffusion-v1-5" +width: 512 +height: 512 +acceleration: "xformers" + +# ControlNet configurations +controlnets: + - model_id: "lllyasviel/control_v11p_sd15_canny" + preprocessor: "canny" + conditioning_scale: 1.0 + enabled: true + preprocessor_params: + low_threshold: 100 + high_threshold: 200 + + - model_id: "lllyasviel/control_v11p_sd15_depth" + preprocessor: "depth" + conditioning_scale: 0.8 + enabled: true + preprocessor_params: + depth_estimator: "dpt_large" +``` + +### Supported Preprocessors + +- **canny**: Edge detection with configurable thresholds +- **depth**: Depth estimation using various models +- **openpose**: Human pose estimation +- **scribble**: Free-form drawing input +- **segmentation**: Semantic segmentation +- **passthrough**: Direct image input without preprocessing + +### ControlNet Integration + +- **Automatic Loading**: ControlNets are loaded when the pipeline is created +- **Preprocessor Setup**: Preprocessors are automatically configured with your parameters +- **Performance Impact**: ControlNet processing is included in FPS measurements +- **Memory Management**: ControlNet models are properly managed alongside the main pipeline + +## Video File Support + +The test suite supports common video formats: +- MP4 (.mp4) +- AVI (.avi) +- MOV (.mov) +- MKV (.mkv) +- WebM (.webm) +- FLV (.flv) + +Videos are automatically converted to frames at 30 FPS for processing. + +## Framerate Matching + +The test suite automatically detects and preserves the input video's framerate in all output videos: + +### How It Works + +1. **Automatic Detection**: Uses `ffprobe` to extract the exact framerate from input videos +2. **Timing Preservation**: Output videos maintain the same frame timing as input videos +3. 
**Frame Distribution**: Generated frames are distributed to match input video timing + +### Example Scenarios + +**Scenario 1: Input 30fps, Processing 30fps** +- Input: 100 frames at 30fps (3.33 seconds) +- Processing: Generates 100 frames +- Output: 100 frames at 30fps (3.33 seconds) - Perfect match + +**Scenario 2: Input 30fps, Processing 15fps** +- Input: 100 frames at 30fps (3.33 seconds) +- Processing: Generates 50 frames +- Output: 50 frames at 30fps (3.33 seconds) - Each frame displayed for 2 input frame durations + +**Scenario 3: Input 30fps, Processing 60fps** +- Input: 100 frames at 30fps (3.33 seconds) +- Processing: Generates 200 frames +- Output: 200 frames at 30fps (3.33 seconds) - Each input frame duration shows 2 generated frames + +### Benefits + +- **Seamless Playback**: Output videos can be played alongside input videos +- **Consistent Timing**: All output videos maintain original video timing +- **Professional Quality**: Suitable for video editing and compositing workflows +- **Frame Accuracy**: Precise frame duration calculations using ffmpeg + +## Memory Management + +The test suite uses intelligent memory management: + +1. **Frame Loading**: Frames are loaded into RAM once per video +2. **Caching**: Frames are cached and reused across multiple configs/prompts +3. **Automatic Cleanup**: Frame cache is cleared after processing to free memory +4. **Memory Estimation**: Each frame typically uses 2-4MB depending on resolution + +**Memory Requirements**: Ensure you have sufficient RAM to hold all frames from your longest video. For a 1000-frame 512x512 video, expect ~2-4GB RAM usage. + +## Output Files + +After running the test suite, you'll get several output files: + +### 1. Test Summary (`test_summary.txt`) +- Overall test statistics +- Results grouped by configuration +- Results grouped by video +- Top 5 performing configurations +- Prompt processing information (when using `--prompts`) + +### 2. Detailed Results (`detailed_results.csv`) +- CSV format with all test results +- Individual frame processing times +- Success/failure status +- Error messages for failed tests +- Prompt processing details (when using `--prompts`) + +### 3. Performance Comparison (`performance_comparison.txt`) +- Performance comparison between configurations +- Average FPS for each config +- Sorted results by performance +- Individual prompt performance (when using `--prompts`) + +### 4. Individual Results (`*_result.json`) +- JSON files for each config-video combination +- Detailed metrics and configuration parameters +- Frame-by-frame timing data +- Prompt-by-prompt results (when using `--prompts`) + +### 4.1. Video Metadata (`*_metadata.json`) +- Comprehensive JSON files generated alongside each output video +- Contains structured data for resume functionality and analysis +- Structure: + - **video_info**: Config filename, video filename, output filename, total frames, prompts used, processing date + - **config_details**: Model ID, resolution (width/height), inference steps, guidance scale, negative prompt + - **performance_metrics**: Overall FPS, min/max/avg FPS, standard deviation, CV percentage, segment FPS list, total processing time + - **technical_details**: Timeout seconds, start/end times, success status +- Used for enhanced video wall overlays and detailed performance tracking + +### 5. 
Merged Videos (when using `--prompts`) +- `{config}_{video}_merged.mp4`: Combined video from all prompts +- Each frame sequence from a prompt is concatenated into the final video +- Maintains input video framerate and timing + +### 6. Single Prompt Videos (when not using `--prompts`) +- `{config}_{video}_output.mp4`: Output video for single prompt processing +- Maintains input video framerate and timing +- Suitable for direct comparison with input videos + +### 7. Video Timing Information +All output videos automatically: +- Match the input video's framerate (e.g., 30fps, 24fps, 60fps) +- Preserve the original video's timing and duration +- Use precise frame duration calculations for professional quality + +## Example Output + +### With Config Prompts +``` +StreamDiffusion Multi-Config Test Suite Results +============================================================ + +Overall Results: + Total tests: 6 + Successful: 6 + Failed: 0 + Success rate: 100.0% + +Quick Performance Summary: +---------------------------------------------------------------------------------------------------- +Config Video Resolution Overall FPS Avg FPS Min FPS Max FPS Frames +---------------------------------------------------------------------------------------------------- +config1 video1.mp4 512x512 15.23 15.23 14.89 15.67 300 +config1 video2.mp4 512x512 14.89 14.89 14.50 15.28 300 +config2 video1.mp4 512x512 18.45 18.45 18.10 18.80 300 +---------------------------------------------------------------------------------------------------- + +Results by Config: + config1: + Tests: 2/2 successful + Model: runwayml/stable-diffusion-v1-5 + Resolution: 512x512 + ✅ video1.mp4 (300 frames) - Overall FPS: 15.23, Min FPS: 14.89, Max FPS: 15.67, Avg FPS: 15.23, CV: 2.5% + ✅ video2.mp4 (300 frames) - Overall FPS: 14.89, Min FPS: 14.50, Max FPS: 15.28, Avg FPS: 14.89, CV: 2.7% + +Overall FPS Rankings (Higher is Better): + 1. config2 - 18.45 FPS (Avg: 18.45, Range: 18.10-18.80) + 2. 
config1 - 15.23 FPS (Avg: 15.23, Range: 14.89-15.67) + +Performance Statistics: + Overall FPS - Best: 18.45, Worst: 14.89, Mean: 16.19 + Average FPS - Best: 18.45, Worst: 14.89, Mean: 16.19 + Min FPS - Best: 18.10, Worst: 14.50, Mean: 15.83 + Max FPS - Best: 18.80, Worst: 15.28, Mean: 16.58 + +Recommendations: +🏆 Best Overall Performance: config2 + - Highest sustained FPS: 18.45 + - Best for: Maximum throughput scenarios + +📊 Most Consistent Performance: config1 + - Lowest variance: 2.5% CV + - Best for: Real-time applications requiring stable frame rates +``` + +### With Individual Prompts +``` +StreamDiffusion Multi-Config Test Suite Results +Using 3 individual prompts from prompts.txt +============================================================ + +Overall Results: + Total tests: 6 + Successful: 6 + Failed: 0 + Success rate: 100.0% + +Quick Performance Summary: +---------------------------------------------------------------------------------------------------- +Config Video Resolution Overall FPS Avg FPS Min FPS Max FPS Frames +---------------------------------------------------------------------------------------------------- +config1 video1.mp4 512x512 15.23 15.23 14.89 15.67 900 +config1 video2.mp4 512x512 14.89 14.89 14.50 15.28 900 +config2 video1.mp4 512x512 18.45 18.45 18.10 18.80 900 +---------------------------------------------------------------------------------------------------- + +Results by Config: + config1: + Tests: 2/2 successful + Model: runwayml/stable-diffusion-v1-5 + Resolution: 512x512 + Prompts processed: 3/3 successful + ✅ video1.mp4 (900 frames, 3 prompts) - Overall FPS: 15.23, Min FPS: 14.89, Max FPS: 15.67, Avg FPS: 15.23, CV: 2.5% + ✅ video2.mp4 (900 frames, 3 prompts) - Overall FPS: 14.89, Min FPS: 14.50, Max FPS: 15.28, Avg FPS: 14.89, CV: 2.7% + +Performance Consistency Analysis: +Configs ranked by FPS stability (lower variance = more stable): + 1. config1 - CV: 2.5% (Std: 0.38, Range: 0.78) + Mean: 15.23 FPS, Min: 14.89, Max: 15.67 + 2. config2 - CV: 2.1% (Std: 0.35, Range: 0.70) + Mean: 18.45 FPS, Min: 18.10, Max: 18.80 + +Best Config per Video (Overall FPS): +------------------------------------------------------------ +video1.mp4 -> config2 (18.45 FPS, Avg: 18.45) +video2.mp4 -> config1 (14.89 FPS, Avg: 14.89) + +Performance Improvement Analysis: +Best Overall Config: config2 (18.45 FPS) + +Performance vs Best (Overall FPS): + config1 - 15.23 FPS (+21.1% vs best) + +Recommendations: +⚖️ Best Balanced (Performance + Consistency): config2 + - Balanced score: 0.850 + - Performance: 18.45 FPS, Consistency: 2.1% CV + - Best for: Production environments requiring both speed and reliability +``` + +## Performance Tips + +1. **Use Temporal Splitting**: With `--prompts`, videos are processed once instead of multiple times (3x faster) +2. **Use TensorRT**: Set `acceleration: "tensorrt"` in your configs for best performance +3. **Optimize t_index_list**: Lower values (e.g., [10, 15]) for faster processing, higher values for better quality +4. **Batch Processing**: Enable `use_denoising_batch: true` for better throughput +5. **Resolution**: Lower resolutions process faster but may reduce quality +6. **Model Selection**: Smaller models (SD1.5 vs SDXL) generally process faster +7. **Prompt Length**: Shorter prompts generally process faster than very long, detailed ones +8. **RAM Optimization**: The suite automatically caches frames in RAM for maximum speed +9. 
**Framerate Optimization**: Output videos automatically match input timing for professional workflows +10. **Dynamic Prompt Updates**: Leverages StreamDiffusion's built-in prompt switching without pipeline restarts +11. **ControlNet Optimization**: Use fewer ControlNets and lower conditioning scales for faster processing +12. **Preprocessor Selection**: Choose efficient preprocessors (e.g., passthrough > canny > depth > openpose) + +## Troubleshooting + +### Common Issues + +1. **ffmpeg not found**: Install ffmpeg and ensure it's in your PATH +2. **CUDA out of memory**: Use memory management options (see below) +3. **Config validation errors**: Check that required fields are present in your YAML files +4. **Model loading failures**: Verify model paths and ensure models are accessible +5. **Video merging fails**: Ensure ffmpeg supports the concat demuxer +6. **Out of memory**: Reduce video resolution or frame count, or close other applications +7. **Framerate detection fails**: Ensure ffprobe is available and input videos are valid + +### CUDA Memory Issues + +If you encounter `CUDA out of memory` errors, the test suite now includes several memory management features: + +#### 1. Batch Processing +Process frames in smaller batches to reduce memory usage: +```bash +# Default: 10 frames per batch +python multi_test.py --configs ./configs --videos ./videos + +# Reduce to 5 frames per batch for lower memory usage +python multi_test.py --configs ./configs --videos ./videos --batch-size 5 + +# Very conservative: 3 frames per batch +python multi_test.py --configs ./configs --videos ./videos --batch-size 3 +``` + +#### 2. Memory Threshold Management +Set when automatic memory cleanup should occur: +```bash +# Default: Cleanup when less than 2GB free +python multi_test.py --configs ./configs --videos ./videos + +# More aggressive: Cleanup when less than 1GB free +python multi_test.py --configs ./configs --videos ./videos --memory-threshold 1.0 + +# Very aggressive: Cleanup when less than 0.5GB free +python multi_test.py --configs ./configs --videos ./videos --memory-threshold 0.5 +``` + +#### 3. Frame Skipping +For very long videos, process every Nth frame to reduce memory and time: +```bash +# Process every frame (default) +python multi_test.py --configs ./configs --videos ./videos + +# Process every 2nd frame (2x faster, 2x less memory) +python multi_test.py --configs ./configs --videos ./videos --frame-skip 2 + +# Process every 3rd frame (3x faster, 3x less memory) +python multi_test.py --configs ./configs --videos ./videos --frame-skip 3 +``` + +#### 4. Combined Memory Management +Use all options together for maximum memory efficiency: +```bash +python multi_test.py --configs ./configs --videos ./videos \ + --batch-size 3 \ + --memory-threshold 0.5 \ + --frame-skip 2 +``` + +#### 5. Automatic Memory Recovery +The test suite now automatically: +- Monitors GPU memory usage in real-time +- Cleans up memory after each batch and prompt +- Retries failed frames after memory cleanup +- Provides detailed memory status information +- Gracefully handles out-of-memory errors without crashing + +### Debug Mode + +For detailed logging, you can modify the script to add more verbose output or check the individual result JSON files for specific error details. + +## Advanced Usage + +### Custom Frame Extraction + +You can modify the `extract_frames_from_video` method to customize frame extraction parameters (FPS, format, etc.). 
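+For example, here is a minimal sketch of a customized extractor. It assumes the suite shells
+out to the `ffmpeg` CLI; the signature and defaults below are illustrative, not the suite's
+actual implementation:
+
+```python
+import os
+import subprocess
+from typing import List
+
+def extract_frames_from_video(video_path: str, frames_dir: str,
+                              fps: int = 30, image_format: str = "png") -> List[str]:
+    """Extract frames from a video at a fixed FPS using the ffmpeg CLI."""
+    os.makedirs(frames_dir, exist_ok=True)
+    output_pattern = os.path.join(frames_dir, f"frame_%06d.{image_format}")
+    subprocess.run(
+        [
+            "ffmpeg", "-y",           # overwrite any existing frames
+            "-i", video_path,         # input video
+            "-vf", f"fps={fps}",      # resample to the requested frame rate
+            output_pattern,
+        ],
+        check=True,
+        capture_output=True,
+    )
+    # Return the extracted frame paths in order
+    return sorted(
+        os.path.join(frames_dir, name)
+        for name in os.listdir(frames_dir)
+        if name.endswith(f".{image_format}")
+    )
+```
+
+Lowering `fps` (for example, `fps=15`) reduces both the number of frames processed and the RAM
+needed to cache them, at the cost of temporal smoothness in the output video.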
+ +### Custom Metrics + +Extend the `TestResult` dataclass to include additional metrics like memory usage, GPU utilization, etc. + +### Parallel Processing + +For faster testing, you could modify the suite to process multiple configs in parallel (requires careful resource management). + +### Custom Video Merging + +Modify the `merge_videos_from_prompts` method to customize how videos are combined (different frame rates, transitions, etc.). + +### Framerate Customization + +You can modify the `get_video_framerate` method to implement custom framerate detection logic or override framerates for specific use cases. + +## Example Workflow + +### With Individual Prompts + +1. **Setup**: Create directories and add your configs, videos, and prompts file +2. **Run Tests**: Execute the test suite with `--prompts prompts.txt` +3. **Analyze Results**: Review the generated reports and merged videos +4. **Optimize**: Use results to tune your configurations and prompts +5. **Iterate**: Run tests again with optimized configs + +### Without Individual Prompts + +1. **Setup**: Create directories and add your configs and videos +2. **Run Tests**: Execute the test suite (uses config prompts) +3. **Analyze Results**: Review the generated reports and output videos +4. **Optimize**: Use results to tune your configurations +5. **Iterate**: Run tests again with optimized configs + +## Contributing + +Feel free to extend the test suite with additional features: +- Memory usage tracking +- GPU utilization monitoring +- Quality metrics (PSNR, SSIM) +- Automated optimization suggestions +- Integration with CI/CD pipelines +- Custom video effects and transitions +- Prompt performance analysis and optimization +- Advanced memory management strategies +- Custom framerate handling and video processing \ No newline at end of file diff --git a/multi_test/enhanced_video_wall.py b/multi_test/enhanced_video_wall.py new file mode 100644 index 00000000..42c09910 --- /dev/null +++ b/multi_test/enhanced_video_wall.py @@ -0,0 +1,515 @@ +#!/usr/bin/env python3 +""" +Enhanced Video Wall Creator using JSON metadata + +This module creates video walls with rich metadata information from JSON files +stored alongside each processed video, providing better data for resume functionality. +""" + +import os +import json +from pathlib import Path +from typing import Dict, List, Optional + +try: + import ffmpeg +except ImportError: + print("Warning: ffmpeg-python library not found. Enhanced video wall creation will be disabled.") + ffmpeg = None + +def load_video_metadata(results_dir: str) -> Dict[str, Dict]: + """ + Load all JSON metadata files from results directory. 
+ + Parameters + ---------- + results_dir : str + Directory containing video results and JSON metadata + + Returns + ------- + Dict[str, Dict] + Dictionary mapping video filenames to their metadata + """ + metadata_dict = {} + + if not os.path.exists(results_dir): + return metadata_dict + + print(f"📋 Loading video metadata from: {results_dir}") + + try: + json_files = [f for f in os.listdir(results_dir) if f.endswith('_metadata.json')] + + for json_file in json_files: + json_path = os.path.join(results_dir, json_file) + try: + with open(json_path, 'r', encoding='utf-8') as f: + metadata = json.load(f) + + # Extract video filename from metadata + video_info = metadata.get('video_info', {}) + output_filename = video_info.get('output_filename', '') + + if output_filename: + metadata_dict[output_filename] = metadata + + except Exception as e: + print(f" ⚠️ Warning: Could not load {json_file}: {e}") + + print(f" ✅ Loaded metadata for {len(metadata_dict)} videos") + + except Exception as e: + print(f" ❌ Error loading metadata: {e}") + + return metadata_dict + +def create_enhanced_video_with_metadata( + input_path: str, + output_path: str, + metadata: Dict, + width: int, + height: int +) -> bool: + """ + Create a scaled video with enhanced metadata overlay using ffmpeg-python + + Args: + input_path: Path to input video + output_path: Path to output video + metadata: Video metadata dictionary + width: Target width + height: Target height + + Returns: + True if successful, False otherwise + """ + if ffmpeg is None: + return False + + try: + # Extract key information from metadata + video_info = metadata.get('video_info', {}) + config_details = metadata.get('config_details', {}) + performance = metadata.get('performance_metrics', {}) + + config_name = video_info.get('config_filename', 'Unknown') + model_name = config_details.get('model_id', 'Unknown').split('/')[-1] + resolution = f"{config_details.get('width', '?')}x{config_details.get('height', '?')}" + overall_fps = performance.get('overall_fps', 0) + avg_fps = performance.get('avg_fps', 0) + total_frames = video_info.get('total_frames', 0) + processing_time = performance.get('total_processing_time', 0) + + # Create multi-line text overlay with rich information + text_lines = [ + f"Config: {config_name}", + f"Model: {model_name}", + f"Resolution: {resolution}", + f"Frames: {total_frames}", + f"Overall FPS: {overall_fps:.1f}", + f"Avg FPS: {avg_fps:.1f}", + f"Time: {processing_time:.1f}s" + ] + + # Create the processing pipeline + stream = ffmpeg.input(input_path) + + # Scale video to target size with padding + scaled = ffmpeg.filter( + stream, + 'scale', + width, height, + force_original_aspect_ratio='decrease' + ) + + padded = ffmpeg.filter( + scaled, + 'pad', + width, height, + '(ow-iw)/2', '(oh-ih)/2' + ) + + # Add multi-line text overlay + font_path = r'C:/Windows/Fonts/arial.ttf' + + # Start with the padded video + current_stream = padded + + # Add each line of text + for i, line in enumerate(text_lines): + y_position = 10 + (i * 25) # 25 pixels between lines + + if os.path.exists(font_path): + current_stream = ffmpeg.drawtext( + current_stream, + text=line, + fontfile=font_path, + fontcolor='white', + fontsize=16, + box=1, + boxcolor='black@0.8', + boxborderw=3, + x=10, + y=y_position + ) + else: + # Fallback without fontfile + current_stream = ffmpeg.drawtext( + current_stream, + text=line, + fontcolor='white', + fontsize=16, + box=1, + boxcolor='black@0.8', + boxborderw=3, + x=10, + y=y_position + ) + + # Output with encoding settings + 
output = ffmpeg.output( + current_stream, + output_path, + vcodec='libx264', + crf=23, + preset='medium' + ) + + # Run the pipeline + ffmpeg.run(output, overwrite_output=True, quiet=True) + return True + + except Exception as e: + print(f" Error processing enhanced video: {e}") + return False + +def create_enhanced_video_wall( + results_dir: str, + output_path: str, + grid_width: int = 3, # Kept for backward compatibility, but not used in new layout + video_width: int = 512, + video_height: int = 512 +) -> Optional[str]: + """ + Create an enhanced video wall using JSON metadata for rich information display. + + The layout is automatically determined: each row represents one original video, + and columns represent different configurations (including original). + + Parameters + ---------- + results_dir : str + Directory containing video results and JSON metadata + output_path : str + Path for the output video wall + grid_width : int, optional + Legacy parameter kept for backward compatibility (not used in new layout) + video_width : int, optional + Width of each video in the wall, by default 512 + video_height : int, optional + Height of each video in the wall, by default 512 + + Returns + ------- + Optional[str] + Path to the created video wall, or None if failed + """ + + print("\n🎬 Creating enhanced video wall with JSON metadata...") + + if ffmpeg is None: + print(" ❌ Skipping - ffmpeg-python not available") + return None + + # Load metadata for all videos + metadata_dict = load_video_metadata(results_dir) + + if not metadata_dict: + print(" ❌ No video metadata found") + return None + + # Find corresponding video files + video_files = [] + enhanced_metadata = [] + + for filename, metadata in metadata_dict.items(): + video_path = os.path.join(results_dir, filename) + if os.path.exists(video_path): + video_files.append(video_path) + enhanced_metadata.append(metadata) + else: + print(f" ⚠️ Warning: Video file not found: {filename}") + + if not video_files: + print(" ❌ No video files found") + return None + + print(f" 📹 Processing {len(video_files)} videos for enhanced wall") + + # Create temporary directory for processing + temp_dir = os.path.join(os.path.dirname(output_path), "temp_enhanced_wall") + os.makedirs(temp_dir, exist_ok=True) + + try: + # Process each video with enhanced metadata overlay + processed_videos = [] + min_duration = float('inf') + + print(" 🔄 Processing videos with enhanced metadata...") + for i, (video_path, metadata) in enumerate(zip(video_files, enhanced_metadata)): + print(f" Processing {i+1}/{len(video_files)}: {os.path.basename(video_path)}") + + # Get video duration + try: + probe = ffmpeg.probe(video_path) + duration = float(probe['format']['duration']) + min_duration = min(min_duration, duration) + except Exception as e: + print(f" Warning: Could not get duration: {e}") + duration = 10 + min_duration = min(min_duration, duration) + + # Create enhanced video with metadata overlay + enhanced_path = os.path.join(temp_dir, f"enhanced_{i:03d}.mp4") + + success = create_enhanced_video_with_metadata( + video_path, + enhanced_path, + metadata, + video_width, + video_height + ) + + if success: + processed_videos.append(enhanced_path) + print(f" ✅ Enhanced video created") + else: + print(f" ❌ Failed to enhance video") + + if not processed_videos: + print(" ❌ No videos were successfully processed") + return None + + if min_duration == float('inf'): + min_duration = 10 + + print(f" 🎬 Creating video wall with flipped layout...") + + # Create video wall grid + input_streams 
= [] + + # Load all processed videos as input streams + for video_path in processed_videos: + stream = ffmpeg.input(video_path) + input_streams.append(stream) + + # Pad with blank videos if needed to fill grid + total_videos = len(input_streams) + videos_per_row = grid_width + rows_needed = (total_videos + videos_per_row - 1) // videos_per_row + total_slots = rows_needed * videos_per_row + + # Create blank videos for empty slots + for i in range(total_videos, total_slots): + blank_path = os.path.join(temp_dir, f"blank_{i}.mp4") + + # Create a blank video + blank_input = ffmpeg.input( + f'color=c=gray:s={video_width}x{video_height}:d={min_duration}', + f='lavfi' + ) + + blank_with_text = ffmpeg.drawtext( + blank_input, + text='No Video', + fontcolor='white', + fontsize=24, + x='(w-text_w)/2', + y='(h-text_h)/2' + ) + + blank_output = ffmpeg.output(blank_with_text, blank_path, vcodec='libx264', crf=23) + ffmpeg.run(blank_output, overwrite_output=True, quiet=True) + + input_streams.append(ffmpeg.input(blank_path)) + + # Create rows with flipped layout: each row = one original video + all its config outputs + # This assumes videos are ordered as: original_video1, config1_video1, config2_video1, ..., original_video2, config1_video2, etc. + + # First, determine how many configs (including original) we have + # We need to figure this out from the metadata + if processed_videos: + # Get unique config names from metadata + config_names = set() + for metadata in enhanced_metadata: + config_names.add(metadata.get('video_info', {}).get('config_filename', 'Unknown')) + config_names = ['original'] + sorted(list(config_names)) + + # Get unique video names + video_names = set() + for metadata in enhanced_metadata: + # Extract original video name from the output filename + output_filename = metadata.get('video_info', {}).get('output_filename', '') + if output_filename: + # Try to extract video name - this depends on naming convention + # Assuming format like: configName_videoName_merged_5prompts.mp4 + parts = output_filename.replace('.mp4', '').split('_') + if len(parts) >= 2: + video_name = parts[1] # Second part should be video name + video_names.add(video_name) + video_names = sorted(list(video_names)) + + print(f" Detected {len(video_names)} videos and {len(config_names)} configs (including original)") + print(f" Flipped layout: {len(video_names)} rows x {len(config_names)} columns") + + # Reorder streams for flipped layout + reordered_streams = [] + for video_name in video_names: + for config_name in config_names: + # Find the stream for this video+config combination + found = False + for i, metadata in enumerate(enhanced_metadata): + video_info = metadata.get('video_info', {}) + if (video_info.get('config_filename') == config_name or + (config_name == 'original' and 'original' in video_info.get('output_filename', ''))): + # Check if this is the right video + output_filename = video_info.get('output_filename', '') + if video_name in output_filename: + reordered_streams.append(input_streams[i]) + found = True + break + + if not found: + # Create placeholder for missing combination + placeholder_path = os.path.join(temp_dir, f"placeholder_{video_name}_{config_name}.mp4") + placeholder_text = f"MISSING_{config_name}_{video_name}".replace(' ', '_') + + try: + blank_input = ffmpeg.input( + f'color=c=gray:s={video_width}x{video_height}:d={min_duration}', + f='lavfi' + ) + + blank_with_text = ffmpeg.drawtext( + blank_input, + text=placeholder_text, + fontcolor='white', + fontsize=24, + x='(w-text_w)/2', + 
y='(h-text_h)/2' + ) + + blank_output = ffmpeg.output(blank_with_text, placeholder_path, vcodec='libx264', crf=23) + ffmpeg.run(blank_output, overwrite_output=True, quiet=True) + + reordered_streams.append(ffmpeg.input(placeholder_path)) + except Exception as e: + print(f" Failed to create placeholder: {e}") + return None + + # Create rows from reordered streams + rows = [] + for row_idx in range(len(video_names)): + start_idx = row_idx * len(config_names) + end_idx = start_idx + len(config_names) + row_streams = reordered_streams[start_idx:end_idx] + + if len(row_streams) > 1: + row_combined = ffmpeg.filter(row_streams, 'hstack', inputs=len(row_streams)) + else: + row_combined = row_streams[0] + + rows.append(row_combined) + + # Combine rows vertically + if len(rows) > 1: + final_grid = ffmpeg.filter(rows, 'vstack', inputs=len(rows)) + else: + final_grid = rows[0] + else: + # Fallback to original logic if metadata parsing fails + print(" Warning: Could not parse metadata for flipped layout, using original grid") + rows = [] + for row_idx in range(rows_needed): + start_idx = row_idx * videos_per_row + end_idx = min(start_idx + videos_per_row, len(input_streams)) + row_streams = input_streams[start_idx:end_idx] + + if len(row_streams) > 1: + row_combined = ffmpeg.filter(row_streams, 'hstack', inputs=len(row_streams)) + else: + row_combined = row_streams[0] + + rows.append(row_combined) + + # Combine rows vertically + if len(rows) > 1: + final_grid = ffmpeg.filter(rows, 'vstack', inputs=len(rows)) + else: + final_grid = rows[0] + + # Trim to minimum duration and output + trimmed = ffmpeg.filter(final_grid, 'trim', duration=min_duration) + final_output = ffmpeg.output( + trimmed, + output_path, + vcodec='libx264', + crf=20, + preset='medium' + ) + + print(" 🎬 Rendering final enhanced video wall...") + ffmpeg.run(final_output, overwrite_output=True, quiet=True) + + print(f" ✅ Enhanced video wall created: {output_path}") + + # Clean up temporary files + print(" 🧹 Cleaning up temporary files...") + try: + import shutil + shutil.rmtree(temp_dir) + except Exception as e: + print(f" Warning: Could not clean up temp directory: {e}") + + return output_path + + except Exception as e: + print(f" ❌ Error creating enhanced video wall: {e}") + import traceback + traceback.print_exc() + return None + +def main(): + """Example usage of enhanced video wall creation.""" + import argparse + + parser = argparse.ArgumentParser(description="Create enhanced video wall with JSON metadata") + parser.add_argument("--results_dir", required=True, help="Directory containing video results and JSON metadata") + parser.add_argument("--output", required=True, help="Output path for video wall") + parser.add_argument("--grid_width", type=int, default=3, help="Number of videos per row") + parser.add_argument("--video_width", type=int, default=512, help="Width of each video") + parser.add_argument("--video_height", type=int, default=512, help="Height of each video") + + args = parser.parse_args() + + result = create_enhanced_video_wall( + args.results_dir, + args.output, + args.grid_width, + args.video_width, + args.video_height + ) + + if result: + print(f"\n✅ Enhanced video wall created successfully: {result}") + return 0 + else: + print(f"\n❌ Failed to create enhanced video wall") + return 1 + +if __name__ == "__main__": + exit(main()) + + + diff --git a/multi_test/multi_test.py b/multi_test/multi_test.py new file mode 100644 index 00000000..e736a374 --- /dev/null +++ b/multi_test/multi_test.py @@ -0,0 +1,2076 @@ 
+#!/usr/bin/env python3 +""" +StreamDiffusion Multi-Config Test Suite + +This script processes multiple videos with multiple YAML configurations, +similar to main.py but for batch testing. It can use individual prompts +from a text file or config prompts. + +Key Features: +- Memory-efficient processing with automatic cleanup between configs +- One merged video output per config (combining all prompt segments) +- Real-time memory monitoring and cleanup +- Pipeline reset between configs to prevent memory issues + +Usage: + python multi_test.py --configs ./configs --videos ./videos --output ./results + python multi_test.py --configs ./configs --videos ./videos --prompts ./prompts.txt --output ./results + python multi_test.py --configs ./configs --videos ./videos --output ./results --timeout_seconds 600 + +Based on the StreamDiffusion framework and main.py architecture. +""" + +import os +import datetime +import sys +import time +import yaml +import argparse +import signal +import atexit +import subprocess +import csv +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Set + +try: + import fire +except ImportError: + print("Error: 'fire' package not found. Please install it with: pip install fire") + sys.exit(1) + +try: + import ffmpeg +except ImportError: + print("Warning: ffmpeg-python library not found. Video wall creation will be disabled.") + print("To enable video wall creation, install it with: pip install ffmpeg-python") + ffmpeg = None + +# Import enhanced video wall functions if available +try: + from enhanced_video_wall import create_enhanced_video_with_metadata +except ImportError: + print("Warning: Enhanced video wall module not found. Will use fallback video processing.") + create_enhanced_video_with_metadata = None + +import torch +from torchvision.io import read_video, write_video +from torchvision.transforms import functional as F +from tqdm import tqdm + +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) + +from streamdiffusion import StreamDiffusionWrapper, load_config, create_wrapper_from_config + +# Global cleanup flag +_cleanup_completed = False + +def signal_handler(signum, frame): + """Handle system signals to ensure cleanup before exit.""" + print(f"\nReceived signal {signum}, cleaning up...") + cleanup_and_exit() + sys.exit(1) + +def cleanup_and_exit(): + """Ensure cleanup is performed before exit.""" + global _cleanup_completed + if not _cleanup_completed: + print("Performing final cleanup...") + try: + # Multiple rounds of cleanup to ensure everything is freed + for cleanup_round in range(3): + cleanup_gpu_memory() + + # Force garbage collection + import gc + gc.collect() + + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + + # Small delay between cleanup rounds + import time + time.sleep(0.1) + + except Exception as e: + print(f"Warning: Final cleanup failed: {e}") + _cleanup_completed = True + print("Cleanup completed.") + +# Register signal handlers and exit handler +signal.signal(signal.SIGINT, signal_handler) +signal.signal(signal.SIGTERM, signal_handler) +atexit.register(cleanup_and_exit) + +def cleanup_gpu_memory(): + """Thorough GPU memory cleanup.""" + try: + if torch.cuda.is_available(): + # Clear PyTorch cache + torch.cuda.empty_cache() + + # Synchronize to ensure all operations are complete + torch.cuda.synchronize() + + # Force garbage collection + import gc + gc.collect() + + # Log memory after cleanup + memory_info = get_memory_info() + if memory_info: + print(f" Memory 
after cleanup: GPU allocated: {memory_info['gpu_allocated']:.2f} GB, " + f"reserved: {memory_info['gpu_reserved']:.2f} GB, free: {memory_info['gpu_free']:.2f} GB") + except Exception as e: + print(f" Warning: Memory cleanup failed: {e}") + pass + +def cleanup_pipeline(pipeline): + """Properly cleanup a pipeline and free VRAM using StreamDiffusion's built-in cleanup""" + if pipeline is None: + return + + try: + print(" Starting pipeline cleanup...") + + # Use StreamDiffusion's built-in cleanup method which properly handles: + # - TensorRT engine cleanup + # - ControlNet engine cleanup + # - Multiple garbage collection cycles + # - CUDA cache clearing + # - Memory tracking + if hasattr(pipeline, 'stream') and pipeline.stream and hasattr(pipeline.stream, 'cleanup_gpu_memory'): + pipeline.stream.cleanup_gpu_memory() + print(" Pipeline cleanup completed using StreamDiffusion cleanup") + elif hasattr(pipeline, 'cleanup_gpu_memory') and callable(getattr(pipeline, 'cleanup_gpu_memory')): + pipeline.cleanup_gpu_memory() + print(" Pipeline cleanup completed using pipeline cleanup method") + elif hasattr(pipeline, 'cleanup') and callable(getattr(pipeline, 'cleanup')): + pipeline.cleanup() + print(" Pipeline cleanup completed using generic cleanup method") + else: + # Fallback cleanup if the method doesn't exist + print(" StreamDiffusion cleanup method not found, using fallback cleanup") + if hasattr(pipeline, 'stream') and pipeline.stream: + del pipeline.stream + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e: + print(f" Error during pipeline cleanup: {e}") + # Still try to clear CUDA cache even if cleanup fails + if torch.cuda.is_available(): + torch.cuda.empty_cache() + +def preprocess_video(input_video_path: str, target_width: int, target_height: int) -> torch.Tensor: + """Memory-efficient video preprocessing to target resolution, maintaining aspect ratio.""" + print(f"Preprocessing video: {input_video_path}") + print(f" Target resolution: {target_width}x{target_height}") + + # Load video metadata first to check size + video_data, _, info = read_video(input_video_path, pts_unit='sec') + original_fps = info["video_fps"] + num_frames = video_data.shape[0] + print(f" Original FPS: {original_fps}") + print(f" Loaded video shape: {video_data.shape}") + + # Calculate memory usage and warn if large + estimated_memory_gb = (num_frames * target_height * target_width * 3 * 4) / (1024**3) # 4 bytes per float32 + print(f" Estimated memory usage: {estimated_memory_gb:.2f} GB") + + if estimated_memory_gb > 4.0: + print(f" ⚠️ WARNING: Large video detected! 
Consider using batch processing for videos > 4GB") + + # Calculate resize parameters once + original_height, original_width = video_data.shape[1], video_data.shape[2] + original_aspect = original_width / original_height + target_aspect = target_width / target_height + + if original_aspect > target_aspect: + scale_height = target_height + scale_width = int(scale_height * original_aspect) + else: + scale_width = target_width + scale_height = int(scale_width / original_aspect) + + print(f" Resizing and cropping frames...") + + # Pre-allocate output tensor to avoid memory fragmentation + resized_video = torch.zeros(num_frames, target_height, target_width, 3, dtype=torch.float32) + + # Process frames in smaller batches to reduce peak memory usage + batch_size = min(50, num_frames) # Process 50 frames at a time + + for batch_start in tqdm(range(0, num_frames, batch_size), desc=" Processing batches"): + batch_end = min(batch_start + batch_size, num_frames) + + # Process batch of frames + for i in range(batch_start, batch_end): + # Convert to float and normalize (in-place to save memory) + frame = video_data[i].float() / 255.0 # Shape: (H, W, C) + frame_chw = frame.permute(2, 0, 1) + + # Resize maintaining aspect ratio + resized_frame_chw = F.resize(frame_chw, [scale_height, scale_width], antialias=True) + cropped_frame_chw = F.center_crop(resized_frame_chw, [target_height, target_width]) + final_frame = cropped_frame_chw.permute(1, 2, 0) + + # Store directly in pre-allocated tensor + resized_video[i] = final_frame + + # Clean up intermediate tensors + del frame, frame_chw, resized_frame_chw, cropped_frame_chw, final_frame + + # Force garbage collection after each batch + import gc + gc.collect() + + # Clean up original video data + del video_data + gc.collect() + + print(f" Final processed video shape: {resized_video.shape}") + print(f" Memory cleanup completed") + return resized_video + +def load_prompts(prompts_file: str) -> List[str]: + """Load prompts from text file.""" + with open(prompts_file, 'r', encoding='utf-8') as f: + prompts = [line.strip() for line in f.readlines() if line.strip()] + print(f"Loaded {len(prompts)} prompts from {prompts_file}") + return prompts + +def scan_completed_work(resume_dir: str) -> List[Dict]: + """ + Load existing results from CSV and JSON metadata if available. 
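For reference, a hypothetical sketch of the two on-disk artifacts this resume scan looks for; the CSV column names and JSON keys mirror the ones read further below, but every value shown is invented:

    # One row of detailed_results.csv, as consumed by csv.DictReader below
    # (hypothetical values).
    example_csv_row = {
        "Config": "sd15_lcm", "Video": "city_drive",
        "Model ID": "runwayml/stable-diffusion-v1-5", "Resolution": "512x512",
        "Total Frames": "300", "Prompts Used": "3", "Success": "Yes",
        "Output File": "sd15_lcm_city_drive_stable-diffusion-v1-5_merged_3prompts.mp4",
        "Overall FPS": "12.4", "Min FPS": "10.1", "Max FPS": "14.8",
        "Avg FPS": "12.6", "Std Dev FPS": "1.2", "CV %": "9.5", "Error Message": "",
    }

    # Matching <output>_metadata.json; it is tied back to the CSV row via
    # video_info.output_filename (hypothetical values).
    example_metadata = {
        "video_info": {"output_filename": example_csv_row["Output File"], "total_frames": 300},
        "performance_metrics": {"overall_fps": 12.4, "avg_fps": 12.6},
    }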
+ + Parameters + ---------- + resume_dir : str + Path to existing output directory to resume from + + Returns + ------- + List[Dict] + List of existing results (both successful and failed from CSV if available) + """ + print(f"\n🔍 Scanning for completed work in: {resume_dir}") + + if not os.path.exists(resume_dir): + print(f"❌ Resume directory does not exist: {resume_dir}") + return [] + + existing_results = [] + json_metadata = {} # Store JSON metadata by video filename + + # First, scan for JSON metadata files + print(f"📋 Scanning for JSON metadata files...") + try: + import json + json_files = [f for f in os.listdir(resume_dir) if f.endswith('_metadata.json')] + for json_file in json_files: + json_path = os.path.join(resume_dir, json_file) + try: + with open(json_path, 'r', encoding='utf-8') as f: + metadata = json.load(f) + # Use output filename as key for easy lookup + output_filename = metadata.get('video_info', {}).get('output_filename', '') + if output_filename: + json_metadata[output_filename] = metadata + except Exception as e: + print(f" ⚠️ Warning: Could not load JSON metadata {json_file}: {e}") + + print(f" Found {len(json_metadata)} JSON metadata files") + except Exception as e: + print(f" ⚠️ Warning: Error scanning JSON files: {e}") + + # Try to load existing results from CSV + csv_path = os.path.join(resume_dir, "detailed_results.csv") + if os.path.exists(csv_path): + print(f"📊 Loading existing results from CSV: {csv_path}") + try: + import csv + with open(csv_path, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + # Load both successful AND failed results to preserve all data + is_successful = row['Success'] == 'Yes' + + if is_successful: + # Reconstruct successful result dict + result_dict = { + 'config': row['Config'], + 'video': row['Video'], + 'model_id': row['Model ID'], + 'resolution': row['Resolution'], + 'total_frames': int(row['Total Frames']) if row['Total Frames'].isdigit() else 0, + 'prompts_used': int(row['Prompts Used']) if row['Prompts Used'].isdigit() else 1, + 'success': True, + 'output_file': row['Output File'], + 'fps_metrics': { + 'overall_fps': float(row['Overall FPS']) if row['Overall FPS'] != 'N/A' else 0, + 'min_fps': float(row['Min FPS']) if row['Min FPS'] != 'N/A' else 0, + 'max_fps': float(row['Max FPS']) if row['Max FPS'] != 'N/A' else 0, + 'avg_fps': float(row['Avg FPS']) if row['Avg FPS'] != 'N/A' else 0, + 'std_dev_fps': float(row['Std Dev FPS']) if row['Std Dev FPS'] != 'N/A' else 0, + 'cv_percent': float(row['CV %']) if row['CV %'] != 'N/A' else 0 + } + } + + # Enhance with JSON metadata if available + output_file = row['Output File'] + if output_file in json_metadata: + result_dict['json_metadata'] = json_metadata[output_file] + print(f" ✅ Enhanced {output_file} with JSON metadata") + + existing_results.append(result_dict) + else: + # Reconstruct failed result dict + existing_results.append({ + 'config': row['Config'], + 'video': row['Video'], + 'model_id': row['Model ID'], + 'resolution': row['Resolution'], + 'total_frames': int(row['Total Frames']) if row['Total Frames'].isdigit() else 0, + 'prompts_used': int(row['Prompts Used']) if row['Prompts Used'].isdigit() else 1, + 'success': False, + 'error': row['Error Message'] if row['Error Message'] else 'Unknown error' + }) + + successful_count = sum(1 for r in existing_results if r['success']) + failed_count = len(existing_results) - successful_count + print(f"✅ Loaded {len(existing_results)} total results from CSV:") + print(f" - {successful_count} 
successful") + print(f" - {failed_count} failed") + except Exception as e: + print(f"⚠️ Warning: Could not load CSV results: {e}") + else: + print(f"📊 No existing CSV found - will create new results") + + return existing_results + +def check_output_exists(output_dir: str, config_filename: str, video_filename: str, config: Dict, prompts: Optional[List[str]] = None, existing_results: Optional[List[Dict]] = None, retry_failed: bool = False) -> bool: + """ + Check if output file already exists for this config+video combination. + + Parameters + ---------- + output_dir : str + Output directory + config_filename : str + Config filename (without extension) + video_filename : str + Video filename (without extension) + config : Dict + Configuration dictionary + prompts : Optional[List[str]] + List of prompts (to determine filename format) + existing_results : Optional[List[Dict]] + List of existing results to check against (for resume functionality) + retry_failed : bool, optional + Whether to retry previously failed combinations, by default False + + Returns + ------- + bool + True if output file already exists or combination was already processed + """ + # First check if this combination was already processed (from loaded CSV data) + if existing_results: + for result in existing_results: + if result['config'] == config_filename and result['video'] == video_filename: + if result['success']: + print(f" ✅ Combination already completed successfully: {config_filename} + {video_filename}") + return True + else: + if retry_failed: + print(f" 🔄 Retrying previously failed combination: {config_filename} + {video_filename} (Previous error: {result.get('error', 'Unknown')})") + return False # Allow retry + else: + print(f" ⚠️ Combination previously failed: {config_filename} + {video_filename} (Error: {result.get('error', 'Unknown')})") + return True # Skip retry + + # Then check if output file exists on disk + # Generate the expected output filename using the same logic as process_video_with_config + config_name = config.get('model_id', 'unknown') + # Clean up the config name to make it filesystem-safe + if '/' in config_name: + config_name = config_name.split('/')[-1] + if '\\' in config_name: + config_name = config_name.split('\\')[-1] + # Remove file extensions + config_name = config_name.replace('.safetensors', '').replace('.ckpt', '').replace('.pth', '') + + # Create expected filename + num_prompts = len(prompts) if prompts else 1 + expected_filename = f"{config_filename}_{video_filename}_{config_name}_merged_{num_prompts}prompts.mp4" + expected_path = os.path.join(output_dir, expected_filename) + + exists = os.path.exists(expected_path) + if exists: + print(f" ✅ Output file already exists: {expected_filename}") + + return exists + +def process_video_with_config( + video: torch.Tensor, + config: Dict, + prompts: Optional[List[str]] = None, + output_dir: str = "./output", + config_filename: str = "unknown_config", + video_filename: str = "unknown_video", + timeout_seconds: int = 600 # 10 minutes timeout per video +) -> Optional[Dict]: + """Process a video with a config, optionally using custom prompts with temporal splitting. 
+ + Parameters + ---------- + video : torch.Tensor + Input video tensor + config : Dict + Configuration dictionary + prompts : Optional[List[str]], optional + List of prompts for temporal splitting, by default None + output_dir : str, optional + Output directory for results, by default "./output" + config_filename : str, optional + Name of the config file (without extension) for output filename, by default "unknown_config" + video_filename : str, optional + Name of the video file (without extension) for output filename, by default "unknown_video" + timeout_seconds : int, optional + Maximum time to spend processing this video, by default 600 (10 minutes) + """ + + print(f"\nProcessing with config: {config.get('model_id', 'Unknown')}") + print(f" Timeout set to {timeout_seconds} seconds") + + # Track start time for timeout + start_time = time.time() + + # Clean GPU state before building pipeline + cleanup_gpu_memory() + log_memory_usage("before pipeline creation") + + stream = None + try: + # Check timeout before starting + if time.time() - start_time > timeout_seconds: + raise TimeoutError(f"Timeout exceeded before starting processing") + + # Create wrapper using config system + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + torch_dtype = torch.float16 + + overrides = { + 'device': device, + 'dtype': torch_dtype, + 'output_type': 'pt', + } + + print(" Creating pipeline...") + stream = create_wrapper_from_config(config, **overrides) + log_memory_usage("after pipeline creation") + + if stream is None: + raise RuntimeError("Failed to create pipeline - stream is None") + + # Check timeout after pipeline creation + if time.time() - start_time > timeout_seconds: + raise TimeoutError(f"Timeout exceeded after pipeline creation") + + # Debug ControlNet setup + print(f" Stream created successfully") + if hasattr(stream, 'preprocessors'): + print(f" ControlNet preprocessors found: {len(stream.preprocessors)}") + for idx, preproc in enumerate(stream.preprocessors): + if preproc: + print(f" Preprocessor {idx}: {preproc.__class__.__name__}") + if hasattr(preproc, 'params'): + print(f" Params: {preproc.params}") + else: + print(f" Preprocessor {idx}: None") + else: + print(f" No ControlNet preprocessors found on stream") + + # Check if ControlNet images are available + if hasattr(stream, 'controlnet_images'): + print(f" ControlNet images available: {len(stream.controlnet_images)}") + for idx, img in enumerate(stream.controlnet_images): + if img is not None: + print(f" ControlNet {idx} image shape: {img.shape if hasattr(img, 'shape') else 'Unknown'}") + else: + print(f" ControlNet {idx} image: None") + + # Check what ControlNet methods are available + controlnet_methods = [] + if hasattr(stream, 'update_control_image'): + controlnet_methods.append('update_control_image') + if hasattr(stream, 'update_control_image_efficient'): + controlnet_methods.append('update_control_image_efficient') + if hasattr(stream, 'stream') and hasattr(stream.stream, 'update_control_image'): + controlnet_methods.append('stream.update_control_image') + if hasattr(stream, 'stream') and hasattr(stream.stream, 'update_control_image_efficient'): + controlnet_methods.append('stream.update_control_image_efficient') + + print(f" Available ControlNet methods: {controlnet_methods}") + + # Check if we have a nested stream structure + if hasattr(stream, 'stream'): + print(f" Stream has nested stream object") + if hasattr(stream.stream, 'preprocessors'): + print(f" Nested stream has {len(stream.stream.preprocessors)} 
preprocessors") + else: + print(f" Stream is direct (no nested structure)") + + # Get base prompt from config if no custom prompts + if not prompts: + prompt_config = config.get('prompt_blending', {}) + if isinstance(prompt_config, dict) and 'prompt_list' in prompt_config: + first_prompt = prompt_config['prompt_list'][0][0] if prompt_config['prompt_list'] else "a beautiful landscape" + else: + first_prompt = config.get('prompt', 'a beautiful landscape') + prompts = [first_prompt] + + # Calculate frames per prompt for temporal splitting + total_frames = video.shape[0] + frames_per_prompt = total_frames // len(prompts) + remaining_frames = total_frames % len(prompts) + + print(f" Total frames: {total_frames}, Frames per prompt: {frames_per_prompt}") + print(f" Remaining frames: {remaining_frames} (will be distributed to first prompts)") + + # Process each prompt against its time segment and accumulate results + fps_metrics = [] # Track FPS for each segment + segment_times = [] # Track actual processing time for each segment + all_output_frames = [] # Accumulate all segments for final merged video + + for i, prompt in enumerate(prompts): + # Check timeout before processing each prompt + if time.time() - start_time > timeout_seconds: + raise TimeoutError(f"Timeout exceeded while processing prompt {i+1}") + + print(f" Processing prompt {i+1}/{len(prompts)}: '{prompt[:50]}...'") + + try: + # Calculate frame range for this prompt + start_frame = i * frames_per_prompt + end_frame = start_frame + frames_per_prompt + + # Distribute remaining frames to first prompts + if i < remaining_frames: + end_frame += 1 + + # Get frames for this time segment + segment_frames = video[start_frame:end_frame] + print(f" Processing frames {start_frame+1}-{end_frame} ({len(segment_frames)} frames)") + + # Update stream with new prompt (no pipeline restart needed) + stream.update_prompt(prompt) + + # Prepare the stream if this is the first prompt + if i == 0: + stream.prepare( + prompt=prompt, + negative_prompt=config.get('negative_prompt', ''), + num_inference_steps=config.get('num_inference_steps', 35), + guidance_scale=config.get('guidance_scale', 1.5), + ) + + # Process frames for this time segment + print(" Processing frames...") + segment_start_time = time.time() + + # Create output tensor for this segment + height, width = segment_frames.shape[1], segment_frames.shape[2] + segment_result = torch.zeros(len(segment_frames), height, width, 3, dtype=torch.float32) + + # Warmup on first frame if this is the first prompt + if i == 0: + print(" Warming up...") + try: + for _ in range(min(stream.batch_size, 3)): # Limit warmup to prevent memory issues + warmup_result = stream(image=segment_frames[0].permute(2, 0, 1)) + if warmup_result is None: + print(" Warning: Warmup returned None") + except Exception as e: + print(f" Warning: Warmup failed: {e}") + + # Process frames for this time segment + for j in tqdm(range(len(segment_frames)), desc=" Processing frames"): + # Check timeout periodically during frame processing + if j % 10 == 0 and time.time() - start_time > timeout_seconds: + raise TimeoutError(f"Timeout exceeded while processing frame {j}") + + try: + # Get the input frame + input_frame = segment_frames[j].permute(2, 0, 1) + + # Apply ControlNet preprocessing if available + if hasattr(stream, 'preprocessors') and stream.preprocessors: + # Convert frame to PIL Image for ControlNet preprocessing + import torchvision.transforms.functional as F + frame_pil = F.to_pil_image(input_frame) + + # Update control image 
for each ControlNet - call directly on the wrapper + for cn_idx in range(len(stream.preprocessors)): + if stream.preprocessors[cn_idx]: + try: + stream.update_control_image(index=cn_idx, image=frame_pil) + except Exception as e: + print(f" Warning: ControlNet {cn_idx} update failed: {e}") + elif hasattr(stream, 'stream') and hasattr(stream.stream, 'preprocessors') and stream.stream.preprocessors: + # Handle nested stream structure - still call update_control_image on the wrapper + import torchvision.transforms.functional as F + frame_pil = F.to_pil_image(input_frame) + + # Update control image for each nested ControlNet - call on wrapper, not nested stream + for cn_idx in range(len(stream.stream.preprocessors)): + if stream.stream.preprocessors[cn_idx]: + try: + stream.update_control_image(index=cn_idx, image=frame_pil) + except Exception as e: + print(f" Warning: Nested ControlNet {cn_idx} update failed: {e}") + + # Process frame through the stream - ControlNet preprocessing has been applied above + output_image = stream(image=input_frame) + + if output_image is None: + print(f" Warning: Frame {j} returned None, skipping") + continue + + # Handle batch dimension if present + if output_image.dim() == 4: + segment_result[j] = output_image.squeeze(0).permute(1, 2, 0).clamp(0, 1) + elif output_image.dim() == 3: + segment_result[j] = output_image.permute(1, 2, 0).clamp(0, 1) + else: + print(f" Warning: unexpected tensor dimensions: {output_image.shape}") + continue + + except Exception as e: + print(f" Error processing frame {j}: {e}") + # Continue with next frame instead of failing completely + continue + + processing_time = time.time() - segment_start_time + effective_fps = len(segment_frames) / processing_time + fps_metrics.append(effective_fps) + segment_times.append(processing_time) # Store actual processing time + print(f" Processed {len(segment_frames)} frames in {processing_time:.2f}s ({effective_fps:.2f} FPS)") + + # Add segment frames to overall result for final merged video + all_output_frames.append(segment_result) + + # Clean up segment processing memory + del segment_result + import gc + gc.collect() + + # Clean up GPU memory after each segment + cleanup_gpu_memory() + log_memory_usage(f"after segment {i+1} completion") + + except Exception as e: + print(f" ERROR processing prompt {i+1}: {e}") + import traceback + traceback.print_exc() + # Continue with next prompt instead of failing completely + continue + + if not all_output_frames: + raise RuntimeError("No segments were processed successfully") + + # Combine all segments into final merged video + print(" Combining all prompt segments...") + final_video = torch.cat(all_output_frames, dim=0) + + # Save final merged video with unique name per config + config_name = config.get('model_id', 'unknown') + # Clean up the config name to make it filesystem-safe + if '/' in config_name: + config_name = config_name.split('/')[-1] + if '\\' in config_name: + config_name = config_name.split('\\')[-1] + # Remove file extensions + config_name = config_name.replace('.safetensors', '').replace('.ckpt', '').replace('.pth', '') + + # Create unique filename for this config and video (merged from all prompts) + # Include config filename, model_id, and video name for clear identification + output_filename = f"{config_filename}_{video_filename}_{config_name}_merged_{len(prompts)}prompts.mp4" + + # Clean filename to ensure it's filesystem-safe + import re + output_filename = re.sub(r'[<>:"/\\|?*]', '_', output_filename) # Replace invalid chars + 
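        # Illustration with hypothetical values: config_filename="sd15_lcm",
        # video_filename="city_drive", model_id="runwayml/stable-diffusion-v1-5" and
        # 3 prompts yield "sd15_lcm_city_drive_stable-diffusion-v1-5_merged_3prompts.mp4".
        # Note that check_output_exists() reconstructs this same name (without the
        # character sanitization above or the length cap below), so resume detection
        # relies on the two code paths staying consistent for typical filenames.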
output_filename = output_filename[:200] + '.mp4' if len(output_filename) > 200 else output_filename # Limit length + + output_video_path = os.path.join(output_dir, output_filename) + + # Ensure output directory exists before writing video + os.makedirs(output_dir, exist_ok=True) + print(f" Saving video to: {output_video_path}") + print(f" Output directory: {output_dir}") + print(f" Directory exists: {os.path.exists(output_dir)}") + + # Convert to uint8 and save + final_video_uint8 = (final_video * 255).clamp(0, 255).to(torch.uint8) + + try: + write_video(output_video_path, final_video_uint8, fps=30) + print(f" ✅ Saved merged video: {output_video_path}") + except Exception as video_error: + print(f" ❌ Failed to save video: {video_error}") + print(f" Output path: {output_video_path}") + print(f" Path length: {len(output_video_path)}") + print(f" Parent dir exists: {os.path.exists(os.path.dirname(output_video_path))}") + print(f" Video shape: {final_video_uint8.shape}") + raise video_error + finally: + # CRITICAL: Clean up large video tensors immediately after saving + print(" Cleaning up video tensors from system RAM...") + try: + del final_video_uint8 + del final_video + del all_output_frames + # Force immediate garbage collection + import gc + gc.collect() + print(" ✅ Video tensors cleaned from system RAM") + except Exception as cleanup_err: + print(f" ⚠️ Warning: Video tensor cleanup failed: {cleanup_err}") + + # Calculate overall FPS metrics CORRECTLY + total_processing_time = sum(segment_times) # Sum of actual processing times + overall_fps = total_frames / total_processing_time if total_processing_time > 0 else 0 + min_fps = min(fps_metrics) if fps_metrics else 0 + max_fps = max(fps_metrics) if fps_metrics else 0 + avg_fps = sum(fps_metrics) / len(fps_metrics) if fps_metrics else 0 + + # Calculate consistency metrics + if len(fps_metrics) > 1: + variance = sum((fps - avg_fps) ** 2 for fps in fps_metrics) / len(fps_metrics) + std_dev_fps = variance ** 0.5 + cv_percent = (std_dev_fps / avg_fps) * 100 if avg_fps > 0 else 0 + else: + std_dev_fps = 0 + cv_percent = 0 + + print(f" Overall Performance:") + print(f" Total processing time: {total_processing_time:.2f}s") + print(f" Overall FPS: {overall_fps:.2f}") + print(f" FPS range: {min_fps:.2f} - {max_fps:.2f}") + print(f" Average FPS: {avg_fps:.2f}") + print(f" Standard Deviation: {std_dev_fps:.2f}") + print(f" Coefficient of Variation: {cv_percent:.1f}%") + + # Create comprehensive metadata for JSON storage + video_metadata = { + 'video_info': { + 'config_filename': config_filename, + 'video_filename': video_filename, + 'config_name': config_name, + 'output_filename': output_filename, + 'output_path': output_video_path, + 'total_frames': total_frames, + 'prompts_used': len(prompts), + 'prompts': prompts, + 'processing_date': datetime.datetime.now().isoformat(), + }, + 'config_details': { + 'model_id': config.get('model_id', 'Unknown'), + 'width': config.get('width', 'Unknown'), + 'height': config.get('height', 'Unknown'), + 'num_inference_steps': config.get('num_inference_steps', 'Unknown'), + 'guidance_scale': config.get('guidance_scale', 'Unknown'), + 'negative_prompt': config.get('negative_prompt', ''), + }, + 'performance_metrics': { + 'overall_fps': overall_fps, + 'min_fps': min_fps, + 'max_fps': max_fps, + 'avg_fps': avg_fps, + 'std_dev_fps': std_dev_fps, + 'cv_percent': cv_percent, + 'segment_fps': fps_metrics, + 'segment_times': segment_times, + 'total_processing_time': total_processing_time, + 'segments_processed': 
len(fps_metrics) + }, + 'technical_details': { + 'timeout_seconds': timeout_seconds, + 'start_time': start_time, + 'end_time': time.time(), + 'success': True + } + } + + # Save metadata as JSON file alongside video + json_filename = output_filename.replace('.mp4', '_metadata.json') + json_path = os.path.join(output_dir, json_filename) + + try: + import json + with open(json_path, 'w', encoding='utf-8') as f: + json.dump(video_metadata, f, indent=2, ensure_ascii=False) + print(f" ✅ Saved metadata: {json_filename}") + except Exception as json_error: + print(f" ⚠️ Warning: Failed to save metadata JSON: {json_error}") + + # Return result with FPS metrics and output file + return { + 'output_file': output_filename, # Just the filename, not full path + 'metadata_file': json_filename, # JSON metadata filename + 'fps_metrics': { + 'overall_fps': overall_fps, + 'min_fps': min_fps, + 'max_fps': max_fps, + 'avg_fps': avg_fps, + 'std_dev_fps': std_dev_fps, + 'cv_percent': cv_percent, + 'segment_fps': fps_metrics, + 'segment_times': segment_times, # Add segment times for debugging + 'total_processing_time': total_processing_time + } + } + + except TimeoutError as e: + print(f" TIMEOUT ERROR: {e}") + return None + except Exception as e: + print(f" ERROR processing: {e}") + import traceback + traceback.print_exc() + return None + + finally: + # Always cleanup, even if there was an error + print(" Cleaning up pipeline...") + try: + if stream is not None: + # Use the dedicated cleanup function + cleanup_pipeline(stream) + stream = None + + except Exception as cleanup_error: + print(f" Warning: Cleanup failed: {cleanup_error}") + finally: + # Force cleanup regardless of any errors + cleanup_gpu_memory() + print(" GPU memory cleanup completed") + +def get_memory_info() -> Dict[str, float]: + """Get current GPU and system memory information.""" + memory_info = {} + + # GPU memory + if torch.cuda.is_available(): + memory_info['gpu_allocated'] = torch.cuda.memory_allocated() / (1024**3) # GB + memory_info['gpu_reserved'] = torch.cuda.memory_reserved() / (1024**3) # GB + memory_info['gpu_free'] = (torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_reserved()) / (1024**3) # GB + + # System memory + try: + import psutil + memory_info['system_ram_used'] = psutil.virtual_memory().used / (1024**3) # GB + memory_info['system_ram_available'] = psutil.virtual_memory().available / (1024**3) # GB + memory_info['system_ram_percent'] = psutil.virtual_memory().percent + except ImportError: + # psutil not available, use basic info + import os + if hasattr(os, 'sysconf'): + try: + memory_info['system_ram_used'] = 'N/A (psutil not available)' + except: + pass + + return memory_info + +def log_memory_usage(stage: str): + """Log current memory usage for debugging.""" + memory_info = get_memory_info() + if memory_info: + gpu_info = f"GPU allocated: {memory_info['gpu_allocated']:.2f} GB, reserved: {memory_info['gpu_reserved']:.2f} GB, free: {memory_info['gpu_free']:.2f} GB" + + if 'system_ram_used' in memory_info and memory_info['system_ram_used'] != 'N/A (psutil not available)': + ram_info = f"RAM used: {memory_info['system_ram_used']:.2f} GB, available: {memory_info['system_ram_available']:.2f} GB ({memory_info['system_ram_percent']:.1f}%)" + print(f" Memory usage at {stage}: {gpu_info}, {ram_info}") + else: + print(f" Memory usage at {stage}: {gpu_info}") + +def create_video_with_text(input_path: str, output_path: str, text: str, + width: int, height: int, fontcolor: str = 'white') -> bool: + """ + Create a 
scaled video with text overlay using ffmpeg-python + + Args: + input_path: Path to input video + output_path: Path to output video + text: Text to overlay + width: Target width + height: Target height + fontcolor: Color of the text + + Returns: + True if successful, False otherwise + """ + if ffmpeg is None: + return False + + try: + # Create the processing pipeline + stream = ffmpeg.input(input_path) + + # Scale video to target size with padding + scaled = ffmpeg.filter( + stream, + 'scale', + width, height, + force_original_aspect_ratio='decrease' + ) + + padded = ffmpeg.filter( + scaled, + 'pad', + width, height, + '(ow-iw)/2', '(oh-ih)/2' + ) + + # Add text overlay - specify font file to avoid fontconfig issues on Windows + font_path = r'C:/Windows/Fonts/arial.ttf' + if os.path.exists(font_path): + with_text = ffmpeg.drawtext( + padded, + text=text, + fontfile=font_path, + fontcolor=fontcolor, + fontsize=20, + box=1, + boxcolor='black@0.8', + boxborderw=5, + x=10, + y='h-th-10' + ) + else: + # Fallback: try without fontfile (may use system default) + with_text = ffmpeg.drawtext( + padded, + text=text, + fontcolor=fontcolor, + fontsize=20, + box=1, + boxcolor='black@0.8', + boxborderw=5, + x=10, + y='h-th-10' + ) + + # Output with encoding settings + output = ffmpeg.output( + with_text, + output_path, + vcodec='libx264', + crf=23, + preset='medium' + ) + + # Run the pipeline with verbose error output + ffmpeg.run(output, overwrite_output=True, quiet=True) + return True + + except Exception as e: + print(f" Error processing video: {e}") + return False + +def create_placeholder_video(output_path: str, text: str, width: int, height: int, duration: float) -> bool: + """ + Create a placeholder video with text using ffmpeg-python + + Args: + output_path: Path to output video + text: Text to display + width: Video width + height: Video height + duration: Video duration in seconds + + Returns: + True if successful, False otherwise + """ + if ffmpeg is None: + return False + + try: + # Create gray color source + color_source = ffmpeg.input( + 'color=c=gray:s={}x{}:d={}'.format(width, height, duration), + f='lavfi' + ) + + # Add centered text - specify font file to avoid fontconfig issues on Windows + font_path = r'C:/Windows/Fonts/arial.ttf' + if os.path.exists(font_path): + with_text = ffmpeg.drawtext( + color_source, + text=text, + fontfile=font_path, + fontcolor='white', + fontsize=16, + box=1, + boxcolor='black@0.8', + boxborderw=5, + x='(w-text_w)/2', + y='(h-text_h)/2' + ) + else: + # Fallback: try without fontfile (may use system default) + with_text = ffmpeg.drawtext( + color_source, + text=text, + fontcolor='white', + fontsize=16, + box=1, + boxcolor='black@0.8', + boxborderw=5, + x='(w-text_w)/2', + y='(h-text_h)/2' + ) + + # Output + output = ffmpeg.output( + with_text, + output_path, + vcodec='libx264', + crf=23, + preset='medium' + ) + + ffmpeg.run(output, overwrite_output=True, quiet=True) + return True + + except Exception as e: + print(f" Error creating placeholder: {e}") + return False + +def create_video_wall( + results: List[Dict], + video_files: List[Path], + config_files: List[Path], + output_dir: str +) -> Optional[str]: + """ + Create a video wall showing original videos and processed results in a grid layout. 
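As a compact illustration of the grid-assembly approach used further below, a hedged sketch that assumes every clip has already been scaled to a common size (paths are hypothetical; the hstack/vstack filter calls mirror the ones in this function):

    import ffmpeg  # ffmpeg-python, imported at the top of this script

    def stack_grid(rows_of_paths, out_path):
        # hstack each row of clips, then vstack the rows into a single grid video.
        row_streams = []
        for paths in rows_of_paths:
            clips = [ffmpeg.input(p) for p in paths]
            row_streams.append(clips[0] if len(clips) == 1
                               else ffmpeg.filter(clips, 'hstack', inputs=len(clips)))
        grid = (row_streams[0] if len(row_streams) == 1
                else ffmpeg.filter(row_streams, 'vstack', inputs=len(row_streams)))
        ffmpeg.run(ffmpeg.output(grid, out_path, vcodec='libx264'), overwrite_output=True)

    # e.g. stack_grid([["orig_a.mp4", "cfg1_a.mp4"], ["orig_b.mp4", "cfg1_b.mp4"]], "wall.mp4")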
+ + Layout: + - Top row: Original videos + - Subsequent rows: Processed videos for each config + + Parameters + ---------- + results : List[Dict] + List of processing results + video_files : List[Path] + List of original video files + config_files : List[Path] + List of config files used + output_dir : str + Output directory for the video wall + + Returns + ------- + Optional[str] + Path to the created video wall, or None if failed + """ + + print("\nCreating video wall...") + + if ffmpeg is None: + print(" Skipping video wall creation - ffmpeg-python not available") + return None + + # Filter successful results only + successful_results = [r for r in results if r.get('success', False)] + + if not successful_results: + print(" No successful results found for video wall") + return None + + # Extract unique video and config names from successful results + video_names = sorted(list(set([r['video'] for r in successful_results]))) + config_names = sorted(list(set([r['config'] for r in successful_results]))) + + print(f" Creating grid: {len(config_names)+1} rows x {len(video_names)} columns") + print(f" Videos: {video_names}") + print(f" Configs: {config_names}") + + # Create video wall output path + wall_output = os.path.join(output_dir, "video_wall.mp4") + + # Create temporary directory for processing + temp_dir = os.path.join(output_dir, "temp_video_wall") + os.makedirs(temp_dir, exist_ok=True) + + # Standard resolution for all videos in the wall + wall_video_width = 512 + wall_video_height = 512 + + try: + # Step 1: Process all videos + processed_videos = {} + min_duration = float('inf') + + # Get minimum duration first + print(" Getting video durations...") + all_video_paths = [] + + # Collect original video paths + for video_name in video_names: + for video_file in video_files: + if video_file.stem == video_name: + all_video_paths.append(str(video_file)) + break + + # Collect result video paths + for result in successful_results: + if 'output_file' in result and result['output_file']: + output_file_path = os.path.join(output_dir, result['output_file']) + if os.path.exists(output_file_path): + all_video_paths.append(output_file_path) + + # Get minimum duration + for video_path in all_video_paths: + try: + probe = ffmpeg.probe(video_path) + duration = float(probe['format']['duration']) + min_duration = min(min_duration, duration) + except Exception as e: + print(f" Warning: Could not get duration for {video_path}: {e}") + + if min_duration == float('inf') or min_duration < 1: + min_duration = 10 + + print(f" Using duration: {min_duration:.2f} seconds") + + # Process original videos + print(" Processing original videos...") + for video_name in video_names: + # Find the original video file + original_video_path = None + for video_file in video_files: + if video_file.stem == video_name: + original_video_path = str(video_file) + break + + if not original_video_path: + print(f" Warning: Original video not found for {video_name}") + continue + + scaled_path = os.path.join(temp_dir, f"scaled_original_{video_name.replace(' ', '_')}.mp4") + text_content = f"ORIGINAL_{video_name.replace(' ', '_')}" + + success = create_video_with_text( + original_video_path, + scaled_path, + text_content, + wall_video_width, + wall_video_height, + 'white' + ) + + if success: + processed_videos[('original', video_name)] = scaled_path + print(f" Processed original {video_name}") + else: + print(f" Failed to process original {video_name}") + + # Process result videos with enhanced metadata + print(" Processing result 
videos with enhanced metadata...") + for result in successful_results: + config_name = result['config'] + video_name = result['video'] + + # Find the output file + if 'output_file' not in result or not result['output_file']: + print(f" Warning: No output file for {config_name}_{video_name}") + continue + + output_file_path = os.path.join(output_dir, result['output_file']) + if not os.path.exists(output_file_path): + print(f" Warning: Output file not found: {output_file_path}") + continue + + scaled_path = os.path.join(temp_dir, f"scaled_{config_name}_{video_name.replace(' ', '_')}.mp4") + + # Create enhanced metadata for this result + metadata = { + 'video_info': { + 'config_filename': config_name, + 'output_filename': result['output_file'], + 'total_frames': result.get('total_frames', 0) + }, + 'config_details': { + 'model_id': result.get('model_id', 'Unknown'), + 'width': result.get('resolution', 'Unknown').split('x')[0] if 'x' in str(result.get('resolution', '')) else 'Unknown', + 'height': result.get('resolution', 'Unknown').split('x')[1] if 'x' in str(result.get('resolution', '')) else 'Unknown' + }, + 'performance_metrics': { + 'overall_fps': result.get('fps_metrics', {}).get('overall_fps', 0), + 'avg_fps': result.get('fps_metrics', {}).get('avg_fps', 0), + 'total_processing_time': result.get('fps_metrics', {}).get('total_processing_time', 0) + } + } + + if create_enhanced_video_with_metadata: + success = create_enhanced_video_with_metadata( + output_file_path, + scaled_path, + metadata, + wall_video_width, + wall_video_height + ) + else: + # Fallback to regular text overlay if enhanced function not available + fps_metrics = result.get('fps_metrics', {}) + avg_fps = fps_metrics.get('avg_fps', 0) + text_content = f"{config_name}_{avg_fps:.1f}_FPS" + success = create_video_with_text( + output_file_path, + scaled_path, + text_content, + wall_video_width, + wall_video_height, + 'yellow' + ) + + if success: + processed_videos[(config_name, video_name)] = scaled_path + fps_metrics = result.get('fps_metrics', {}) + avg_fps = fps_metrics.get('avg_fps', 0) + print(f" Processed {config_name}_{video_name} (FPS: {avg_fps:.1f})") + else: + print(f" Failed to process {config_name}_{video_name}") + + # Step 2: Create the video wall grid with flipped layout + print(" Assembling video wall with flipped layout...") + print(f" New layout: {len(video_names)} rows x {len(config_names) + 1} columns") + print(f" Each row = one original video + all its config outputs") + print(f" Each column = one config (including original)") + + # Collect all input streams for the grid + input_streams = [] + + # Build grid row by row - each row represents one original video + for row_idx, video_name in enumerate(video_names): + row_streams = [] + + # For each column (config), add the corresponding video + for col_idx, config_name in enumerate(['original'] + config_names): + if (config_name, video_name) in processed_videos: + # Use existing processed video + stream = ffmpeg.input(processed_videos[(config_name, video_name)]) + row_streams.append(stream) + else: + # Create placeholder + placeholder_path = os.path.join(temp_dir, f"placeholder_{row_idx}_{col_idx}.mp4") + placeholder_text = f"MISSING_{config_name}_{video_name}".replace(' ', '_') + + success = create_placeholder_video( + placeholder_path, + placeholder_text, + wall_video_width, + wall_video_height, + min_duration + ) + + if success: + stream = ffmpeg.input(placeholder_path) + row_streams.append(stream) + else: + print(f" Failed to create placeholder for 
{config_name}_{video_name}") + return None + + # Horizontally stack this row (configs for one video) + if len(row_streams) > 1: + row_combined = ffmpeg.filter(row_streams, 'hstack', inputs=len(row_streams)) + else: + row_combined = row_streams[0] + + input_streams.append(row_combined) + + # Vertically stack all rows (different videos) + if len(input_streams) > 1: + final_grid = ffmpeg.filter(input_streams, 'vstack', inputs=len(input_streams)) + else: + final_grid = input_streams[0] + + # Trim to minimum duration and output + trimmed = ffmpeg.filter(final_grid, 'trim', duration=min_duration) + final_output = ffmpeg.output( + trimmed, + wall_output, + vcodec='libx264', + crf=20, + preset='medium' + ) + + print(" Running ffmpeg to create final video wall...") + ffmpeg.run(final_output, overwrite_output=True, quiet=True) + + print(f" ✅ Video wall created: {wall_output}") + + # Clean up temporary files + print(" Cleaning up temporary files...") + try: + import shutil + shutil.rmtree(temp_dir) + except Exception as e: + print(f" Warning: Could not clean up temp directory: {e}") + + return wall_output + + except Exception as e: + print(f" Error creating video wall: {e}") + import traceback + traceback.print_exc() + return None + +def main( + configs: str, + videos: str, + output: str = "./output-test", + prompts: Optional[str] = None, + timeout_seconds: int = 300, # 5 minutes timeout per video + resume: Optional[str] = None, # Resume from existing output directory + retry_failed: bool = False # Whether to retry previously failed combinations +): + """ + Test multiple configs against multiple videos. + + Parameters + ---------- + configs : str + Directory containing YAML configuration files + videos : str + Directory containing video files + output : str, optional + Output directory for results, by default "./output-test" + prompts : str, optional + Text file containing individual prompts (one per line) + timeout_seconds : int, optional + Maximum time to spend processing each video, by default 300 (5 minutes) + resume : str, optional + Resume from existing output directory (full path to directory) + retry_failed : bool, optional + Whether to retry previously failed combinations, by default False + """ + + # Handle resume vs new run + if resume: + if not os.path.exists(resume): + print(f"❌ Error: Resume directory does not exist: {resume}") + return + output_dir = resume + print(f"🔄 Resuming from existing directory: {output_dir}") + else: + # Create timestamped output directory + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = f"{output}/{timestamp}" + os.makedirs(output_dir, exist_ok=True) + print(f"🆕 Starting new run in directory: {output_dir}") + + print("StreamDiffusion Multi-Config Test Suite") + print("=" * 50) + print(f"Configs directory: {configs}") + print(f"Videos directory: {videos}") + print(f"Output directory: {output_dir}") + if prompts: + print(f"Prompts file: {prompts}") + if resume: + print(f"Resume mode: ✅ Enabled") + if retry_failed: + print(f"Retry failed: ✅ Enabled (will retry previously failed combinations)") + else: + print(f"Retry failed: ❌ Disabled (will skip previously failed combinations)") + print("=" * 50) + + # Load prompts if provided + prompt_list = None + if prompts: + if not os.path.exists(prompts): + print(f"Error: Prompts file not found: {prompts}") + return + prompt_list = load_prompts(prompts) + + # Scan for completed work if resuming + existing_results = [] + if resume: + existing_results = scan_completed_work(output_dir) + + # Get 
config files + config_dir = Path(configs) + config_files = list(config_dir.glob("*.yaml")) + list(config_dir.glob("*.yml")) + if not config_files: + print(f"Error: No YAML config files found in {configs}") + return + + # Get video files + video_dir = Path(videos) + video_extensions = ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.flv'] + video_files = [] + for ext in video_extensions: + video_files.extend(video_dir.glob(f"*{ext}")) + + if not video_files: + print(f"Error: No video files found in {videos}") + return + + print(f"\nFound {len(config_files)} configs and {len(video_files)} videos") + + # Calculate total work + total_combinations = len(config_files) * len(video_files) + + print(f"\n📊 Work Summary:") + print(f" Total combinations: {total_combinations}") + if resume and len(existing_results) > 0: + print(f" Previously completed: {len(existing_results)}") + print(f" Will check each combination for existing output files...") + + # Store results for performance summary (start with existing results) + results = existing_results.copy() + + # Process each config against each video + for config_path in config_files: + print(f"\n{'='*60}") + print(f"Processing config: {config_path.stem}") + print(f"{'='*60}") + + # Aggressive cleanup before starting new config to ensure clean slate + print(f"Pre-config cleanup for {config_path.stem}...") + for cleanup_round in range(2): + cleanup_gpu_memory() + import gc + gc.collect() + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + import time + time.sleep(0.1) + log_memory_usage(f"before config {config_path.stem}") + + try: + config = load_config(config_path) + print(f"Config loaded: {config.get('model_id', 'Unknown')}") + print(f"Resolution: {config.get('width', 'Unknown')}x{config.get('height', 'Unknown')}") + except Exception as e: + print(f"Error loading config {config_path}: {e}") + continue + + for video_path in video_files: + print(f"\nProcessing video: {video_path.name}") + + # Check if output already exists (pass existing_results for resume functionality) + if check_output_exists(output_dir, config_path.stem, video_path.stem, config, prompt_list, existing_results, retry_failed): + print(f" ⏭️ Skipping - already processed") + continue + + try: + print(f" Starting video preprocessing...") + # Preprocess video + video = preprocess_video( + str(video_path), + config.get('width', 512), + config.get('height', 512) + ) + print(f" Video preprocessing completed, shape: {video.shape}") + + # Force CPU memory cleanup after video preprocessing + import gc + gc.collect() + + print(f" Starting video processing with config...") + # Process with config and get performance data + result = process_video_with_config( + video=video, + config=config, + prompts=prompt_list, + output_dir=output_dir, + config_filename=config_path.stem, + video_filename=video_path.stem, + timeout_seconds=timeout_seconds + ) + + print(f" Video processing completed, result: {'Success' if result else 'Failed'}") + + # Store video information before cleanup + video_frames = video.shape[0] + + # Clean up after each video to prevent memory accumulation + print(f" Cleaning up after video {video_path.name}...") + + # Clean up video tensor from CPU memory + del video + import gc + gc.collect() + + # Clean up GPU memory + cleanup_gpu_memory() + log_memory_usage(f"after video {video_path.name} completion") + + # If retrying, remove the old failed result to avoid duplicates + if retry_failed: + results = [r for r in results if not (r['config'] == 
config_path.stem and r['video'] == video_path.stem)] + + # Store result for summary + if result: + results.append({ + 'config': config_path.stem, + 'video': video_path.stem, + 'model_id': config.get('model_id', 'Unknown'), + 'resolution': f"{config.get('width', 'Unknown')}x{config.get('height', 'Unknown')}", + 'total_frames': video_frames, + 'prompts_used': len(prompt_list) if prompt_list else 1, + 'success': True, + 'output_file': result['output_file'], # Store merged video file + 'fps_metrics': result['fps_metrics'] + }) + print(f" ✅ Successfully processed {video_path.name}") + else: + results.append({ + 'config': config_path.stem, + 'video': video_path.stem, + 'model_id': config.get('model_id', 'Unknown'), + 'resolution': f"{config.get('width', 'Unknown')}x{config.get('height', 'Unknown')}", + 'total_frames': video_frames, + 'prompts_used': len(prompt_list) if prompt_list else 1, + 'success': False, + 'error': 'Processing failed' + }) + print(f" ❌ Failed to process {video_path.name}") + + except Exception as e: + print(f" Failed to process {video_path.name}: {e}") + import traceback + traceback.print_exc() + + # Store video frames count if video was successfully loaded + video_frames = video.shape[0] if 'video' in locals() else 0 + + # Clean up video tensor even on failure + try: + del video + import gc + gc.collect() + cleanup_gpu_memory() + except: + pass # video might not be defined if error occurred during preprocessing + + # If retrying, remove the old failed result to avoid duplicates + if retry_failed: + results = [r for r in results if not (r['config'] == config_path.stem and r['video'] == video_path.stem)] + + results.append({ + 'config': config_path.stem, + 'video': video_path.stem, + 'model_id': config.get('model_id', 'Unknown'), + 'resolution': f"{config.get('width', 'Unknown')}x{config.get('height', 'Unknown')}", + 'total_frames': 0, + 'prompts_used': len(prompt_list) if prompt_list else 1, + 'success': False, + 'error': str(e) + }) + continue + + # Force cleanup between configs to ensure memory is cleared + print(f"\nCleaning up after config {config_path.stem}...") + try: + # Multiple rounds of cleanup to ensure everything is freed + for cleanup_round in range(3): # Multiple cleanup rounds like in main.py + cleanup_gpu_memory() + + # Additional cleanup to ensure no lingering references + import gc + gc.collect() + + # Force CUDA synchronization to ensure all operations are complete + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + + # Small delay between cleanup rounds + import time + time.sleep(0.1) + + log_memory_usage(f"after config {config_path.stem} completion") + + except Exception as cleanup_error: + print(f" Warning: Config cleanup failed: {cleanup_error}") + + print(f" Config {config_path.stem} cleanup completed") + + # Update progress tracking + total_processed = len([r for r in results if r['success']]) + total_failed = len([r for r in results if not r['success']]) + print(f" Progress: {total_processed + total_failed}/{total_combinations} total tests") + print(f" Successful: {total_processed}, Failed: {total_failed}") + + # Generate performance summary + generate_performance_summary(results, output_dir, prompt_list) + + # Create video wall if we have successful results + # Try enhanced video wall first (with JSON metadata), fallback to regular wall + wall_path = None + try: + from enhanced_video_wall import create_enhanced_video_wall + print("\n🎬 Attempting to create enhanced video wall with JSON metadata...") + wall_path = 
create_enhanced_video_wall(output_dir, os.path.join(output_dir, "enhanced_video_wall.mp4")) + if wall_path: + print(f"✅ Enhanced video wall created: {wall_path}") + else: + print("⚠️ Enhanced video wall creation failed, falling back to regular wall") + except ImportError: + print("⚠️ Enhanced video wall module not available, using regular wall") + except Exception as e: + print(f"⚠️ Enhanced video wall creation failed: {e}, falling back to regular wall") + + # Fallback to regular video wall if enhanced version failed + if not wall_path: + wall_path = create_video_wall(results, video_files, config_files, output_dir) + + # Final summary + total_successful = len([r for r in results if r['success']]) + total_failed = len([r for r in results if not r['success']]) + + print(f"\n🎯 Final Summary:") + print(f" Total combinations: {total_combinations}") + if resume: + print(f" Previously completed: {len(existing_results)}") + print(f" Newly processed: {total_successful + total_failed - len(existing_results)}") + print(f" Total successful: {total_successful}") + print(f" Total failed: {total_failed}") + print(f" Success rate: {(total_successful/total_combinations*100):.1f}%") + + print(f"\n📁 Results saved to: {output_dir}") + print(f"📊 Performance summary: {output_dir}/performance_summary.txt") + print(f"📋 Detailed CSV: {output_dir}/detailed_results.csv") + + if wall_path: + print(f"🎬 Video wall created: {wall_path}") + else: + print("🎬 Video wall creation skipped or failed") + + if resume: + print(f"\n💡 To resume again later, use: --resume \"{output_dir}\"") + +def generate_performance_summary(results: List[Dict], output_dir: str, prompts: Optional[List[str]] = None): + """Generate a performance summary comparing all configs.""" + + if not results: + print("No results to summarize") + return + + # Define successful_results early to avoid UnboundLocalError + successful_results = [r for r in results if r['success']] + + summary_file = os.path.join(output_dir, "performance_summary.txt") + + with open(summary_file, 'w', encoding='utf-8') as f: + f.write("StreamDiffusion Multi-Config Performance Summary\n") + f.write("=" * 60 + "\n\n") + + if prompts: + f.write(f"Using {len(prompts)} individual prompts with temporal splitting\n\n") + + # Overall statistics + total_tests = len(results) + successful_tests = sum(1 for r in results if r['success']) + failed_tests = total_tests - successful_tests + + f.write(f"Overall Results:\n") + f.write(f" Total tests: {total_tests}\n") + f.write(f" Successful: {successful_tests}\n") + f.write(f" Failed: {failed_tests}\n") + f.write(f" Success rate: {successful_tests/total_tests*100:.1f}%\n\n") + + # Quick Performance Summary Table + if successful_results: + f.write("Quick Performance Summary:\n") + f.write("-" * 120 + "\n") + f.write(f"{'Config':<25} {'Video':<15} {'Resolution':<12} {'Overall FPS':<12} {'Avg FPS':<10} {'Min FPS':<10} {'Max FPS':<10} {'Frames':<8}\n") + f.write("-" * 120 + "\n") + + for result in successful_results: + fps = result['fps_metrics'] + f.write(f"{result['config']:<25} {result['video']:<15} {result['resolution']:<12} " + f"{fps['overall_fps']:<12.2f} {fps['avg_fps']:<10.2f} {fps['min_fps']:<10.2f} " + f"{fps['max_fps']:<10.2f} {result['total_frames']:<8}\n") + f.write("-" * 120 + "\n\n") + + # Results by config + configs = set(r['config'] for r in results) + f.write("Results by Config:\n") + f.write("-" * 40 + "\n") + + for config in sorted(configs): + config_results = [r for r in results if r['config'] == config] + config_success = sum(1 for r 
in config_results if r['success']) + + f.write(f"\n{config}:\n") + f.write(f" Tests: {len(config_results)}/{config_success} successful\n") + f.write(f" Model: {config_results[0]['model_id']}\n") + f.write(f" Resolution: {config_results[0]['resolution']}\n") + + # List videos processed + for result in config_results: + status = "✅" if result['success'] else "❌" + f.write(f" {status} {result['video']}") + if result['success']: + f.write(f" ({result['total_frames']} frames, {result['prompts_used']} prompts)") + f.write(f" - Overall FPS: {result['fps_metrics']['overall_fps']:.2f}") + f.write(f", Min FPS: {result['fps_metrics']['min_fps']:.2f}") + f.write(f", Max FPS: {result['fps_metrics']['max_fps']:.2f}") + f.write(f", Avg FPS: {result['fps_metrics']['avg_fps']:.2f}") + else: + f.write(f" - {result.get('error', 'Unknown error')}") + f.write("\n") + + # Results by video + f.write(f"\nResults by Video:\n") + f.write("-" * 40 + "\n") + + videos = set(r['video'] for r in results) + for video in sorted(videos): + video_results = [r for r in results if r['video'] == video] + video_success = sum(1 for r in video_results if r['success']) + + f.write(f"\n{video}:\n") + f.write(f" Tests: {len(video_results)}/{video_success} successful\n") + + for result in video_results: + status = "✅" if result['success'] else "❌" + f.write(f" {status} {result['config']} ({result['resolution']})") + if result['success']: + f.write(f" - {result['total_frames']} frames") + f.write(f" - Overall FPS: {result['fps_metrics']['overall_fps']:.2f}") + f.write(f", Min FPS: {result['fps_metrics']['min_fps']:.2f}") + f.write(f", Max FPS: {result['fps_metrics']['max_fps']:.2f}") + f.write(f", Avg FPS: {result['fps_metrics']['avg_fps']:.2f}") + f.write("\n") + + # Summary of successful outputs + if successful_results: + f.write(f"\nGenerated Outputs:\n") + f.write("-" * 40 + "\n") + + for result in successful_results: + if 'output_file' in result and result['output_file']: + # Extract just the filename from the full path for display + output_filename = os.path.basename(result['output_file']) + f.write(f"✅ {output_filename}\n") + else: + f.write(f"✅ {result['config']}_{result['video']}: No output files generated\n") + + # Performance Analysis and Rankings + if successful_results: + f.write(f"\nPerformance Analysis:\n") + f.write("-" * 40 + "\n") + + # Overall FPS Rankings + f.write(f"\nOverall FPS Rankings (Higher is Better):\n") + fps_rankings = sorted(successful_results, key=lambda x: x['fps_metrics']['overall_fps'], reverse=True) + for i, result in enumerate(fps_rankings): + f.write(f" {i+1:2d}. {result['config']:30s} - {result['fps_metrics']['overall_fps']:6.2f} FPS") + f.write(f" (Avg: {result['fps_metrics']['avg_fps']:5.2f}, Range: {result['fps_metrics']['min_fps']:5.2f}-{result['fps_metrics']['max_fps']:5.2f})\n") + + # Average FPS Rankings + f.write(f"\nAverage FPS Rankings (Higher is Better):\n") + avg_fps_rankings = sorted(successful_results, key=lambda x: x['fps_metrics']['avg_fps'], reverse=True) + for i, result in enumerate(avg_fps_rankings): + f.write(f" {i+1:2d}. 
{result['config']:30s} - {result['fps_metrics']['avg_fps']:6.2f} FPS") + f.write(f" (Overall: {result['fps_metrics']['overall_fps']:5.2f}, Range: {result['fps_metrics']['min_fps']:5.2f}-{result['fps_metrics']['max_fps']:5.2f})\n") + + # Performance Statistics + f.write(f"\nPerformance Statistics:\n") + overall_fps_values = [r['fps_metrics']['overall_fps'] for r in successful_results] + avg_fps_values = [r['fps_metrics']['avg_fps'] for r in successful_results] + min_fps_values = [r['fps_metrics']['min_fps'] for r in successful_results] + max_fps_values = [r['fps_metrics']['max_fps'] for r in successful_results] + + f.write(f" Overall FPS - Best: {max(overall_fps_values):.2f}, Worst: {min(overall_fps_values):.2f}, Mean: {sum(overall_fps_values)/len(overall_fps_values):.2f}\n") + f.write(f" Average FPS - Best: {max(avg_fps_values):.2f}, Worst: {min(avg_fps_values):.2f}, Mean: {sum(avg_fps_values)/len(avg_fps_values):.2f}\n") + f.write(f" Min FPS - Best: {max(min_fps_values):.2f}, Worst: {min(min_fps_values):.2f}, Mean: {sum(min_fps_values)/len(min_fps_values):.2f}\n") + f.write(f" Max FPS - Best: {max(max_fps_values):.2f}, Worst: {min(max_fps_values):.2f}, Mean: {sum(max_fps_values)/len(max_fps_values):.2f}\n") + + # Performance by Resolution + f.write(f"\nPerformance by Resolution:\n") + resolutions = set(r['resolution'] for r in successful_results) + for resolution in sorted(resolutions): + res_results = [r for r in successful_results if r['resolution'] == resolution] + res_overall_fps = [r['fps_metrics']['overall_fps'] for r in res_results] + res_avg_fps = [r['fps_metrics']['avg_fps'] for r in res_results] + + f.write(f" {resolution}:\n") + f.write(f" Configs tested: {len(res_results)}\n") + f.write(f" Best Overall FPS: {max(res_overall_fps):.2f} ({[r['config'] for r in res_results if r['fps_metrics']['overall_fps'] == max(res_overall_fps)][0]})\n") + f.write(f" Best Average FPS: {max(res_avg_fps):.2f} ({[r['config'] for r in res_results if r['fps_metrics']['avg_fps'] == max(res_avg_fps)][0]})\n") + f.write(f" Mean Overall FPS: {sum(res_overall_fps)/len(res_overall_fps):.2f}\n") + f.write(f" Mean Average FPS: {sum(res_avg_fps)/len(res_avg_fps):.2f}\n") + + # Performance by Video + f.write(f"\nPerformance by Video:\n") + videos = set(r['video'] for r in successful_results) + for video in sorted(videos): + vid_results = [r for r in successful_results if r['video'] == video] + vid_overall_fps = [r['fps_metrics']['overall_fps'] for r in vid_results] + vid_avg_fps = [r['fps_metrics']['avg_fps'] for r in vid_results] + + f.write(f" {video}:\n") + f.write(f" Configs tested: {len(vid_results)}\n") + f.write(f" Best Overall FPS: {max(vid_overall_fps):.2f} ({[r['config'] for r in vid_results if r['fps_metrics']['overall_fps'] == max(vid_overall_fps)][0]})\n") + f.write(f" Best Average FPS: {max(vid_avg_fps):.2f} ({[r['config'] for r in vid_results if r['fps_metrics']['avg_fps'] == max(vid_avg_fps)][0]})\n") + f.write(f" Mean Overall FPS: {sum(vid_overall_fps)/len(vid_overall_fps):.2f}\n") + f.write(f" Mean Average FPS: {sum(vid_avg_fps)/len(vid_avg_fps):.2f}\n") + + # Best Config per Video Summary + f.write(f"\nBest Config per Video (Overall FPS):\n") + f.write("-" * 60 + "\n") + for video in sorted(videos): + vid_results = [r for r in successful_results if r['video'] == video] + best_config = max(vid_results, key=lambda x: x['fps_metrics']['overall_fps']) + fps = best_config['fps_metrics'] + f.write(f" {video:<20} -> {best_config['config']:<25} ({fps['overall_fps']:6.2f} FPS, Avg: 
+
+            f.write(f"\nBest Config per Video (Average FPS):\n")
+            f.write("-" * 60 + "\n")
+            for video in sorted(videos):
+                vid_results = [r for r in successful_results if r['video'] == video]
+                best_config = max(vid_results, key=lambda x: x['fps_metrics']['avg_fps'])
+                fps = best_config['fps_metrics']
+                f.write(f" {video:<20} -> {best_config['config']:<25} ({fps['avg_fps']:6.2f} FPS, Overall: {fps['overall_fps']:5.2f})\n")
+
+            # Performance Improvement Analysis
+            f.write(f"\nPerformance Improvement Analysis:\n")
+            f.write("-" * 60 + "\n")
+
+            # Find the best overall config
+            best_overall_config = max(successful_results, key=lambda x: x['fps_metrics']['overall_fps'])
+            best_overall_fps = best_overall_config['fps_metrics']['overall_fps']
+
+            f.write(f"Best Overall Config: {best_overall_config['config']} ({best_overall_fps:.2f} FPS)\n\n")
+            f.write(f"Performance vs Best (Overall FPS):\n")
+
+            for result in sorted(successful_results, key=lambda x: x['fps_metrics']['overall_fps'], reverse=True):
+                if result['config'] != best_overall_config['config']:
+                    # Signed gap relative to the best config (negative = slower than best)
+                    vs_best = ((result['fps_metrics']['overall_fps'] - best_overall_fps) / best_overall_fps) * 100
+                    f.write(f" {result['config']:<30s} - {result['fps_metrics']['overall_fps']:6.2f} FPS")
+                    f.write(f" ({vs_best:+.1f}% vs best)\n")
+
+            # Performance vs Average
+            avg_overall_fps = sum(r['fps_metrics']['overall_fps'] for r in successful_results) / len(successful_results)
+            f.write(f"\nPerformance vs Average ({avg_overall_fps:.2f} FPS):\n")
+
+            for result in sorted(successful_results, key=lambda x: x['fps_metrics']['overall_fps'], reverse=True):
+                vs_avg = ((result['fps_metrics']['overall_fps'] - avg_overall_fps) / avg_overall_fps) * 100
+                f.write(f" {result['config']:<30s} - {result['fps_metrics']['overall_fps']:6.2f} FPS")
+                f.write(f" ({vs_avg:+.1f}% vs avg)\n")
+
+            # Performance Consistency Analysis
+            f.write(f"\nPerformance Consistency Analysis:\n")
+            f.write("-" * 60 + "\n")
+            f.write("Configs ranked by FPS stability (lower variance = more stable):\n")
+
+            # Calculate FPS variance for each config
+            consistency_data = []
+            for result in successful_results:
+                segment_fps = result['fps_metrics']['segment_fps']
+                if len(segment_fps) > 1:
+                    mean_fps = sum(segment_fps) / len(segment_fps)
+                    variance = sum((fps - mean_fps) ** 2 for fps in segment_fps) / len(segment_fps)
+                    std_dev = variance ** 0.5
+                    cv = (std_dev / mean_fps) * 100  # Coefficient of variation
+                else:
+                    variance = 0
+                    std_dev = 0
+                    cv = 0
+
+                consistency_data.append({
+                    'config': result['config'],
+                    'mean_fps': result['fps_metrics']['avg_fps'],
+                    'std_dev': std_dev,
+                    'cv': cv,
+                    'min_fps': result['fps_metrics']['min_fps'],
+                    'max_fps': result['fps_metrics']['max_fps'],
+                    'fps_range': result['fps_metrics']['max_fps'] - result['fps_metrics']['min_fps']
+                })
+
+            # Sort by coefficient of variation (lower = more stable)
+            consistency_data.sort(key=lambda x: x['cv'])
+
+            for i, data in enumerate(consistency_data):
+                f.write(f" {i+1:2d}. {data['config']:<30s} - CV: {data['cv']:5.1f}%")
+                f.write(f" (Std: {data['std_dev']:5.2f}, Range: {data['fps_range']:5.2f})\n")
+                f.write(f" Mean: {data['mean_fps']:6.2f} FPS, Min: {data['min_fps']:5.2f}, Max: {data['max_fps']:5.2f}\n")
+
+            # Recommendations
+            f.write(f"\nRecommendations:\n")
+            f.write("-" * 60 + "\n")
+
+            # Best overall performance
+            f.write(f"🏆 Best Overall Performance: {best_overall_config['config']}\n")
+            f.write(f" - Highest sustained FPS: {best_overall_config['fps_metrics']['overall_fps']:.2f}\n")
+            f.write(f" - Best for: Maximum throughput scenarios\n\n")
+
+            # Most consistent performance
+            most_consistent = consistency_data[0]
+            f.write(f"📊 Most Consistent Performance: {most_consistent['config']}\n")
+            f.write(f" - Lowest variance: {most_consistent['cv']:.1f}% CV\n")
+            f.write(f" - Best for: Real-time applications requiring stable frame rates\n\n")
+
+            # Best value (good performance + consistency)
+            # Find config with good balance of performance and consistency
+            balanced_configs = []
+            for data in consistency_data:
+                # Normalize both metrics (0-1 scale); guard against all CVs being zero
+                perf_score = data['mean_fps'] / max(d['mean_fps'] for d in consistency_data)
+                max_cv = max(d['cv'] for d in consistency_data)
+                consistency_score = 1 - (data['cv'] / max_cv) if max_cv > 0 else 1.0
+                balanced_score = (perf_score + consistency_score) / 2
+                balanced_configs.append((data['config'], balanced_score, data['mean_fps'], data['cv']))
+
+            balanced_configs.sort(key=lambda x: x[1], reverse=True)
+            best_balanced = balanced_configs[0]
+            f.write(f"⚖️ Best Balanced (Performance + Consistency): {best_balanced[0]}\n")
+            f.write(f" - Balanced score: {best_balanced[1]:.3f}\n")
+            f.write(f" - Performance: {best_balanced[2]:.2f} FPS, Consistency: {best_balanced[3]:.1f}% CV\n")
+            f.write(f" - Best for: Production environments requiring both speed and reliability\n\n")
+
+            # Performance tiers
+            f.write(f"📈 Performance Tiers:\n")
+            fps_values = [r['fps_metrics']['overall_fps'] for r in successful_results]
+            fps_values.sort(reverse=True)
+
+            if len(fps_values) >= 3:
+                top_tier = fps_values[:len(fps_values)//3]
+                mid_tier = fps_values[len(fps_values)//3:2*len(fps_values)//3]
+                bottom_tier = fps_values[2*len(fps_values)//3:]
+
+                f.write(f" 🥇 Top Tier (≥{min(top_tier):.2f} FPS): {len(top_tier)} configs\n")
+                f.write(f" 🥈 Mid Tier ({min(mid_tier):.2f}-{max(mid_tier):.2f} FPS): {len(mid_tier)} configs\n")
+                f.write(f" 🥉 Bottom Tier (≤{max(bottom_tier):.2f} FPS): {len(bottom_tier)} configs\n")
+
+            f.write(f"\n💡 Usage Tips:\n")
+            f.write(f" - For maximum speed: Use {best_overall_config['config']}\n")
+            f.write(f" - For stable real-time: Use {most_consistent['config']}\n")
+            f.write(f" - For production: Use {best_balanced[0]}\n")
+            f.write(f" - Consider resolution impact: Higher resolutions generally reduce FPS\n")
+            f.write(f" - Monitor VRAM usage: Some configs may be more memory-efficient\n")
+
+            # Best Configs by Use Case
+            f.write(f"\n🎯 Best Configs by Use Case:\n")
+            f.write("-" * 60 + "\n")
+
+            # Speed-focused use cases
+            f.write(f"🚀 Speed-Focused Use Cases:\n")
+            speed_configs = sorted(successful_results, key=lambda x: x['fps_metrics']['overall_fps'], reverse=True)[:3]
+            for i, result in enumerate(speed_configs):
+                fps = result['fps_metrics']
+                f.write(f" {i+1}. {result['config']:<25} - {fps['overall_fps']:6.2f} FPS")
+                f.write(f" (Avg: {fps['avg_fps']:5.2f}, CV: {fps['cv_percent']:4.1f}%)\n")
+
+            # Consistency-focused use cases
+            f.write(f"\n📊 Consistency-Focused Use Cases:\n")
+            consistency_configs = sorted(successful_results, key=lambda x: x['fps_metrics']['cv_percent'])[:3]
+            for i, result in enumerate(consistency_configs):
+                fps = result['fps_metrics']
+                f.write(f" {i+1}. {result['config']:<25} - CV: {fps['cv_percent']:4.1f}%")
+                f.write(f" (Avg: {fps['avg_fps']:5.2f} FPS, Overall: {fps['overall_fps']:5.2f})\n")
+
+            # Balanced use cases
+            f.write(f"\n⚖️ Balanced Use Cases (Speed + Consistency):\n")
+            for i, (config, score, mean_fps, cv) in enumerate(balanced_configs[:3]):
+                f.write(f" {i+1}. {config:<25} - Score: {score:.3f}")
+                f.write(f" (Avg: {mean_fps:5.2f} FPS, CV: {cv:4.1f}%)\n")
+
+            # Resolution-specific recommendations
+            f.write(f"\n🖼️ Resolution-Specific Recommendations:\n")
+            for resolution in sorted(resolutions):
+                res_results = [r for r in successful_results if r['resolution'] == resolution]
+                best_speed = max(res_results, key=lambda x: x['fps_metrics']['overall_fps'])
+                best_consistency = min(res_results, key=lambda x: x['fps_metrics']['cv_percent'])
+
+                f.write(f" {resolution}:\n")
+                f.write(f" - Best Speed: {best_speed['config']} ({best_speed['fps_metrics']['overall_fps']:.2f} FPS)\n")
+                f.write(f" - Best Consistency: {best_consistency['config']} (CV: {best_consistency['fps_metrics']['cv_percent']:.1f}%)\n")
+
+    print(f"Performance summary saved to: {summary_file}")
+
+    # Also save as CSV for easy analysis
+    csv_file = os.path.join(output_dir, "detailed_results.csv")
+    import csv
+
+    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+
+        # Header
+        writer.writerow([
+            'Config', 'Video', 'Model ID', 'Resolution', 'Total Frames',
+            'Prompts Used', 'Success', 'Output File', 'Error Message',
+            'Overall FPS', 'Min FPS', 'Max FPS', 'Avg FPS', 'Std Dev FPS', 'CV %'
+        ])
+
+        # Data rows
+        for result in results:
+            if result['success']:
+                fps_metrics = result['fps_metrics']
+                # Format output file path - just the filename for clarity
+                output_file_str = os.path.basename(result.get('output_file', ''))
+
+                writer.writerow([
+                    result['config'],
+                    result['video'],
+                    result['model_id'],
+                    result['resolution'],
+                    result['total_frames'],
+                    result['prompts_used'],
+                    "Yes" if result['success'] else "No",
+                    output_file_str,
+                    result.get('error', ''),
+                    f"{fps_metrics['overall_fps']:.2f}",
+                    f"{fps_metrics['min_fps']:.2f}",
+                    f"{fps_metrics['max_fps']:.2f}",
+                    f"{fps_metrics['avg_fps']:.2f}",
+                    f"{fps_metrics['std_dev_fps']:.2f}",
+                    f"{fps_metrics['cv_percent']:.1f}"
+                ])
+            else:
+                writer.writerow([
+                    result['config'],
+                    result['video'],
+                    result['model_id'],
+                    result['resolution'],
+                    result.get('total_frames', ''),
+                    result.get('prompts_used', ''),
+                    "Yes" if result['success'] else "No",
+                    "",
+                    result.get('error', ''),
+                    "N/A",
+                    "N/A",
+                    "N/A",
+                    "N/A",
+                    "N/A",
+                    "N/A"
+                ])
+
+    print(f"Detailed results saved to: {csv_file}")
+
+if __name__ == "__main__":
+    try:
+        fire.Fire(main)
+    except KeyboardInterrupt:
+        print("\nInterrupted by user")
+        cleanup_and_exit()
+    except Exception as e:
+        print(f"\nUnexpected error in main: {e}")
+        import traceback
+        traceback.print_exc()
+        cleanup_and_exit()
+        sys.exit(1)
+    finally:
+        cleanup_and_exit()
diff --git a/multi_test/prompts.txt b/multi_test/prompts.txt
new file mode 100644
index 00000000..92ed8a60
--- /dev/null
+++ b/multi_test/prompts.txt
@@ -0,0 +1,6 @@
+naruto anime
+marble statue, high detail, roman empire, stones, garden
+playstation, graphics, cutscene, ps2, shenmu
+stained glass dream fantasy
+Disney Aladdin, cartoon, pixar cg
+1930s pinup girl
\ No newline at end of file