Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
856 changes: 856 additions & 0 deletions invokeai/app/invocations/detailer.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions invokeai/app/invocations/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,10 @@ class ConditioningField(BaseModel):
class BoundingBoxField(BoundingBox):
"""A bounding box primitive value."""

label: Optional[str] = Field(
default=None,
description="The label associated with the bounding box. This value is typically set when the bounding box was produced by a detector.",
)
score: Optional[float] = Field(
default=None,
ge=0.0,
Expand Down
48 changes: 36 additions & 12 deletions invokeai/app/invocations/grounding_dino.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from pathlib import Path
from typing import Literal

Expand All @@ -18,6 +19,36 @@
"grounding-dino-tiny": "IDEA-Research/grounding-dino-tiny",
"grounding-dino-base": "IDEA-Research/grounding-dino-base",
}
# Runs of any separator character (pipe, comma, period, semicolon) delimit labels.
GROUNDING_DINO_LABEL_SPLIT_PATTERN = re.compile(r"[|,.;]+")


def normalize_grounding_dino_label(label: str) -> str:
    """Canonicalize a single Grounding DINO label.

    Trims surrounding whitespace, drops stray leading/trailing separator
    characters (``|``, ``,``, ``.``, ``;``), trims again, and lowercases.
    """
    stripped = label.strip().strip("|,.;")
    return stripped.strip().lower()


def parse_grounding_dino_labels(prompt: str) -> list[str]:
    """Split a free-form prompt into unique, normalized detection labels.

    The prompt is split on runs of separator characters; each fragment is
    normalized, and empty or duplicate labels are discarded while the
    first-seen order of the remaining labels is preserved.
    """
    # A plain dict doubles as an ordered set (insertion order is guaranteed).
    ordered_unique: dict[str, None] = {}
    for fragment in GROUNDING_DINO_LABEL_SPLIT_PATTERN.split(prompt):
        normalized = normalize_grounding_dino_label(fragment)
        if normalized:
            ordered_unique.setdefault(normalized, None)
    return list(ordered_unique)


def detection_result_to_bounding_box(detection: DetectionResult) -> BoundingBoxField:
    """Convert a detector result into a BoundingBoxField primitive.

    Copies the box coordinates and score through unchanged; the label is
    normalized so downstream consumers see the same canonical form that
    prompt parsing produces.
    """
    box = detection.box
    return BoundingBoxField(
        x_min=box.xmin,
        y_min=box.ymin,
        x_max=box.xmax,
        y_max=box.ymax,
        score=detection.score,
        label=normalize_grounding_dino_label(detection.label),
    )


@invocation(
Expand Down Expand Up @@ -49,23 +80,16 @@ class GroundingDinoInvocation(BaseInvocation):
def invoke(self, context: InvocationContext) -> BoundingBoxCollectionOutput:
# The model expects a 3-channel RGB image.
image_pil = context.images.get_pil(self.image.image_name, mode="RGB")
labels = parse_grounding_dino_labels(self.prompt)
if len(labels) == 0:
return BoundingBoxCollectionOutput(collection=[])

detections = self._detect(
context=context, image=image_pil, labels=[self.prompt], threshold=self.detection_threshold
)
detections = self._detect(context=context, image=image_pil, labels=labels, threshold=self.detection_threshold)

# Convert detections to BoundingBoxCollectionOutput.
bounding_boxes: list[BoundingBoxField] = []
for detection in detections:
bounding_boxes.append(
BoundingBoxField(
x_min=detection.box.xmin,
x_max=detection.box.xmax,
y_min=detection.box.ymin,
y_max=detection.box.ymax,
score=detection.score,
)
)
bounding_boxes.append(detection_result_to_bounding_box(detection))
return BoundingBoxCollectionOutput(collection=bounding_boxes)

@staticmethod
Expand Down
220 changes: 220 additions & 0 deletions invokeai/frontend/web/public/locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,9 @@
"generation": {
"title": "Generation"
},
"faceDetailer": {
"title": "Detailer"
},
"image": {
"title": "Image"
},
Expand Down Expand Up @@ -1016,6 +1019,7 @@
"dypeScale": "$t(parameters.dypeScale)",
"dypeExponent": "$t(parameters.dypeExponent)",
"generationMode": "Generation Mode",
"detailer": "Detailer",
"geminiTemperature": "Gemini Temperature",
"geminiThinkingLevel": "Gemini Thinking Level",
"openaiQuality": "OpenAI Quality",
Expand Down Expand Up @@ -1616,6 +1620,119 @@
},
"cfgScale": "CFG Scale",
"cfgRescaleMultiplier": "CFG Rescale Multiplier",
"faceDetailer": {
"enabled": "Enable",
"quality": "Quality",
"adjusted": "Adjusted",
"detector": "Detector",
"detectorModel": "Detector Model",
"samModel": "SAM Model",
"targetPrompt": "Target",
"detectionThreshold": "Threshold",
"faceSelection": "Target Selection",
"targetSize": "Target Size",
"maxUpscale": "Max Upscale",
"maxProcessSize": "Max Process",
"cropPadding": "Crop Padding",
"maskExpand": "Mask Expand",
"maskFeather": "Mask Feather",
"denoiseMaskExpand": "Denoise Expand",
"denoiseMaskFeather": "Denoise Gradient",
"pasteMaskExpand": "Paste Expand",
"pasteMaskFeather": "Paste Feather",
"colorCorrectMode": "Color Match",
"faceId": "Target ID",
"minConfidence": "Confidence",
"padding": "Padding",
"strength": "Denoise",
"steps": "Steps",
"cfgScale": "CFG Scale",
"maskBlur": "Mask Blur",
"developerOptions": "Developer Options",
"debugOutput": "Debug Output",
"legacyDetector": "Legacy Detector",
"mediapipeLegacyWarning": "MediaPipe is a legacy face-only detector for development testing. It is not part of the supported DINO/SAM Detailer path.",
"effectiveProfile": "Profile",
"effectiveValue": "Effective: {{value}}",
"bodyProfileActive": "Body Profile Active",
"bodyProfileSummary": "Process {{process}} / Strength {{strength}} / Steps {{steps}} / CFG <= {{cfg}}",
"cropPaddingTooltip": "Smaller padding gives the target more processing pixels. Larger padding keeps more surrounding context.",
"tooltips": {
"strength": "How much the detail pass may regenerate the selected crop.",
"steps": "How many denoise steps to run for the detail pass.",
"faceSelection": "How to choose one target when the detector finds multiple matches.",
"faceId": "The zero-based target index used when Target Selection is Index.",
"detectionThreshold": "Minimum detector confidence. Lower values find more targets but can add false positives.",
"detector": "Choose the target detector used by the Detailer. DINO + SAM is the supported V1 path.",
"detectorModel": "GroundingDINO model used to find target boxes. Larger models may improve detection and run slower.",
"samModel": "SAM model used to segment the selected target. Larger models may improve masks and run slower.",
"targetSize": "Preferred processing size for small detected crops.",
"maxUpscale": "Maximum pixel scale factor allowed before denoising the crop.",
"maxProcessSize": "Maximum processing resolution for the detail pass.",
"denoiseMaskExpand": "Expands the detected mask before denoising.",
"denoiseMaskFeather": "Softens the denoise mask edge before the crop is regenerated.",
"pasteMaskExpand": "Expands the final paste region.",
"pasteMaskFeather": "Softens the final paste edge.",
"colorCorrectMode": "Optionally match the detailed crop colors to the original crop before paste.",
"cfgScale": "Prompt guidance used by the detail denoise pass.",
"minConfidence": "Minimum MediaPipe face confidence for the legacy detector.",
"padding": "Extra pixels around the MediaPipe face crop.",
"maskBlur": "Legacy MediaPipe mask blur used for the detail denoise mask."
},
"advancedGroups": {
"detection": "Detection",
"cropScale": "Crop & Scale",
"masksPaste": "Masks & Paste",
"denoise": "Denoise"
},
"targetPresets": {
"label": "Target",
"face": "Face",
"head": "Head",
"hands": "Hands",
"body": "Body",
"custom": "Custom"
},
"detectors": {
"groundedSam": "DINO + SAM",
"mediapipe": "MediaPipe (Legacy)"
},
"dinoModels": {
"grounding-dino-tiny": "GroundingDINO Tiny",
"grounding-dino-base": "GroundingDINO Base"
},
"samModels": {
"segment-anything-2-small": "SAM 2 Small",
"segment-anything-2-tiny": "SAM 2 Tiny",
"segment-anything-2-base": "SAM 2 Base",
"segment-anything-2-large": "SAM 2 Large",
"segment-anything-base": "SAM Base",
"segment-anything-large": "SAM Large",
"segment-anything-huge": "SAM Huge"
},
"qualities": {
"fast": "Fast",
"balanced": "Balanced",
"high": "High"
},
"qualityTooltips": {
"fast": "Fast applies visible speed-first starter settings, including the small SAM model.",
"balanced": "Balanced applies visible recommended starter settings, including the base SAM model.",
"high": "High applies visible quality-first starter settings, including the large SAM model."
},
"faceSelections": {
"highestScore": "Highest Score",
"largestArea": "Largest Area",
"index": "Index"
},
"colorCorrectModes": {
"off": "Off",
"luma": "Luma",
"chroma": "Chroma",
"ycbcr": "YCbCr",
"rgb": "RGB"
}
},
"clipSkip": "CLIP Skip",
"coherenceMode": "Mode",
"coherenceEdgeSize": "Edge Size",
Expand Down Expand Up @@ -2107,6 +2224,109 @@
"• Lower Weight (0-.75): Creates a smaller impact on the final result."
]
},
"detailerQuality": {
"heading": "Detailer Quality",
"paragraphs": [
"Applies visible starter settings for the detail pass.",
"Fast favors speed, Balanced is the recommended default, and High uses larger mask models and processing sizes."
]
},
"detailerDenoisingStrength": {
"heading": "Denoising Strength",
"paragraphs": [
"Controls how much the selected crop may change during the detail pass.",
"Lower values preserve the original image more closely. Higher values repaint more strongly."
]
},
"detailerSteps": {
"heading": "Steps",
"paragraphs": ["Number of denoising steps used for the detail pass."]
},
"detailerTargetSelection": {
"heading": "Target Selection",
"paragraphs": ["Controls how one target is selected when the detector finds multiple matches."]
},
"detailerTargetId": {
"heading": "Target ID",
"paragraphs": ["Zero-based target index used when Target Selection is set to Index."]
},
"detailerDetectionThreshold": {
"heading": "Threshold",
"paragraphs": [
"Minimum detector confidence for candidate targets.",
"Lower values may find more targets, but can also add false positives."
]
},
"detailerDetector": {
"heading": "Detector",
"paragraphs": ["Selects the detector path. DINO + SAM is the supported Detailer path for this version."]
},
"detailerDinoModel": {
"heading": "DINO Model",
"paragraphs": ["GroundingDINO model used to find target boxes. Larger models may improve detection and run slower."]
},
"detailerSamModel": {
"heading": "SAM Model",
"paragraphs": ["Segmentation model used to create the target mask. Larger models may improve masks and run slower."]
},
"detailerTargetSize": {
"heading": "Target Size",
"paragraphs": ["Preferred processing size for small detected crops before denoising."]
},
"detailerMaxUpscale": {
"heading": "Max Upscale",
"paragraphs": ["Maximum pixel resize factor allowed before the crop is denoised."]
},
"detailerMaxProcess": {
"heading": "Max Process",
"paragraphs": ["Maximum processing resolution for the detail pass. This is a hard safety cap."]
},
"detailerCropPadding": {
"heading": "Crop Padding",
"paragraphs": [
"Extra context added around the detected target.",
"Smaller padding gives the target more processing pixels. Larger padding preserves more surrounding context."
]
},
"detailerDenoiseMaskExpand": {
"heading": "Denoise Expand",
"paragraphs": ["Expands the detected mask before the detail denoise pass."]
},
"detailerDenoiseMaskFeather": {
"heading": "Denoise Gradient",
"paragraphs": ["Softens the denoise mask edge used by the detail pass."]
},
"detailerPasteMaskExpand": {
"heading": "Paste Expand",
"paragraphs": ["Expands the final paste region used to composite the detailed crop back into the image."]
},
"detailerPasteMaskFeather": {
"heading": "Paste Feather",
"paragraphs": ["Softens the final paste edge when compositing the detailed crop back into the image."]
},
"detailerColorMatch": {
"heading": "Color Match",
"paragraphs": ["Optionally matches the detailed crop colors to the original crop before paste."]
},
"detailerCfgScale": {
"heading": "CFG Scale",
"paragraphs": [
"Prompt guidance used by the detail denoise pass.",
"Lower values usually preserve the original crop better. High values can overcook small inpainted regions."
]
},
"detailerMediapipeConfidence": {
"heading": "Confidence",
"paragraphs": ["Minimum MediaPipe face confidence for the legacy detector."]
},
"detailerMediapipePadding": {
"heading": "Padding",
"paragraphs": ["Extra pixels around the MediaPipe face crop."]
},
"detailerMediapipeMaskBlur": {
"heading": "Mask Blur",
"paragraphs": ["Legacy MediaPipe mask blur used for the detail denoise mask."]
},
"dynamicPrompts": {
"heading": "Dynamic Prompts",
"paragraphs": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,28 @@ export type Feature =
| 'controlNetProcessor'
| 'controlNetResizeMode'
| 'controlNetWeight'
| 'detailerCfgScale'
| 'detailerColorMatch'
| 'detailerCropPadding'
| 'detailerDenoiseMaskExpand'
| 'detailerDenoiseMaskFeather'
| 'detailerDenoisingStrength'
| 'detailerDetectionThreshold'
| 'detailerDetector'
| 'detailerDinoModel'
| 'detailerMaxProcess'
| 'detailerMaxUpscale'
| 'detailerMediapipeConfidence'
| 'detailerMediapipeMaskBlur'
| 'detailerMediapipePadding'
| 'detailerPasteMaskExpand'
| 'detailerPasteMaskFeather'
| 'detailerQuality'
| 'detailerSamModel'
| 'detailerSteps'
| 'detailerTargetId'
| 'detailerTargetSelection'
| 'detailerTargetSize'
| 'dynamicPrompts'
| 'dynamicPromptsMaxPrompts'
| 'dynamicPromptsSeedBehaviour'
Expand Down
Loading
Loading