47 commits
9d4110e  Register methods to NumberProxy (shino16, Nov 30, 2025)
23a677b  Register Python's scalar ops for pythonex (shino16, Nov 30, 2025)
68a5bcc  Python's floor/ceil/round/trunc returns int (shino16, Nov 30, 2025)
88d971f  Support all-number inputs to where, register to pythonex (shino16, Nov 30, 2025)
0d4d1db  Add tests, recompile when number type changes (shino16, Nov 30, 2025)
40f4e7e  Add test_where_on_numbers (shino16, Nov 30, 2025)
13a2d01  Reduce test time (shino16, Dec 1, 2025)
6c925fa  Do not cache ComplexProxy, test on where (shino16, Dec 1, 2025)
135a835  Reflect INT_FOR_NUMBER behavior in nvfuserex (shino16, Dec 2, 2025)
592b3de  Add py_min, py_max (shino16, Nov 29, 2025)
c55ac4a  Add torch.sym_min/max (shino16, Nov 29, 2025)
522b3f5  Add test (shino16, Nov 29, 2025)
3b73a42  Add builtin min/max lookaside (shino16, Nov 29, 2025)
e2578e3  Add test (shino16, Nov 29, 2025)
df72dad  Remove py_max/min and merge into prims.maximum (shino16, Dec 2, 2025)
2806c40  Fix up (shino16, Dec 2, 2025)
37bfd92  Apply DCE to subsymbols (beverlylytle, Oct 16, 2025)
5261098  try in symbol.__call__ instead (beverlylytle, Nov 12, 2025)
681492f  come on, ruff, it's a test (beverlylytle, Nov 14, 2025)
6af4d20  remove print (beverlylytle, Nov 18, 2025)
10d0564  respond to comments (beverlylytle, Nov 21, 2025)
2b7d2b3  where's my coffee (beverlylytle, Nov 21, 2025)
d9e4f9c  MAKE SYMBOLIC VALUES DEFAULT (shino16, Dec 10, 2025)
8293bd5  Shift bsym indices (shino16, Dec 3, 2025)
2e0015f  Add torch.set_autocast_enabled (shino16, Dec 3, 2025)
211a2a4  Add StringProxy.__bool__ (shino16, Dec 3, 2025)
97a0032  Defer cotangents creation after forward pass (shino16, Dec 3, 2025)
afb6a5a  .numel needs a preceding prims.shape (shino16, Dec 3, 2025)
77d15b4  Prologue can't be skipped (shino16, Dec 3, 2025)
a262150  Support set_grad_enabled(symbolic) (shino16, Dec 3, 2025)
b9df532  Support isinstance(symbolic, type) (shino16, Dec 3, 2025)
06baafb  a.numel() instead of a.numel (shino16, Dec 3, 2025)
ec8ce20  Remove _take_check numel check (shino16, Dec 3, 2025)
b965ef9  FIXME Allow a cycle in swap_map in OpExProcessor (shino16, Dec 3, 2025)
ae5806e  Adjust test (shino16, Dec 6, 2025)
4b8312f  Adjust test (shino16, Dec 6, 2025)
a78049d  Adjust test (shino16, Dec 6, 2025)
b1013ba  Adjust test (shino16, Dec 6, 2025)
dab3ad2  Remove obsolete xfail mark (shino16, Dec 6, 2025)
18d34b3  Do not treat str as symbolic values (shino16, Dec 6, 2025)
634de48  Adjust test (shino16, Dec 6, 2025)
c668c1c  Skip test (shino16, Dec 9, 2025)
829074c  Avoid len(a.shape) in executor checker (shino16, Dec 9, 2025)
d747849  Do not treat bool inputs symbolically (shino16, Dec 11, 2025)
fa642ac  Adjust test (shino16, Dec 11, 2025)
172320d  Update test (shino16, Dec 11, 2025)
b3a78f1  Merge branch 'main' of ssh://github.com/Lightning-AI/lightning-thunde… (shino16, Dec 15, 2025)
9 changes: 7 additions & 2 deletions thunder/__init__.py
@@ -15,7 +15,7 @@
# imports unused in this file, but referenced as thunder.* elsewhere
from thunder.common import trace
import thunder.core.devices as devices
from thunder.core.proxies import Proxy
from thunder.core.proxies import NumberProxy, Proxy

from thunder.common import (
CompileData,
@@ -895,7 +895,12 @@ def fn_(*args, **kwargs) -> Any:
result = call_epilogue(cache_entry, result, pro_to_epi)

# Reflect the state of is_grad_enabled, as its changes were tracked only inside Thunder
pytorch.set_grad_enabled(cd.is_grad_enabled)
is_grad_enabled = cd.is_grad_enabled
if isinstance(is_grad_enabled, NumberProxy):
# TODO: Verify this assumption
assert is_grad_enabled.is_static_constrained()
is_grad_enabled = is_grad_enabled.value
pytorch.set_grad_enabled(is_grad_enabled)

cs.last_computation = cache_entry.computation_fn
return result
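Note: a minimal standalone sketch of what this hunk does; is_static_constrained() and .value are the NumberProxy members used above, while the helper name itself is made up for illustration.

```python
from thunder.core.proxies import NumberProxy

def _resolve_grad_enabled(is_grad_enabled) -> bool:
    # Under symbolic values, CompileData may hold a NumberProxy here; only a
    # statically constrained proxy can be collapsed back to a plain Python bool.
    if isinstance(is_grad_enabled, NumberProxy):
        assert is_grad_enabled.is_static_constrained()
        is_grad_enabled = is_grad_enabled.value
    return bool(is_grad_enabled)
```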
14 changes: 11 additions & 3 deletions thunder/core/jit_ext.py
@@ -38,6 +38,7 @@
Proxy,
ProxyInterface,
ProxyTag,
StringProxy,
TensorProxy,
Variable,
is_proxy_name_available,
@@ -303,15 +304,17 @@ def proxify(self, value: WrappedValue) -> Any:
assert p.history is not None, f"{p.history}, {value.provenance} {type(p)}"

co: CACHE_OPTIONS = get_cache_option()
if co is CACHE_OPTIONS.CONSTANT_VALUES:
if co is CACHE_OPTIONS.CONSTANT_VALUES or isinstance(uvalue, bool):
if isinstance(uvalue, str):
self.add_constraint((clang.check_string_value, p, uvalue))
elif isinstance(uvalue, slice):
self.add_constraint((clang.check_slice_value, p, uvalue))
else:
self.add_constraint((clang.check_number_type_and_value, p, uvalue))
elif co is CACHE_OPTIONS.SYMBOLIC_VALUES:
if p is not uvalue:
if isinstance(uvalue, str):
self.add_constraint((clang.check_string_value, p, uvalue))
elif p is not uvalue:
self.add_constraint((clang.check_instance, p, (type(uvalue),)))
value.register_proxy(p)
elif co not in (CACHE_OPTIONS.SAME_INPUT, CACHE_OPTIONS.NO_CACHING):
@@ -468,6 +471,8 @@ def _general_jit_getattr_lookaside(obj: Any, name: str, *maybe_default: Any):

@register_general_jit_lookaside(isinstance)
def _general_jit_isinstance_lookaside(obj: Any, cls: type | UnionType | tuple[type | UnionType]):
from thunder.core.baseutils import check

uobj = unwrap(obj)
ucls = unwrap(cls)
if isinstance(uobj, TensorProxy):
@@ -479,6 +484,9 @@ def _general_jit_isinstance_lookaside(obj: Any, cls: type | UnionType | tuple[ty
ucls = (ucls,)
if torch.nn.Parameter in ucls:
res = issubclass(obj.python_typ, ucls)
elif isinstance(uobj, NumberProxy):
check(uobj.value is not None, lambda: "isinstance does not support NumberProxy with no value")
res = isinstance(uobj.value, ucls)
else:
res = isinstance(uobj, ucls)

@@ -642,7 +650,7 @@ def _general_jit_hasattr_lookaside(obj: Any, name: str):
def _general_jit_bool_lookaside(wrapped_x: Any) -> bool | INTERPRETER_SIGNALS:
assert isinstance(wrapped_x, WrappedValue)
# It doesn't feel right to insert constraints in bool lookaside, constraints here only applies when the bool value is used in control flow.
if isinstance(wrapped_x.value, NumberProxy):
if isinstance(wrapped_x.value, (NumberProxy, StringProxy)):
if wrapped_x.value.is_dynamic():
raise NotImplementedError(f"conversion to bool is not allowed on dynamic proxy={wrapped_x.value}")
wrapped_x.value.make_static_constrained()
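Note: together, these jit_ext changes let proxied scalar and string inputs answer isinstance checks and bool conversions through their wrapped values. A usage sketch, assuming the public thunder.jit entry point and the new symbolic-values default; the function body is illustrative only.

```python
import torch
import thunder

def fn(x, n):
    # Under symbolic values `n` is traced as a NumberProxy; the isinstance
    # lookaside now checks the wrapped value, so this branch behaves as in
    # plain Python.
    if isinstance(n, int):
        return x * n
    return x

jfn = thunder.jit(fn)
out = jfn(torch.randn(4), 3)
```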
2 changes: 1 addition & 1 deletion thunder/core/options.py
@@ -68,7 +68,7 @@ def _string_to_cache_option(s: str, /) -> None | CACHE_OPTIONS:
def resolve_cache_option(x: Any, /) -> CACHE_OPTIONS:
co: None | CACHE_OPTIONS
if x is None:
co = CACHE_OPTIONS.CONSTANT_VALUES
co = CACHE_OPTIONS.SYMBOLIC_VALUES
elif isinstance(x, CACHE_OPTIONS):
co = x
elif isinstance(x, str):
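Note: with this change, leaving the cache option unspecified now resolves to symbolic values. A usage sketch; the cache keyword and option strings mirror the existing CACHE_OPTIONS names, but treat the exact thunder.jit signature as an assumption.

```python
import torch
import thunder

def scale(x, s):
    return x * s

jfn = thunder.jit(scale)                                  # defaults to symbolic values now
jfn_const = thunder.jit(scale, cache="constant values")   # opt back into the previous default

x = torch.randn(4)
jfn(x, 2.0)
jfn(x, 3.0)  # reuses the symbolic trace; under "constant values" this would recompile
```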
9 changes: 7 additions & 2 deletions thunder/core/proxies.py
@@ -1275,7 +1275,7 @@ def _infer_tensor_properties(
else:
# deferred computation of numel
# TODO: similar to how `shape` is handled, this should be CSE or lifted for efficiency
_numel = lambda *args: reduce(operator.mul, _shape, 1)
_numel = lambda self: reduce(operator.mul, self.shape, 1)

# TODO Alias rank to ndim?
_ndim = len(_shape)
@@ -1465,7 +1465,7 @@ def __init__(
self._device,
self._dtype,
self._true_dtype,
self._numel,
_numel,
self._ndim,
self._requires_grad,
self._grad,
@@ -1482,6 +1482,11 @@ def __init__(
thunder_fsdp_padding_size,
)

if not using_symbolic_values():
self._numel = _numel
else:
self._numel = lambda self=self: _numel(self)

# NOTE The following properties DO NOT depend on the language context or record
# themselves into the trace, so they can be used when working with tensor proxies
# outside of a trace or language context
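Note: the point of the self-taking lambda is that numel is now derived from the proxy's current (possibly symbolic) shape at call time, not from the shape captured when the proxy was constructed. A toy illustration of the binding pattern, not Thunder code:

```python
from functools import reduce
import operator

class FakeProxy:
    # Stand-in for TensorProxy, only to show the deferred-numel binding.
    def __init__(self, shape):
        self.shape = shape
        # Same binding as the hunk above: a zero-argument callable that reads
        # self.shape when it is invoked, not when the proxy is constructed.
        self._numel = lambda self=self: reduce(operator.mul, self.shape, 1)

t = FakeProxy((2, 3))
assert t._numel() == 6
t.shape = (4, 5)          # shape refined later, e.g. under symbolic values
assert t._numel() == 20   # numel follows the updated shape
```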
2 changes: 1 addition & 1 deletion thunder/core/rematerialization.py
@@ -458,7 +458,7 @@ def rematerialize(trace: TraceCtx) -> TraceCtx:
computed_cuts_for_producers[producer] += cut

rematerialized_trace = from_trace(trace)
rematerialized_trace.bound_symbols = tuple(new_bsyms.get(bsym, bsym) for bsym in trace.bound_symbols)
rematerialized_trace.bound_symbols = list(new_bsyms.get(bsym, bsym) for bsym in trace.bound_symbols)

end_time_ns = time.perf_counter_ns()
elapsed_time_ns = end_time_ns - start_time_ns
18 changes: 15 additions & 3 deletions thunder/core/symbol.py
@@ -350,6 +350,12 @@ def tag_tensorproxy_output_as_detached(proxy):
exception_type=AssertionError,
)

# When using symbolic values, there may be duplicate prims.eq and prims.shape subsymbols that can be removed.
from thunder.core.transform_common import dce_bsyms

subsymbols = dce_bsyms(subsymbols, result)
bsym = bsym.from_bsym(subsymbols=subsymbols)

symbols_list.append(bsym)
return result

@@ -447,6 +453,7 @@ def from_bsym_swap_proxies(
skip_inputs: bool = False,
skip_output: bool = False,
skip_subsymbols: bool = False,
allow_cycles: bool = False,
) -> BoundSymbol:
"""Create a new :class:`BoundSymbol` with its inputs, output, and subsymbols updated with ``swap_map``.

@@ -481,9 +488,14 @@ def swap(c):
while vfa in swap_map:
if swap_map[vfa] is fa:
break
baseutils.check(
vfa not in visited, lambda: f"Detected a cycle while swapping; the cycle includes {visited}"
)

if vfa in visited:
baseutils.check(
allow_cycles,
lambda: f"Detected a cycle while swapping; the cycle includes {visited}",
)
break

visited.add(vfa)

fa = swap_map[vfa]
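Note: a sketch of the kind of swap map the new allow_cycles flag tolerates; variableify and from_bsym_swap_proxies are the real names, while the two-proxy cycle and the helper are made up for illustration.

```python
from thunder.core.proxies import variableify

def swap_with_possible_cycle(bsym, a, b):
    # a and b are proxies that end up mapped to each other, for example after
    # an executor substitutes one for the other and back again. Previously this
    # tripped the "Detected a cycle while swapping" check unconditionally.
    swap_map = {variableify(a): b, variableify(b): a}
    # With allow_cycles=True the walk stops at the repeated entry instead of
    # raising; callers such as TraceSubstitutionProcessor opt in explicitly.
    return bsym.from_bsym_swap_proxies(swap_map, allow_cycles=True)
```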
9 changes: 5 additions & 4 deletions thunder/core/trace_interpreter.py
@@ -262,11 +262,12 @@ class TraceSubstitutionProcessor:

NULL = object()

def __init__(self, trace, *args, **kwargs):
def __init__(self, trace, allow_swap_map_cycles=False, *args, **kwargs):
self.env = {}
self.trace = trace
self.new_trace = from_trace(self.trace)
self.have_processed_args = False
self.allow_swap_map_cycles = allow_swap_map_cycles

def read(self, x: VariableInterface | Any) -> Any:
if isinstance(x, VariableInterface):
@@ -398,9 +399,9 @@ def __call__(self):
for new_bsym in self.new_bsyms:
# TODO: what to do with bsym header? Maybe have a combined from_bsym_swap_proxies and from_bsym?
self.new_trace.bound_symbols.append(
new_bsym.from_bsym_swap_proxies(self.swap_map).from_bsym(
source_filename=bsym.source_filename, source_positions=bsym.source_positions
)
new_bsym.from_bsym_swap_proxies(
self.swap_map, allow_cycles=self.allow_swap_map_cycles
).from_bsym(source_filename=bsym.source_filename, source_positions=bsym.source_positions)
)

result = tree_map(self.do_swap, self.replacement_result)
39 changes: 30 additions & 9 deletions thunder/core/transform_common.py
@@ -142,20 +142,32 @@ def keep_or_swap(p):
# that only produce non-proxy objects
# NOTE needed_proxies is an in/out argument, it takes an initial set of Variables you want to keep, and return
# all the needed proxies of the input trace
def dce(trace: Trace, needed_proxies: None | set[Variable] = None) -> Trace:
start_time_ns = time.perf_counter_ns()
def dce_bsyms(
bsyms: list[BoundSymbolInterface],
output: Any,
needed_proxies: None | set[Variable] = None,
) -> Trace | list[BoundSymbolInterface]:
"""Runs a Dead Code Elimination (DCE) pass

Args:
bsyms: The list of bound symbols to run the DCE pass on.
needed_proxies: The set of variables to keep.
output: The output of the list of bound symbols.

producer_map: ProxyDict = producers(trace)
Returns:
The list of bound symbols after the DCE pass.
"""
producer_map: ProxyDict = producers(bsyms)

flat_trace_outputs, _ = tree_flatten(trace.output)
flat_trace_outputs, _ = tree_flatten(output)
if needed_proxies is None:
needed_proxies: set[Variable] = set(tuple(variableify(x) for x in flat_trace_outputs if isinstance(x, Proxy)))
else:
needed_proxies.update(tuple(variableify(x) for x in flat_trace_outputs if isinstance(x, Proxy)))
dced = []

bsym: BoundSymbol
for bsym in reversed(trace.bound_symbols):
for bsym in reversed(bsyms):
# Preserves symbols that should never be collected
if has_tags(bsym, {prims.OpTags.DONT_DCE}):
needed = True
@@ -182,19 +194,28 @@ def dce(trace: Trace, needed_proxies: None | set[Variable] = None) -> Trace:
for x in nbsym.flat_proxy_args:
needed_proxies.add(variableify(x))

dcetrace = from_trace(trace)
dced_bound_symbols = list(reversed(dced))
# duplicate number proxies happen with the symbolic shapes and are
# not covered by the above (due to being in tuples?).
dced_bound_symbols = remove_duplicate_number_proxies(dced_bound_symbols)
dcetrace.bound_symbols = dced_bound_symbols

return dced_bound_symbols


def dce(trace: Trace, needed_proxies: set[Variable] = None) -> Trace:
start_time_ns = time.perf_counter_ns()

bsyms = trace.bound_symbols
dced_bsyms = dce_bsyms(bsyms, trace.output, needed_proxies)
result = from_trace(trace)
result.bound_symbols = dced_bsyms

end_time_ns = time.perf_counter_ns()
elapsed_time_ns = end_time_ns - start_time_ns
elapsed_time_millis = elapsed_time_ns // 1000000
dcetrace.set_provenance(TraceProvenance(f"Dead Code Elimination (took {elapsed_time_millis} milliseconds)"))

return dcetrace
result.set_provenance(TraceProvenance(f"Dead Code Elimination (took {elapsed_time_millis} milliseconds)"))
return result


#
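Note: a sketch of the new split, mirroring the hunk above: dce() stays the trace-level entry point and delegates to dce_bsyms(), which call sites that only hold a list of bound symbols (such as Symbol.__call__ earlier in this diff) can use directly.

```python
from thunder.core.trace import from_trace
from thunder.core.transform_common import dce_bsyms

def dce_trace(trace):
    # What dce() now does, minus the timing and provenance bookkeeping.
    dced = dce_bsyms(trace.bound_symbols, trace.output)
    result = from_trace(trace)
    result.bound_symbols = dced
    return result
```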
34 changes: 19 additions & 15 deletions thunder/core/transforms.py
@@ -3031,10 +3031,24 @@ def vjp_call(primals, cotangents, trace: Trace, **kwargs):
primals = (primals,)

result, env = augmented_forward_pass(*primals, trace=trace, **kwargs)
check(
len(result) == len(cotangents) if isinstance(result, Sequence) else True,
lambda: f"Expected cotangents to be a sequence of length {len(result)}, got a sequence of length {len(cotangents)}",
)

if cotangents is None:

def ones_like(x):
if isinstance(x, TensorProxy):
return full_like(x, fill_value=1)
elif isinstance(x, NumberProxy):
return type(x.value)(1)
else:
return None

cotangents = tree_map(lambda v: ones_like(v), result)
else:
check(
len(result) == len(cotangents) if isinstance(result, Sequence) else True,
lambda: f"Expected cotangents to be a sequence of length {len(result)}, got a sequence of length {len(cotangents)}",
)

return result, backward_pass(env, trace, cotangents)


@@ -3075,18 +3089,8 @@ def value_and_grad(func):
func (Callable): Function to be differentiated.
"""

def ones_like(x):
if isinstance(x, TensorProxy):
return full_like(x, fill_value=1)
elif isinstance(x, NumberProxy):
return type(x.value)(1)
else:
return None

def _value_and_grad(*args, **kwargs):
trace = construct_trace()(func, *args, **kwargs)
cotangents = tree_map(lambda v: ones_like(v), trace.output)
return vjp(func)(args, cotangents, **kwargs)
return vjp(func)(args, None, **kwargs)

return _value_and_grad
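Note: deferring cotangent creation means vjp can now be handed cotangents=None and will seed all-ones cotangents from the actual forward outputs, which is what _value_and_grad relies on above. A usage sketch; the thunder.jit wrapping and the output unpacking are assumptions.

```python
import torch
import thunder
from thunder.core.transforms import value_and_grad

def f(x):
    return (x * x).sum()

# value_and_grad no longer builds a throwaway trace just to shape the
# cotangents; vjp creates the all-ones seeds itself when given None.
jfn = thunder.jit(value_and_grad(f))
out, grads = jfn(torch.randn(3))
```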

15 changes: 9 additions & 6 deletions thunder/dynamo/utils.py
@@ -259,20 +259,20 @@ def get_backed_value(s):
return tuple(map(get_backed_value, vals))


def get_proxy_inputs_from_node(node: torch.fx.Node) -> tuple[tuple, dict]:
def get_proxy_inputs_from_node(node: torch.fx.Node, tracectx) -> tuple[tuple, dict]:
"""Creates proxy inputs from a torch.fx.Node for use with Thunder.

This function generates proxy inputs for a given torch.fx.Node

Args:
node (torch.fx.Node): The FX graph node to create proxy inputs for.
tracectx (TraceCtx): The trace context to use to generate proxy inputs.
"""
import thunder
from thunder.core.trace import TraceCtx
from thunder.core.proxies import proxy

# We need to be under trace context to generate proxies.
with thunder.core.trace.tracectx(TraceCtx()):
with thunder.core.trace.tracectx(tracectx):

def make_input_proxy(arg_node):
# This is a Node in the graph representing a Tensor or tuple of Tensors or
@@ -380,8 +380,10 @@ def _run_with_cache_info():
cache_info["default_dtype"] = torch.get_default_dtype()
cache_info["default_device"] = torch.get_default_device()

tracectx = TraceCtx()

try:
proxy_args, proxy_kwargs = get_proxy_inputs_from_node(node)
proxy_args, proxy_kwargs = get_proxy_inputs_from_node(node, tracectx)
except Exception as e:
return False, SplitReason(
SplitReasonType.EXCEPTION_PROXY_THUNDER_OP,
@@ -395,7 +397,7 @@
else thunder_symbol
)
# We need to be under trace context to generate proxies.
with thunder.core.trace.tracectx(TraceCtx()):
with thunder.core.trace.tracectx(tracectx):
try:
function_to_run(*proxy_args, **proxy_kwargs)
except Exception as e:
@@ -478,6 +480,7 @@ def is_node_supported_by_thunder(
"""
Determine whether thunder can execute the operation described by this node.
"""
from thunder.core.trace import TraceCtx
# Docs from the torch.fx.Node - https://pytorch.org/docs/stable/fx.html#torch.fx.Node
# Each Node has a function specified by its op property
# Below are the details for the ones this function is interested in -
@@ -555,7 +558,7 @@ def is_node_supported_by_thunder(
if torchctx.has_method(node.target):
# `torchctx.get_method` requires args and kwargs to resolve which overload of the method is picked.
try:
args, kwargs = get_proxy_inputs_from_node(node)
args, kwargs = get_proxy_inputs_from_node(node, TraceCtx())
except Exception as e:
return False, SplitReason(
SplitReasonType.EXCEPTION_PROXY_THUNDER_OP,
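Note: the motivation for threading tracectx through is that the input proxies and the trial run of the thunder symbol should live in one shared TraceCtx rather than in two fresh ones. A condensed sketch of the flow in _run_with_cache_info; node and function_to_run stand in for the values already in scope there.

```python
import thunder
from thunder.core.trace import TraceCtx
from thunder.dynamo.utils import get_proxy_inputs_from_node

def try_node(node, function_to_run):
    # One trace context is shared between proxy creation and the trial call.
    tracectx = TraceCtx()
    proxy_args, proxy_kwargs = get_proxy_inputs_from_node(node, tracectx)
    with thunder.core.trace.tracectx(tracectx):
        function_to_run(*proxy_args, **proxy_kwargs)
```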