
Commit af36fe7

Merge branch 'main' of ssh://github.com/Lightning-AI/lightning-thunder into make-symbolic-default
2 parents: 172320d + fb989d4

16 files changed: +243 -68 lines

thunder/benchmarks/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2081,7 +2081,7 @@ def fn(self) -> Callable:
         from litgpt.model import CausalSelfAttention
 
         module = (
-            CausalSelfAttention(self.config)
+            CausalSelfAttention(self.config, 0)
             .to(device=self.device, dtype=self.tdtype)
             .requires_grad_(self.requires_grad)
         )

thunder/benchmarks/benchmark_litgpt.py

Lines changed: 4 additions & 5 deletions
@@ -691,7 +691,10 @@ def setup_compile(self, model):
                executors.insert(0, transformer_engine_ex)
                transforms.insert(0, TransformerEngineTransform())
 
-            if "dynamo" in self.compile:
+            if "jit" in self.compile:
+                model = thunder.jit(model, executors=executors, transforms=transforms, **jit_options)
+
+            else:
                if self.distributed_mode == "fsdp2":
                    print("Resetting cache size for when fsdp2 and using thunder as backend torch.compile")
                    import torch._dynamo.config as dynamo_config
@@ -704,10 +707,6 @@ def setup_compile(self, model):
                # using __wrapped__ to access the original torch.compile function did not work
                # so we are using the lower level torch._dynamo.optimize function
                model = torch._dynamo.optimize(backend=self.backend)(model)
-            else:
-                jit_options = {}
-                jit_options["fp8_shard_intermediate_activation"] = self.fp8_shard_intermediate_activation
-                model = thunder.jit(model, executors=executors, transforms=transforms, **jit_options)
        elif self.compile != "eager":
            raise ValueError(f"Invalid compile option: {self.compile}")
 
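As an aside, a minimal sketch (not taken from the benchmark itself) of the dispatch this change introduces: the compile string now selects thunder.jit directly when it contains "jit", and everything else in the Thunder branch goes through the torch._dynamo path. The function name compile_model and its parameters are illustrative assumptions; in the real setup_compile, executors, transforms, jit_options and self.backend are built elsewhere in the method.

import thunder
import torch


def compile_model(model, compile_option: str, backend, executors=None, transforms=None, jit_options=None):
    # Hypothetical simplification of setup_compile's selection logic.
    jit_options = dict(jit_options or {})
    if "jit" in compile_option:
        # Hand the module to Thunder's JIT directly.
        return thunder.jit(model, executors=executors, transforms=transforms, **jit_options)
    # Otherwise use the dynamo entry point; `backend` is assumed to be a Thunder-backed
    # torch.compile backend (e.g. a ThunderCompiler instance).
    return torch._dynamo.optimize(backend=backend)(model)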

thunder/core/interpreter.py

Lines changed: 7 additions & 0 deletions
@@ -39,6 +39,8 @@
     TracebackType,
 )
 
+import torch
+
 from thunder.core.baseutils import Singleton, init_colors, extract_callable_name, is_likely_from_collections_namedtuple
 from thunder.core.codeutils import Positions
 
@@ -399,6 +401,11 @@ def __init__(
         if with_provenance_tracking:
             assert isinstance(uncacheable_classes, (list, tuple))
             uncacheable_classes = tuple(set(uncacheable_classes) | {NoneType, int, str, float, bool, complex})
+        if uncacheable_classes is None:
+            uncacheable_classes = ()
+        uncacheable_classes = tuple(
+            set(uncacheable_classes) | {NoneType, int, str, float, bool, complex, torch.Tensor}
+        )
 
         self._uncacheable_classes = uncacheable_classes
 
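With this change (and the jit_ext.py hunk below), callers no longer have to pass uncacheable_classes themselves: the interpreter always folds a default set, now including torch.Tensor, into whatever it is given. A rough standalone sketch of that normalization, assuming Python 3.10+ for types.NoneType; the helper name and _DEFAULT_UNCACHEABLE are illustrative, not interpreter API.

from types import NoneType

import torch

# Assumed default set, mirroring the classes listed in the hunk above.
_DEFAULT_UNCACHEABLE = {NoneType, int, str, float, bool, complex, torch.Tensor}


def normalize_uncacheable_classes(uncacheable_classes=None):
    # None means "no extra classes"; the defaults are always included.
    if uncacheable_classes is None:
        uncacheable_classes = ()
    return tuple(set(uncacheable_classes) | _DEFAULT_UNCACHEABLE)


# User-supplied extras are merged with the always-on defaults.
print(normalize_uncacheable_classes([dict]))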

thunder/core/jit_ext.py

Lines changed: 0 additions & 1 deletion
@@ -2199,7 +2199,6 @@ def thunder_general_jit(
         callbacks=general_jit_callbacks,
         with_provenance_tracking=True,
         unwrap_result=False,
-        uncacheable_classes=(torch.Tensor, int, float, str, NoneType),
         record_history=compile_data.debug_options.record_interpreter_history,
     )
 
thunder/core/update_aliases.py

Lines changed: 22 additions & 7 deletions
@@ -50,10 +50,8 @@ def _is_view_creation_op(bsym):
     return bsym.sym in ltorch._syms_returning_views or bsym.sym in ltorch._syms_that_may_return_views
 
 
-def _involves_viewed_args(bsym, viewed):
-    if bsym.sym.id == prims.PrimIDs.RETURN:
-        return False
-    return any(isinstance(p, TensorProxy) and variableify(p) in viewed for p in bsym.flat_proxy_args)
+def _involves_viewed_args(in_tensors, viewed):
+    return bool(in_tensors.intersection(viewed))
 
 
 def _can_be_reshaped(arg, arg_to_replace):
@@ -131,6 +129,17 @@ def replace_args_with_alias_map(
     return no_implicit_alias_trace, view_groups
 
 
+def _unswap(swap_map, aliases):
+    reversed_swap_map = {variableify(v): unvariableify(k) for k, v in swap_map.items()}
+
+    def _helper(alias):
+        while (valias := variableify(alias)) in reversed_swap_map:
+            alias = reversed_swap_map[valias]
+        return variableify(alias)
+
+    return list(map(_helper, aliases))
+
+
 def insert_alias_updates(computation_trace: Trace, alias_tensor_indices: list[list[int]]) -> Trace:
     if not any(_is_inplace_op(bsym) for bsym in computation_trace.bound_symbols):
         return computation_trace
@@ -166,15 +175,21 @@ def insert_alias_updates(computation_trace: Trace, alias_tensor_indices: list[li
 
     # Third pass: insert alias updates
     for bsym in computation_trace.bound_symbols:
-        if _is_inplace_op(bsym) or _is_view_creation_op(bsym) or _involves_viewed_args(bsym, viewed):
-            in_tensors = list(map(variableify, filter(lambda p: isinstance(p, TensorProxy), bsym.flat_proxy_args)))
+        in_tensors = list(map(variableify, filter(lambda p: isinstance(p, TensorProxy), bsym.flat_proxy_args)))
+        unswapped_in_tensors = _unswap(swap_map, in_tensors)
+        if (
+            _is_inplace_op(bsym)
+            or _is_view_creation_op(bsym)
+            or (bsym.sym.id != prims.PrimIDs.RETURN and _involves_viewed_args(set(unswapped_in_tensors), viewed))
+        ):
             if _is_inplace_op(bsym) and in_tensors:
                 in_tensors = {in_tensors[0]}
+                unswapped_in_tensors = {unswapped_in_tensors[0]}
             else:
                 in_tensors = set(in_tensors)
             out_tensors = set(map(variableify, filter(lambda p: isinstance(p, TensorProxy), bsym.flat_proxy_outs)))
             encountered.update(in_tensors)
-            group = set(reduce(set.union, filter(lambda g: any(g.intersection(in_tensors)), view_groups), set()))
+            group = set().union(*filter(lambda g: g.intersection(unswapped_in_tensors), view_groups))
             if not group or not (views_encountered := group.intersection(encountered)):
                 # If group is empty, this is a view creation with operands that are not involved in any inplace ops.
                 bsyms.append(bsym.from_bsym_swap_proxies(swap_map, skip_output=True))
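To illustrate what the new _unswap helper does, here is a self-contained toy that uses plain strings in place of Thunder's variableified proxies (an assumption made purely for readability): a swap map records original-to-replacement renames, and _unswap walks each name backwards through the reversed map until it reaches the proxy the trace originally used.

# Toy model of the back-walk in _unswap; the real code operates on Variable/TensorProxy objects.
swap_map = {"t0": "t0_updated", "t0_updated": "t0_final"}  # original -> replacement, possibly chained
reversed_swap_map = {v: k for k, v in swap_map.items()}


def unswap_one(name):
    # Follow replacement -> original links until no entry matches.
    while name in reversed_swap_map:
        name = reversed_swap_map[name]
    return name


assert [unswap_one(n) for n in ["t0_final", "x"]] == ["t0", "x"]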

thunder/dynamo/benchmark_utils.py

Lines changed: 7 additions & 1 deletion
@@ -150,7 +150,7 @@ def compile(self, fn, *, inputs, **kwargs):
 
     # to_source will always use symbolic trace
     def to_source(self, fn_name):
-        return f"TorchInductorSpecification.torch_inductor({fn_name}, inputs)"
+        return f"TorchInductorSpecification.torch_inductor({fn_name}, inputs, skip_symbolic_trace={self.skip_symbolic_trace})"
 
     def import_str(self):
         return ["import torch", "from thunder.dynamo.benchmark_utils import TorchInductorSpecification"]
@@ -353,6 +353,12 @@ def time(self, stmt="pass", setup="pass", globals=None) -> Measurement:
             Measurement: A benchmarking result containing execution time statistics, see :class:`torch.utils.benchmark.utils.common.Measurement`.
         """
         t = TorchBenchmarkTimer(stmt=stmt, setup=setup, globals=globals, timer=self.inner_timer)
+        # If the timer measures an extremely short execution time, adaptive_autorange may hang.
+        # To prevent this, we perform a preliminary run to check for such cases, e.g. measure kernel time on a cpu-only graph.
+        # If detected, we return the time of a single run, avoiding potential hangs.
+        pre_run = t.timeit(1)
+        if pre_run.median <= 1e-9:
+            return pre_run
         measurement = t.adaptive_autorange(
             threshold=self.threshold, min_run_time=self.min_run_time, max_run_time=self.max_run_time
         )
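The same guard can be reproduced with a plain torch.utils.benchmark.Timer. A minimal sketch, assuming the 1e-9 cutoff from the hunk above; the statement being timed and the threshold passed to adaptive_autorange are arbitrary illustrative choices, not the values the class above uses.

import torch
from torch.utils.benchmark import Timer


def safe_time(stmt="pass", setup="pass", globals=None):
    t = Timer(stmt=stmt, setup=setup, globals=globals)
    # A near-zero measurement suggests there is nothing meaningful to time
    # (e.g. a kernel timer on a CPU-only graph); adaptive_autorange could spin, so bail out early.
    pre_run = t.timeit(1)
    if pre_run.median <= 1e-9:
        return pre_run
    return t.adaptive_autorange(threshold=0.1)


measurement = safe_time(stmt="torch.ones(128) @ torch.ones(128)", globals={"torch": torch})
print(measurement)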

thunder/dynamo/report.py

Lines changed: 57 additions & 23 deletions
@@ -520,7 +520,7 @@ def write_repro(
         code_str = f"{code_str}\n{main_code.format(graph_name=self.graph_name)}\n{comment_str}"
 
         if file_name is None:
-            file_name = f"{self.graph_name}.py"
+            file_name = f"{self.graph_name}_{compile_fn.name}_repro.py"
         with open(folder / file_name, "w") as f:
             print(code_str, file=f)
         format_python_file(folder / file_name)
@@ -633,7 +633,7 @@ def write_benchmark(
 
         code_str = f"{code_str}\n{main_code.format(graph_name=self.graph_name)}\n{comment_str}"
         if file_name is None:
-            file_name = f"{self.graph_name}.py"
+            file_name = f"{self.graph_name}_{compile_fn.name}_{time_fn.name}_benchmark.py"
         with open(folder / file_name, "w") as f:
             print(code_str, file=f)
         format_python_file(folder / file_name)
@@ -924,7 +924,7 @@ def write_nvfuser_benchmark(self, folder, time_fn: TimerInterface, file_name=Non
 {comment_str}
 """
         if file_name is None:
-            file_name = f"{self.name}_benchmark_nvfuser.py"
+            file_name = f"{self.name}_benchmark_nvfuser_{time_fn.name}.py"
         with open(folder / file_name, "w") as f:
             print(code_str, file=f)
         format_python_file(folder / file_name)
@@ -983,7 +983,7 @@ def write_inductor_benchmark(self, folder: PathLike, time_fn: TimerInterface, fi
     print(measurement)
 """
         if file_name is None:
-            file_name = f"{self.name}_benchmark_inductor.py"
+            file_name = f"{self.name}_benchmark_inductor_{time_fn.name}.py"
         with open(folder / file_name, "w") as f:
             f.write(code_str)
         format_python_file(folder / file_name)
@@ -1428,22 +1428,39 @@ def save_thunderfx_repros(
     Saves reproduction scripts for ThunderFX subgraphs.
 
     This function:
-    1. Creates a folder structure to organize the repros
-        .
-        └── graph0
-            ├── fusion_reports
-            │   ├── graph0_thunder_0_nvFusion0_forward_repro_nvfuser.py
-            │   ├── graph0_thunder_0_nvFusion1_forward_repro_nvfuser.py
-            │   ├── graph0_thunder_0_nvFusion2_backward_repro_nvfuser.py
-            ├── graph0_thunder_0_bwd_trace.py
-            ├── graph0_thunder_0_fwd_trace.py
-            └── graph0_thunder_0.py
+    1. Creates a folder structure to organize the repro or benchmark scripts:
+
+       If use_benchmark is True:
+        graph0/
+        ├── fusion_reports/
+        │   ├── graph0_thunder_0_nvFusion0_forward_benchmark_inductor_KernelTime.py
+        │   ├── graph0_thunder_0_nvFusion0_forward_benchmark_inductor_WallTimeWithMemoryUsage.py
+        │   ├── graph0_thunder_0_nvFusion0_forward_benchmark_nvfuser_KernelTime.py
+        │   └── graph0_thunder_0_nvFusion0_forward_benchmark_nvfuser_WallTimeWithMemoryUsage.py
+        ├── graph0_repro_torchcompile.py
+        ├── graph0_thunder_0_bwd_trace.py
+        ├── graph0_thunder_0_fwd_trace.py
+        ├── graph0_thunder_0_inductor_KernelTime_benchmark.py
+        ├── graph0_thunder_0_inductor_WallTimeWithMemoryUsage_benchmark.py
+        ├── graph0_thunder_0_thunder_KernelTime_benchmark.py
+        └── graph0_thunder_0_thunder_WallTimeWithMemoryUsage_benchmark.py
+
+       If use_benchmark is False:
+        graph0/
+        ├── fusion_reports/
+        │   ├── graph0_thunder_0_nvFusion0_forward_repro_inductor.py
+        │   └── graph0_thunder_0_nvFusion0_forward_repro_nvfuser.py
+        ├── graph0_repro_torchcompile.py
+        ├── graph0_thunder_0_fwd_trace.py
+        ├── graph0_thunder_0_bwd_trace.py
+        ├── graph0_thunder_0_inductor_repro.py
+        └── graph0_thunder_0_thunder_repro.py
 
     2. For each Thunder FX graph and its subgraphs:
-        - Checks runnability if requested
-        - Saves benchmark or repro scripts
-        - Saves trace information if requested
-        - Saves nvFusion repros if requested
+       - Checks runnability if requested
+       - Saves benchmark or repro scripts
+       - Saves trace information if requested
+       - Saves nvFusion repros if requested
 
     Args:
         fn: The callable to analyze
@@ -1452,7 +1469,7 @@ def save_thunderfx_repros(
         check_runnability: If True, checks if graphs can run with Thunder
         save_fusion: If True, saves nvFusion repros
        save_trace: If True, saves trace information
-        stream: Stream to write output log informationto
+        stream: Stream to write output log information to
         force_overwrite: If True, overwrites existing folder at folder_path
         **compile_kwargs: Keyword arguments for Thunder and torch.compile
 
@@ -1472,6 +1489,7 @@ def inner_fn(*args, **kwargs):
         for thunder_fxgraph_report in thunder_fxgraph_reports:
             graph_folder = folder_path / thunder_fxgraph_report.graph_name
             graph_folder.mkdir(exist_ok=True, parents=True)
+            thunder_fxgraph_report.write_inductor_repro(graph_folder)
             for split_report in thunder_fxgraph_report.subgraph_reports:
                 if check_runnability or save_trace or save_fusion:
                     try:
@@ -1484,22 +1502,38 @@ def inner_fn(*args, **kwargs):
                         continue
                     else:
                         stream.write(f"Successfully ran the {split_report.graph_name} using Thunder\n")
+
+                from torch._inductor.compile_fx import graph_returns_tuple
+
+                # torch._inductor.compile requires the output to be tuple, if not, the symbolic trace is necessary
+                skip_symbolic_trace = graph_returns_tuple(split_report.graph)
+                torchinductor = TorchInductorSpecification(skip_symbolic_trace=skip_symbolic_trace)
                 if use_benchmark:
-                    split_report.write_benchmark(graph_folder, thunderjit, WallTime)
+                    split_report.write_benchmark(graph_folder, thunderjit, WallTimeWithMemoryUsage)
+                    split_report.write_benchmark(graph_folder, thunderjit, KernelTime)
+
+                    split_report.write_benchmark(graph_folder, torchinductor, WallTimeWithMemoryUsage)
+                    split_report.write_benchmark(graph_folder, torchinductor, KernelTime)
                 else:
                     split_report.write_repro(graph_folder, thunderjit)
+                    split_report.write_repro(graph_folder, torchinductor)
                 if save_trace:
                     with open(graph_folder / f"{split_report.graph_name}_fwd_trace.py", "w") as f:
                         f.write(str(split_report.fwd_trc))
-                    with open(graph_folder / f"{split_report.graph_name}_bwd_trace.py", "w") as f:
-                        f.write(str(split_report.bwd_trc))
+                    if split_report.bwd_trc is not None:
+                        with open(graph_folder / f"{split_report.graph_name}_bwd_trace.py", "w") as f:
+                            f.write(str(split_report.bwd_trc))
                 if save_fusion:
                     fusion_folder = graph_folder / "fusion_reports"
                     fusion_folder.mkdir(exist_ok=True, parents=True)
                     for fusion_report in split_report.fusion_reports:
                         if use_benchmark:
-                            fusion_report.write_nvfuser_benchmark(fusion_folder, WallTime)
+                            fusion_report.write_nvfuser_benchmark(fusion_folder, WallTimeWithMemoryUsage)
+                            fusion_report.write_inductor_benchmark(fusion_folder, WallTimeWithMemoryUsage)
+                            fusion_report.write_nvfuser_benchmark(fusion_folder, KernelTime)
+                            fusion_report.write_inductor_benchmark(fusion_folder, KernelTime)
                         else:
                             fusion_report.write_nvfuser_repro(fusion_folder)
+                            fusion_report.write_inductor_repro(fusion_folder)
 
     return inner_fn
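The graph_returns_tuple check added above decides whether the Inductor benchmark spec can skip its own symbolic-trace step. A rough, standalone sketch of that decision on a toy torch.fx graph; the module below and the expectation noted in the comment are illustrative assumptions, not code from the report module.

import torch
from torch._inductor.compile_fx import graph_returns_tuple


class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1  # single tensor output, not a tuple


gm = torch.fx.symbolic_trace(AddOne())
# If the traced graph already returns a tuple, the spec can pass skip_symbolic_trace=True;
# otherwise it retraces so that torch._inductor.compile sees a tuple-returning graph.
skip_symbolic_trace = graph_returns_tuple(gm)
print(skip_symbolic_trace)  # expected to be False here, since forward returns a bare tensor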

thunder/numpy/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from numbers import Number
 from collections.abc import Callable
 
-from thunder.core.langctx import langctx, Languages
+from thunder.core.langctxs import langctx, Languages
 from thunder.numpy.langctx import register_method
 
 from thunder.core.proxies import TensorProxy

thunder/tests/opinfos.py

Lines changed: 1 addition & 1 deletion
@@ -6687,7 +6687,7 @@ def arange_sample_generator(op, device, dtype, requires_grad, **kwargs):
     )
 
     for case in partial_cases:
-        yield SampleInput(*case)
+        yield SampleInput(*case, dtype=dtype, device=device)
 
 
 arange_opinfo = OpInfo(
