From 855380cedbb7d25c2cc65187d3d0a7c017110acb Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Thu, 12 Mar 2026 09:20:41 +0100 Subject: [PATCH 1/2] Revert "[tmva][sofie] Fix NonZero to define max output shape values in Session ctor" This reverts commit 1f747b0bafd3a05e54217771a29a2cdb6b91da0f. The reason for the revert is that it's actually useful to have the maximum dynamic tensor size as a datamember of the Session, because then we can refactor the generated code such that it can be differentiated with Clad. --- tmva/sofie/inc/TMVA/ROperator.hxx | 4 +--- tmva/sofie/inc/TMVA/ROperator_NonZero.hxx | 8 ++++---- tmva/sofie/src/RModel.cxx | 5 ----- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/tmva/sofie/inc/TMVA/ROperator.hxx b/tmva/sofie/inc/TMVA/ROperator.hxx index 6fac7958f8f9d..f0afd9c4374c1 100644 --- a/tmva/sofie/inc/TMVA/ROperator.hxx +++ b/tmva/sofie/inc/TMVA/ROperator.hxx @@ -25,9 +25,7 @@ public: virtual std::vector TypeInference(std::vector) { return {}; }; virtual void Initialize(RModel&) = 0; virtual std::string Generate(std::string OpName) = 0; //expect unique opName for each operator within the same RModel - // generate code for Session constructor before tensor allocation - virtual std::string GenerateSessionCtorCode() { return "";} - // generate initialization code for session constructor after tensor allocations + // generate initialization code for session constructor virtual std::string GenerateInitCode() { return "";} // generate some specific declaration code for Session virtual std::string GenerateDeclCode() { return "";} diff --git a/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx b/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx index fdf04665e0315..1fcb9cb45e74d 100644 --- a/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx @@ -92,8 +92,8 @@ public: fShapeY.resize(2); fShapeY[0] = fShapeX.size(); - // flag -1 to define the shape variable in the constructor code and not in the constructor 
signature - fShapeY[1] = Dim{std::string("v_NonZero_") + fNX, static_cast(-1) }; + // identify as -1 since we will declare maximum as size of input + fShapeY[1] = Dim{std::string("v_NonZero_") + fNX, static_cast(-1)}; model.AddIntermediateTensor(fNY, ETensorType::INT64, fShapeY); if (model.Verbose()) { @@ -101,7 +101,7 @@ public: } } } - std::string GenerateSessionCtorCode() override { + std::string GenerateSessionMembersCode(std::string /*opName*/) override { if (fIsOutputConstant) return ""; // define output value used as max non zero with max size = input shape * N auto inputLength = ConvertDimShapeToLength(fShapeX); @@ -133,7 +133,7 @@ public: // loop on input indices out << SP << "size_t offset_" << opName << " = 0;\n"; - out << SP << "size_t " << vnonzero << " = 0;\n"; + out << SP << vnonzero << " = 0;\n"; for (size_t j = 0; j < dims; j++) { std::string index = "i_" + std::to_string(j); for (size_t k = 0; k <= j; k++) out << SP; diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 0e6a7d7cf9bbf..e89e8513cf783 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -1152,11 +1152,6 @@ void RModel::GenerateSessionCode() } fGC += ") {\n"; - // add some code required in session constructor - for (size_t id = 0; id < fOperators.size(); id++) { - fGC += fOperators[id]->GenerateSessionCtorCode(); - } - // initializing dynamic parameters if (!fDimShapeNames.empty()) { fGC += "\n\n"; From ac44395a6b2f756a88b59f9896f8322051d33617 Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Mon, 7 Apr 2025 13:42:12 +0200 Subject: [PATCH 2/2] [tmva][sofie] Restructure emitted code to be differentiable with Clad The idea of this commit is to refactor the `doInfer()` function that implements the inference from a member function of the `Session` struct to a free function that takes the `Session` by `const`-reference. 
This free function should only use the session struct and bare C-style arrays, so that Clad will have no problem differentiating it. A unit test for the differentiation of a simple MLP is implemented, embedded in the existing SOFIE tests. --- tmva/sofie/inc/TMVA/RModel.hxx | 4 +- tmva/sofie/inc/TMVA/ROperator_Gemm.hxx | 7 +- tmva/sofie/inc/TMVA/ROperator_LSTM.hxx | 121 +++++----- tmva/sofie/inc/TMVA/ROperator_NonZero.hxx | 6 +- tmva/sofie/inc/TMVA/ROperator_RNN.hxx | 46 ++-- tmva/sofie/inc/TMVA/SOFIE_common.hxx | 10 - tmva/sofie/src/RModel.cxx | 271 ++++++++++++++++++---- tmva/sofie/test/CMakeLists.txt | 14 +- tmva/sofie/test/TestCladAutodiff.cxx | 111 +++++++++ 9 files changed, 453 insertions(+), 137 deletions(-) create mode 100644 tmva/sofie/test/TestCladAutodiff.cxx diff --git a/tmva/sofie/inc/TMVA/RModel.hxx b/tmva/sofie/inc/TMVA/RModel.hxx index bc6493090f74e..3395de6d34166 100644 --- a/tmva/sofie/inc/TMVA/RModel.hxx +++ b/tmva/sofie/inc/TMVA/RModel.hxx @@ -35,8 +35,6 @@ private: std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order - - std::vector> fOperators; std::vector> fSubGraphs; /// CollectTensorMemberNames(const std::string &input); public: const std::vector & GetInputTensorNames() const { return fInputTensorNames; } diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index ecdd0b435fe37..83381baa39f0c 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -393,9 +393,12 @@ namespace SOFIE{ << (fAttrTransB ? "true, " : "false, ") << (fAttrTransA ? 
"true, " : "false, ") << n << ", " << m << ", " << k << ", "; - out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + // TODO: the cast to (float *) is not needed here from the C++ language perspective (the arguments to + // Gemm_Call are const already), but Clad bug https://github.com/vgvassilev/clad/issues/1721 is requiring + // us to do this cast to keep Clad working. Remove this hack once the Clad issue is fixed. + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", (float*)tensor_" << fNB; if (extraB) out << " + " << opName << "_B_offset"; - out << ", tensor_" << fNA; + out << ", (float*)tensor_" << fNA; // TODO: same here if (extraA) out << " + " << opName << "_A_offset"; out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; // in the case of bias and no broadcasting needed diff --git a/tmva/sofie/inc/TMVA/ROperator_LSTM.hxx b/tmva/sofie/inc/TMVA/ROperator_LSTM.hxx index 84f37bc57da7e..ae0ee70c4eeea 100644 --- a/tmva/sofie/inc/TMVA/ROperator_LSTM.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_LSTM.hxx @@ -390,45 +390,62 @@ std::string ROperator_LSTM::GenerateSessionMembersCode(std::string opName) size_t batch_size = (fAttrLayout == 0) ? 
fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + struct Block { + std::string name; + size_t size; + }; + + std::vector blocks; + + size_t ff_size = seq_length * batch_size * fAttrHiddenSize; + size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize; + + // Layout-dependent buffers if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + blocks.push_back({"input", seq_length * batch_size * input_size}); + blocks.push_back({"initial_hidden_state", num_directions * batch_size * fAttrHiddenSize}); + blocks.push_back({"initial_cell_state", num_directions * batch_size * fAttrHiddenSize}); } - // Set the feedforward - size_t ff_size = seq_length * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_ff_input_gate = std::vector<" << fType << ">(" << ff_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_ff_output_gate = std::vector<" << fType << ">(" << ff_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_ff_cell_gate = std::vector<" << fType << ">(" << ff_size - << ");\n"; + + // Feedforward gates + blocks.push_back({"ff_input_gate", ff_size}); + blocks.push_back({"ff_output_gate", ff_size}); + blocks.push_back({"ff_cell_gate", ff_size}); if (fAttrInputForget == 0) - out << "std::vector<" << fType << "> fVec_" << opName << "_ff_forget_gate = std::vector<" << fType << ">(" - << ff_size << ");\n"; - // gate results - size_t hs_size = seq_length * num_directions * batch_size * 
fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_input_gate = std::vector<" << fType << ">(" << hs_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_output_gate = std::vector<" << fType << ">(" << hs_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_cell_gate = std::vector<" << fType << ">(" << hs_size - << ");\n"; + blocks.push_back({"ff_forget_gate", ff_size}); + + // Gate outputs + blocks.push_back({"input_gate", hs_size}); + blocks.push_back({"output_gate", hs_size}); + blocks.push_back({"cell_gate", hs_size}); if (fAttrInputForget == 0) - out << "std::vector<" << fType << "> fVec_" << opName << "_forget_gate = std::vector<" << fType << ">(" << hs_size - << ");\n"; - // cell state - out << "std::vector<" << fType << "> fVec_" << opName << "_cell_state = std::vector<" << fType << ">(" << hs_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_new_cell_state = std::vector<" << fType << ">(" << hs_size - << ");\n"; - // hiddden state + blocks.push_back({"forget_gate", hs_size}); + + // Cell state + blocks.push_back({"cell_state", hs_size}); + blocks.push_back({"new_cell_state", hs_size}); + + // Hidden state (conditional) if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" - << hs_size << ");\n"; + blocks.push_back({"hidden_state", hs_size}); + } + + // Compute total size + size_t total_size = 0; + for (const auto &b : blocks) { + total_size += b.size; + } + + // Backing storage + out << "std::vector<" << fType << "> fVec_" << opName << "_buffer = std::vector<" << fType << ">(" << total_size + << ");\n"; + + // Emit pointers + std::size_t offset = 0; + for (const auto &b : blocks) { + out << fType << "* fVec_" << opName << "_" << b.name << " = fVec_" << opName << "_buffer.data() + " << offset + << ";\n"; + offset += b.size; } out << "\n"; @@ -452,7 +469,7 @@ 
auto ROperator_LSTM::Generate(std::string OpName) -> std::string out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_input = this->fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = this->fVec_" << OpName << "_input;\n"; else out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "] = {0};\n"; @@ -470,11 +487,11 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string // Set the initial hidden state if (!fNInitial_h.empty()) { if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_initial_hidden_state = " << " tensor_" << fNInitial_h << ";\n"; + out << SP << fType << " const*" << OpName << "_initial_hidden_state = " << " tensor_" << fNInitial_h << ";\n"; } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_initial_hidden_state = this->fVec_" << OpName - << "_initial_hidden_state.data();\n"; + out << SP << fType << " const* " << OpName << "_initial_hidden_state = this->fVec_" << OpName + << "_initial_hidden_state;\n"; else out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "] = {0};\n"; @@ -494,11 +511,11 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string // Set the initial cell state if (!fNInitial_c.empty()) { if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_initial_cell_state = " << " tensor_" << fNInitial_c << ";\n"; + out << SP << fType << " const*" << OpName << "_initial_cell_state = " << " tensor_" << fNInitial_c << ";\n"; } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_initial_cell_state = this->fVec_" << OpName - << "_initial_cell_state.data();\n"; + out << SP << fType << " const* " << OpName << "_initial_cell_state = this->fVec_" << OpName + << "_initial_cell_state;\n"; else out << SP << fType << " " << OpName << 
"_initial_cell_state[" << num_directions * batch_size * fAttrHiddenSize << "] = {0};\n"; @@ -518,12 +535,12 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string // Set the feedforward size_t ff_size = seq_length * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_ff_input_gate = this->fVec_" << OpName << "_ff_input_gate.data();\n"; - out << SP << fType << " * " << OpName << "_ff_output_gate = this->fVec_" << OpName << "_ff_output_gate.data();\n"; - out << SP << fType << " * " << OpName << "_ff_cell_gate = this->fVec_" << OpName << "_ff_cell_gate.data();\n"; + out << SP << fType << " * " << OpName << "_ff_input_gate = this->fVec_" << OpName << "_ff_input_gate;\n"; + out << SP << fType << " * " << OpName << "_ff_output_gate = this->fVec_" << OpName << "_ff_output_gate;\n"; + out << SP << fType << " * " << OpName << "_ff_cell_gate = this->fVec_" << OpName << "_ff_cell_gate;\n"; if (fAttrInputForget == 0) { out << SP << fType << " * " << OpName << "_ff_forget_gate = this->fVec_" << OpName - << "_ff_forget_gate.data();\n"; + << "_ff_forget_gate;\n"; } } else { out << SP << fType << " " << OpName << "_ff_input_gate[" << ff_size << "] = {0};\n"; @@ -536,11 +553,11 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string // Set the gates size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_input_gate = this->fVec_" << OpName << "_input_gate.data();\n"; - out << SP << fType << " * " << OpName << "_output_gate = this->fVec_" << OpName << "_output_gate.data();\n"; - out << SP << fType << " * " << OpName << "_cell_gate = this->fVec_" << OpName << "_cell_gate.data();\n"; + out << SP << fType << " * " << OpName << "_input_gate = this->fVec_" << OpName << "_input_gate;\n"; + out << SP << fType << " * " << OpName << "_output_gate = this->fVec_" << OpName << "_output_gate;\n"; + out << SP << fType << " * " << OpName 
<< "_cell_gate = this->fVec_" << OpName << "_cell_gate;\n"; if (fAttrInputForget == 0) { - out << SP << fType << " * " << OpName << "_forget_gate = this->fVec_" << OpName << "_forget_gate.data();\n"; + out << SP << fType << " * " << OpName << "_forget_gate = this->fVec_" << OpName << "_forget_gate;\n"; } } else { out << SP << fType << " " << OpName << "_input_gate[" << hidden_state_size << "] = {0};\n"; @@ -552,8 +569,8 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string } // Set the cell state and the new cell state = h(cell state) if (fUseSession) { - out << SP << fType << " * " << OpName << "_cell_state = this->fVec_" << OpName << "_cell_state.data();\n"; - out << SP << fType << " * " << OpName << "_new_cell_state = this->fVec_" << OpName << "_new_cell_state.data();\n"; + out << SP << fType << " * " << OpName << "_cell_state = this->fVec_" << OpName << "_cell_state;\n"; + out << SP << fType << " * " << OpName << "_new_cell_state = this->fVec_" << OpName << "_new_cell_state;\n"; } else { out << SP << fType << " " << OpName << "_cell_state[" << hidden_state_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_new_cell_state[" << hidden_state_size << "] = {0};\n"; @@ -564,7 +581,7 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_hidden_state = this->fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = this->fVec_" << OpName << "_hidden_state;\n"; } else { out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n"; } diff --git a/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx b/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx index 1fcb9cb45e74d..8587035f8d44b 100644 --- a/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx @@ -101,16 +101,16 @@ public: } } 
} + std::string GenerateSessionMembersCode(std::string /*opName*/) override { if (fIsOutputConstant) return ""; // define output value used as max non zero with max size = input shape * N auto inputLength = ConvertDimShapeToLength(fShapeX); std::stringstream out; - out << SP << "size_t v_NonZero_" << fNX << " = " << inputLength << ";\n"; + out << SP << "size_t fV_NonZero_" << fNX << " = " << inputLength << ";\n"; return out.str(); } - std::string Generate(std::string opName) override { if (fIsOutputConstant) { return ""; @@ -133,7 +133,7 @@ public: // loop on input indices out << SP << "size_t offset_" << opName << " = 0;\n"; - out << SP << vnonzero << " = 0;\n"; + out << SP << "size_t " << vnonzero << " = 0;\n"; for (size_t j = 0; j < dims; j++) { std::string index = "i_" + std::to_string(j); for (size_t k = 0; k <= j; k++) out << SP; diff --git a/tmva/sofie/inc/TMVA/ROperator_RNN.hxx b/tmva/sofie/inc/TMVA/ROperator_RNN.hxx index 0667047eed228..f385a502d4077 100644 --- a/tmva/sofie/inc/TMVA/ROperator_RNN.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_RNN.hxx @@ -308,18 +308,38 @@ std::string ROperator_RNN::GenerateSessionMembersCode(std::string opName) size_t batch_size = (fAttrLayout == 0) ? 
fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + struct Block { + std::string name; + size_t size; + }; + + std::vector blocks; + if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + blocks.push_back({"input", seq_length * batch_size * input_size}); + blocks.push_back({"initial_hidden_state", num_directions * batch_size * fAttrHiddenSize}); } - out << "std::vector<" << fType << "> fVec_" << opName << "_feedforward = std::vector<" << fType << ">(" - << seq_length * batch_size * fAttrHiddenSize << ");\n"; - + blocks.push_back({"feedforward", seq_length * batch_size * fAttrHiddenSize}); if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" - << seq_length * num_directions * batch_size * fAttrHiddenSize << ");\n"; + blocks.push_back({"hidden_state", seq_length * num_directions * batch_size * fAttrHiddenSize}); + } + + // Compute total size + size_t total_size = 0; + for (const auto &b : blocks) { + total_size += b.size; + } + + // Emit backing storage + out << "std::vector<" << fType << "> fVec_" << opName << "_buffer = std::vector<" << fType << ">(" << total_size + << ");\n"; + + // Emit pointers + std::size_t offset = 0; + for (const auto &b : blocks) { + out << fType << "* fVec_" << opName << "_" << b.name << " = fVec_" << opName << "_buffer.data() + " << offset + << ";\n"; + offset += b.size; } out << "\n"; @@ -346,7 +366,7 @@ auto ROperator_RNN::Generate(std::string OpName) -> std::string } } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_input = this->fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = 
this->fVec_" << OpName << "_input;\n"; else out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n"; out << SP << "for(size_t seq = 0; seq < " << seq_length << "; seq++) {\n"; @@ -367,7 +387,7 @@ auto ROperator_RNN::Generate(std::string OpName) -> std::string } else { if (fUseSession) out << SP << fType << " * " << OpName << "_initial_hidden_state = this->fVec_" << OpName - << "_initial_hidden_state.data();\n"; + << "_initial_hidden_state;\n"; else out << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "] = {0};\n"; @@ -385,7 +405,7 @@ auto ROperator_RNN::Generate(std::string OpName) -> std::string } if (fUseSession) - out << SP << fType << " * " << OpName << "_feedforward = this->fVec_" << OpName << "_feedforward.data();\n"; + out << SP << fType << " * " << OpName << "_feedforward = this->fVec_" << OpName << "_feedforward;\n"; else out << SP << fType << " " << OpName << "_feedforward[" << seq_length * batch_size * fAttrHiddenSize << "] = {0};\n"; @@ -395,7 +415,7 @@ auto ROperator_RNN::Generate(std::string OpName) -> std::string out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_hidden_state = this->fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = this->fVec_" << OpName << "_hidden_state;\n"; else out << SP << fType << " " << OpName << "_hidden_state[" << seq_length * num_directions * batch_size * fAttrHiddenSize << "] = {0};\n"; diff --git a/tmva/sofie/inc/TMVA/SOFIE_common.hxx b/tmva/sofie/inc/TMVA/SOFIE_common.hxx index fddc07a85fc08..8769193080b39 100644 --- a/tmva/sofie/inc/TMVA/SOFIE_common.hxx +++ b/tmva/sofie/inc/TMVA/SOFIE_common.hxx @@ -681,16 +681,6 @@ void col2im(const Dtype* data_col, const int channels, //std::cout << "finishing col2imp" << std::endl; } -// Used at the end of infer() to fill the 
return object. -template -void FillOutput(T const *arr, std::vector &out, std::size_t n) -{ - out.resize(n); - for (std::size_t i = 0; i < n; ++i) { - out[i] = arr[i]; - } -} - } // end namespace UTILITY namespace BLAS{ diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index e89e8513cf783..74e672779fcf5 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -11,14 +11,34 @@ #include "TMVA/RModel.hxx" #include "TMVA/SOFIE_common.hxx" -namespace TMVA { -namespace Experimental { -namespace SOFIE { +namespace TMVA::Experimental::SOFIE { namespace { + const std::string SP = " "; + +void ReplaceAll(std::string &str, const std::string &from, const std::string &to) +{ + size_t pos = 0; + while ((pos = str.find(from, pos)) != std::string::npos) { + str.replace(pos, from.length(), to); + pos += to.length(); + } +} + +bool IsIdentifierChar(char c) +{ + return std::isalnum(static_cast(c)) || c == '_'; } +// Get the data member name corresponding to a tensor with a given name. 
+std::string TensorMember(std::string const &name) +{ + return "tensor_" + name; +} + +} // namespace + std::underlying_type_t operator|(Options opA, Options opB) { return static_cast>(opA) | static_cast>(opB); } @@ -26,6 +46,7 @@ std::underlying_type_t operator|(std::underlying_type_t opA, O return opA | static_cast>(opB); } + std::vector RModel::GetTensorShape(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { @@ -356,7 +377,7 @@ std::string RModel::AllocateIntermediateMemory(std::span std::string typeName = ConvertTypeToString(GetTensorType(name)); code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes"; code << "\n" - << typeName << "* tensor_" << name << " = reinterpret_cast<" << typeName + << typeName << "* " << TensorMember(name) << " = reinterpret_cast<" << typeName << "*>(fIntermediateMemoryPool.data() + " << location << ");\n"; }; @@ -714,7 +735,8 @@ std::string GenerateConstantTensorCode(const std::pair fTensor_" << t.first << " = "; if (sameData) @@ -722,7 +744,7 @@ std::string GenerateConstantTensorCode(const std::pair(i); - fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; + fConstantTensorSize += length * sizeof(float); } else if (i.second.type() == ETensorType::INT64) { fGC += GenerateConstantTensorCode(i); - fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 8; + fConstantTensorSize += length * sizeof(int64_t); } } else { // case of tensors which are read from a file - size_t length = ConvertShapeToLength(i.second.shape()); if (i.second.type() == ETensorType::FLOAT) { fGC += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - fWeightsTensorSize += ConvertShapeToLength(i.second.shape()) * 4; + fGC += "float * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + 
fWeightsTensorSize += length * sizeof(float); } } } @@ -774,7 +796,7 @@ void RModel::GenerateIntermediateTensorInfo() { bool is_alias = (IsAliasTensor(i.first)); if (i.second.type == ETensorType::BOOL && !is_alias) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; - tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + tensor_declaration_block += "std::uint8_t * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; continue; } bool is_extended = (fOptimizationLevel == OptimizationLevel::kExtended); @@ -788,22 +810,22 @@ void RModel::GenerateIntermediateTensorInfo() { if (i.second.type == ETensorType::FLOAT) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + tensor_declaration_block += "float * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; fOtherTensorSize += 4 * length; } else if (i.second.type == ETensorType::DOUBLE) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + tensor_declaration_block += "double * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; fOtherTensorSize += 8 * length; } else if (i.second.type == ETensorType::INT64) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + tensor_declaration_block += "int64_t * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; fOtherTensorSize += 8 * length; } } if (is_alias) { - 
tensor_declaration_block += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; + tensor_declaration_block += ConvertTypeToString(i.second.type) + " * " + TensorMember(i.first) + " = nullptr;\n"; } } @@ -816,7 +838,7 @@ void RModel::GenerateIntermediateTensorInfo() { if (!fDynamicTensorInfos.empty()) { fGC += "//--- declare the dynamic tensors\n"; for (auto &i : fDynamicTensorInfos) { - fGC += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; + fGC += ConvertTypeToString(i.second.type) + " * " + TensorMember(i.first) + " = nullptr;\n"; } fGC += "//--- dynamic tensors pool\n"; fGC += "std::vector fDynamicMemoryPool;\n"; @@ -862,9 +884,10 @@ void RModel::GenerateDynamicTensorInfo() auto op_ptr = op.get(); std::cout << "Looping on operator " << op_index << " " << typeid(*op_ptr).name() << std::endl; } - // check if is a dynamic tensor and not an alias tensor + // check if is a dynamic tensor and not an alias tensor or output tensor std::string name = std::string(it); - if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name)) { + if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name) + && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end()) { auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); auto type = GetTensorType(name); size_t type_size = GetTypeSize(type); @@ -901,6 +924,7 @@ void RModel::GenerateDynamicTensorInfo() bool missingTensor = false; for (auto &i : fDynamicTensorInfos) { if (IsAliasTensor(i.first)) continue; + if (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) != fOutputTensorNames.end()) continue; if (std::find(tensors.begin(), tensors.end(), std::pair{i.first, i.second.type}) == tensors.end()) { std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl; missingTensor = true; @@ -912,6 +936,83 @@ 
void RModel::GenerateDynamicTensorInfo() fGC += out.str(); } +/// Check if a given parameter is used for the shape of an input tensor. +bool RModel::IsInputTensorShapeParam(std::string const &paramName) const +{ + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + if (d.param == paramName) + return true; + } + } + } + return false; +} + +/// Collects all identifiers starting with "tensor_" in the input code, +/// provided that the occurrence is not immediately preceded by a +/// character that is valid in a C++ identifier. Excludes input and output tensor names. +/// Returns a deduplicated std::vector. +std::vector<std::string> RModel::CollectTensorMemberNames(const std::string &input) +{ + const std::string target = "tensor_"; + + std::vector<std::string> result; + + for (size_t i = 0; i < input.size();) { + + bool doCollect = false; + + if (i + target.size() <= input.size() && input.compare(i, target.size(), target) == 0 && + (i == 0 || !IsIdentifierChar(input[i - 1]))) { + + doCollect = true; + + std::size_t j = i + target.size(); + + // Extend to full identifier + while (j < input.size() && IsIdentifierChar(input[j])) + ++j; + + std::string fullName = input.substr(i, j - i); + + // Exclude input tensor names + for (std::string const &name : fInputTensorNames) { + if (fullName == target + name) { + doCollect = false; + break; + } + } + + // Exclude output tensor names + if (doCollect) { + for (std::string const &name : fOutputTensorNames) { + if (fullName == target + name) { + doCollect = false; + break; + } + } + } + + if (doCollect) { + result.push_back(fullName); + } + + i = j; // advance past the identifier + } else { + ++i; + } + } + + // Deduplicate (order not preserved) + std::sort(result.begin(), result.end()); + result.erase(std::unique(result.begin(), result.end()), result.end()); + + return result; +} + std::string RModel::GenerateInferSignature(bool isdecl) { // generate the infer
signature given the inputs: eg. "float * tensor1, float * tensor2" // if (decl = false) generate only calling signature (tensor1,tensor2,....) @@ -1004,8 +1105,24 @@ void RModel::GenerateOutput() if (!doInferArgs.empty()) doInferArgs += ","; for (std::string const &name : fOutputTensorNames) { - fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > output_tensor_" + name + ";\n"; - doInferArgs += " output_tensor_" + name + ","; + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + std::string n; + if(!isDynamic) { + n = std::to_string(ConvertShapeToLength(GetTensorShape(name))); + } else { + n = memberNameForDimShape(ConvertDimShapeToLength(GetDynamicTensorShape(name))); + } + std::string outputName = "output_tensor_" + name; + fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > " + outputName + "(" + n + ");\n"; + doInferArgs += " " + outputName + ".data(),"; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param)) { + fGC += SP + "size_t " + dim.param + " = 0;\n"; + doInferArgs += " " + dim.param + ","; + } + } + } } if (!doInferArgs.empty()) doInferArgs.back() = ' '; @@ -1031,7 +1148,21 @@ void RModel::GenerateOutput() } } - fGC += SP + "doInfer(" + doInferArgs + ");\n"; + if (fUseSession) { + fGC += SP + "doInfer(*this, " + doInferArgs + ");\n"; + } else { + fGC += SP + "doInfer(" + doInferArgs + ");\n"; + } + + // If the output tensors have dynamic sizes, now is the time to set them + for (std::string const &name : fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + if (isDynamic) { + std::string outputName = "output_tensor_" + name; + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + fGC += " " + outputName + ".resize(" + tensor_size + ");\n"; + } + } fGC += SP + "return {"; for (size_t i = 0; i < fOutputTensorNames.size(); i++) { @@ -1045,23 +1176,43 @@ void RModel::GenerateOutput() void 
RModel::GenerateSessionCode() { + std::string sessionName = !fIsSubGraph ? "Session" : "Session_" + fName; + + if (fUseSession && !fIsGNNComponent) { + // forward declare session struct + fGC += "struct " + sessionName + ";\n"; + } + // Determine the signature of the actual inference function std::string doInferSignature = GenerateInferSignature(); if (!doInferSignature.empty()) doInferSignature += ", "; for (auto const &name : fOutputTensorNames) { - doInferSignature += " std::vector<" + typeForOutput(GetTensorType(name)) + "> &output_tensor_" + name + ","; + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + doInferSignature += typeForOutput(GetTensorType(name)) + " *tensor_" + name + ","; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param)) + doInferSignature += " size_t &" + dim.param + "_output,"; + } + } } doInferSignature.back() = ' '; - doInferSignature = "void doInfer(" + doInferSignature + ")"; + if (fUseSession) { + doInferSignature = sessionName + " const &session, " + doInferSignature; + } + + doInferSignature = "inline void doInfer(" + doInferSignature + ")"; + + if (!fIsGNNComponent) { + // forward declare inference implementation + fGC += doInferSignature + ";\n"; + } // define the Session struct (for GNN this is generated in RModel_GNN) if (fUseSession && !fIsGNNComponent) { - if (!fIsSubGraph) - fGC += "struct Session {\n"; - else - fGC += "struct Session_" + fName + " {\n"; + fGC += "struct " + sessionName + " {\n"; } // generate code for declaring the initialized tensors @@ -1115,9 +1266,6 @@ void RModel::GenerateSessionCode() // Generate code for Session constructor if (fUseSession) { - std::string sessionName = "Session"; - if (fIsSubGraph) - sessionName += "_" + fName; // add here specific operator code that needs to define session data members fGC += "\n"; for (size_t id = 0; id < fOperators.size(); id++) { @@ -1179,7 +1327,15 @@ void 
RModel::GenerateSessionCode() fGC += "}\n\n"; } - fGC += doInferSignature + "{\n"; + // generate the inference overload that returns an output struct + GenerateOutput(); + + // end of session + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n\n"; + } + + fGC += doInferSignature + " {\n"; fGC += "\n"; // generate the inference code @@ -1189,32 +1345,47 @@ void RModel::GenerateSessionCode() if (fOutputTensorNames.size() == 0) throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + std::string allOperatorCode; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { if (fVerbose) std::cout << "Generating code for operator .... " << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + std::string operatorCode = fOperators[op_idx]->Generate(std::to_string(op_idx)); + allOperatorCode += operatorCode; } - fGC += SP + "using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; + // If the generated code uses members of the session struct, use the + // local variable name that we're using for the session: + ReplaceAll(allOperatorCode, "this->", "session."); - for (std::string const &name : fOutputTensorNames) { - // need to check is size is the same (don't want to return a vector with - // larger size) in that case better to copy - bool isIntermediate = fIntermediateTensorInfos.count(name) > 0; - std::string n = isIntermediate ? std::to_string(ConvertShapeToLength(GetTensorShape(name))) : ConvertDimShapeToLength(GetDimTensorShape(name)); - fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + if (fUseSession && !fIsGNNComponent) { + // Collect all "tensor_*" data members that are not input or output tensors + std::vector<std::string> tensorMemberNames = CollectTensorMemberNames(allOperatorCode); + for (auto const& name: tensorMemberNames) { + fGC += " auto &" + name + " = session."
+ name + ";\n"; + } + fGC += "\n"; } - fGC += "}\n\n"; - - // generate the inference overload that returns an output struct - GenerateOutput(); + fGC += allOperatorCode; - // end of session - if (fUseSession && !fIsGNNComponent) { - fGC += "}; // end of Session\n\n"; + for (auto const& name: fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param)) + fGC += " " + dim.param + "_output = " + dim.param + ";\n"; + } + } + if(IsConstantTensor(name)) { + std::string t = "session.tensor_" + name; + size_t length = ConvertShapeToLength(fInitializedTensors[name].shape()); + fGC += " std::copy(" + t + ", " + t + " + " + std::to_string(length) + ", tensor_" + name + ");\n"; + } } + fGC += "\n"; + + fGC += "}\n"; } void RModel::Generate(std::underlying_type_t<Options> options, int batchSize, long pos, bool verbose) @@ -1648,6 +1819,4 @@ void RModel::Streamer(TBuffer &R__b) { } } -}//SOFIE -}//Experimental -}//TMVA +} // namespace TMVA::Experimental::SOFIE diff --git a/tmva/sofie/test/CMakeLists.txt b/tmva/sofie/test/CMakeLists.txt index 1a9295237cff1..120b5800b2f8d 100644 --- a/tmva/sofie/test/CMakeLists.txt +++ b/tmva/sofie/test/CMakeLists.txt @@ -48,14 +48,20 @@ ROOTTEST_ADD_TEST(SofieCompileModels_ONNX # Creating a Google Test if (BLAS_FOUND) # we need BLAS for compiling the models ROOT_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx - LIBRARIES - Core - GTest::gtest - GTest::gtest_main + LIBRARIES Core GTest::gtest GTest::gtest_main ) ROOTTEST_ADD_TEST(TestCustomModelsFromONNX EXEC ./TestCustomModelsFromONNX FIXTURES_REQUIRED sofie-compile-models-onnx) + + if (clad) + ROOT_EXECUTABLE(TestCladAutodiff TestCladAutodiff.cxx + LIBRARIES Core GTest::gtest GTest::gtest_main + ) + ROOTTEST_ADD_TEST(TestCladAutodiff + EXEC ./TestCladAutodiff + FIXTURES_REQUIRED sofie-compile-models-onnx) + endif() endif() # For testing
serialisation of RModel object diff --git a/tmva/sofie/test/TestCladAutodiff.cxx b/tmva/sofie/test/TestCladAutodiff.cxx new file mode 100644 index 0000000000000..bdd040e4ea03f --- /dev/null +++ b/tmva/sofie/test/TestCladAutodiff.cxx @@ -0,0 +1,111 @@ +constexpr auto modelHeaderSuffix = "_FromONNX.hxx"; +constexpr auto modelDataSuffix = "_FromONNX.dat"; +#include "test_helpers.h" + +#include "input_models/references/Linear_16.ref.hxx" + +#include "gtest/gtest.h" + +// Test differentiating a fully-connected neural network with Clad. +// Extension of the ONNX.Linear16 test in TestCustomModelsFromONNX.cxx +TEST(ONNXClad, Linear16) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(1600); + std::fill_n(input.data(), input.size(), 1.0f); + + ASSERT_INCLUDE_AND_RUN(std::vector, "Linear_16", input); + + gInterpreter->Declare(R"( +#include + +float Linear_16_wrapper(TMVA_SOFIE_Linear_16::Session const &session, float const *input) +{ + float out[160]{}; + float output_sum = 0.0; + + TMVA_SOFIE_Linear_16::doInfer(session, input, out); + + for (std::size_t i = 0; i < std::size(out); ++i) { + output_sum += out[i]; + } + return output_sum; +} + +float Linear_16_outer_wrapper(TMVA_SOFIE_Linear_16::Session const &session, float const *input) +{ + return Linear_16_wrapper(session, input); +} + +float Linear_16_wrapper_num_diff(TMVA_SOFIE_Linear_16::Session const &session, float *input, std::size_t i) +{ + const float origVal = input[i]; + + const float eps = 1e-3; + input[i] = origVal - eps; + float funcValDown = Linear_16_wrapper(session, input); + input[i] = origVal + eps; + float funcValUp = Linear_16_wrapper(session, input); + input[i] = origVal; + + return (funcValUp - funcValDown) / (2 * eps); +} + )"); + + auto inputInterp = toInterpreter(input, "std::vector", true); + + // Why do we have two wrappers, the <>_wrapper and the <>_outer_wrapper? 
+ // This is because we are not interested in the created gradient function. + // We are interested in the more low-level *pullback* function, which takes + // also the data structures for the reverse pass as function arguments. Like + // this, we can initialize the session for the backward pass once and re-use + // it. The trick to get the wrapper pullback is to create another wrapper + // around the wrapper, and creating the gradient for the outer wrapper + // implicitly creates the pullback for the inner wrapper. + gInterpreter->ProcessLine("clad::gradient(Linear_16_outer_wrapper, \"input\");"); + + // Create two session data structures: one for the forward, and one for the backward pass + gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::Session session_linear_16{\"Linear_16_FromONNX.dat\"};"); + gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::Session _d_session_linear_16{\"Linear_16_FromONNX.dat\"};"); + + gInterpreter->ProcessLine("float grad_output[1600]{};"); + gInterpreter->ProcessLine( + ("Linear_16_wrapper_pullback(session_linear_16, " + inputInterp + ", 1, &_d_session_linear_16, grad_output)") + .c_str()); + + // If you want to see the gradient code: + // gInterpreter->ProcessLine("static_cast(Linear_16_outer_wrapper_grad_1)"); gInterpreter->ProcessLine("Linear_16_wrapper_pullback"); + // gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::doInfer_reverse_forw"); + // gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::doInfer_pullback"); + + auto retVal = gInterpreter->ProcessLine((R"( + double maxDiff = 0; + for (std::size_t i = 0; i < std::size(grad_output); ++i) { + double val = grad_output[i]; + double ref = Linear_16_wrapper_num_diff(session_linear_16, )" + + inputInterp + R"(, i); + if (val != ref) { + maxDiff = std::max(std::abs(val - ref), maxDiff); + } + } + double tol = 0.0025; + // the "return" value + (maxDiff < tol); + )") + .c_str()); + + EXPECT_EQ(retVal, 1) << "The gradient from Clad and the numeric gradient didn't match within tolerance."; + 
+ // Checking output size + EXPECT_EQ(output.size(), sizeof(Linear_16_ExpectedOutput::all_ones) / sizeof(float)); + + float *correct = Linear_16_ExpectedOutput::all_ones; + + // Checking every output value, one by one + for (size_t i = 0; i < output.size(); ++i) { + EXPECT_LE(std::abs(output[i] - correct[i]), TOLERANCE); + } +}