diff --git a/tmva/sofie/inc/TMVA/RModel.hxx b/tmva/sofie/inc/TMVA/RModel.hxx index bc6493090f74e..3395de6d34166 100644 --- a/tmva/sofie/inc/TMVA/RModel.hxx +++ b/tmva/sofie/inc/TMVA/RModel.hxx @@ -35,8 +35,6 @@ private: std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order - - std::vector> fOperators; std::vector> fSubGraphs; /// CollectTensorMemberNames(const std::string &input); public: const std::vector & GetInputTensorNames() const { return fInputTensorNames; } diff --git a/tmva/sofie/inc/TMVA/ROperator.hxx b/tmva/sofie/inc/TMVA/ROperator.hxx index 6fac7958f8f9d..f0afd9c4374c1 100644 --- a/tmva/sofie/inc/TMVA/ROperator.hxx +++ b/tmva/sofie/inc/TMVA/ROperator.hxx @@ -25,9 +25,7 @@ public: virtual std::vector TypeInference(std::vector) { return {}; }; virtual void Initialize(RModel&) = 0; virtual std::string Generate(std::string OpName) = 0; //expect unique opName for each operator within the same RModel - // generate code for Session constructor before tensor allocation - virtual std::string GenerateSessionCtorCode() { return "";} - // generate initialization code for session constructor after tensor allocations + // generate initialization code for session constructor virtual std::string GenerateInitCode() { return "";} // generate some specific declaration code for Session virtual std::string GenerateDeclCode() { return "";} diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index ecdd0b435fe37..83381baa39f0c 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -393,9 +393,12 @@ namespace SOFIE{ << (fAttrTransB ? "true, " : "false, ") << (fAttrTransA ? 
"true, " : "false, ") << n << ", " << m << ", " << k << ", "; - out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + // TODO: the cast to (float *) is not needed here from the C++ language perspective (the arguments to + // Gemm_Call are const already), but Clad bug https://github.com/vgvassilev/clad/issues/1721 is requiring + // us to do this cast to keep Clad working. Remove this hack once the Clad issue is fixed. + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", (float*)tensor_" << fNB; if (extraB) out << " + " << opName << "_B_offset"; - out << ", tensor_" << fNA; + out << ", (float*)tensor_" << fNA; // TODO: same here if (extraA) out << " + " << opName << "_A_offset"; out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; // in the case of bias and no broadcasting needed diff --git a/tmva/sofie/inc/TMVA/ROperator_LSTM.hxx b/tmva/sofie/inc/TMVA/ROperator_LSTM.hxx index 84f37bc57da7e..ae0ee70c4eeea 100644 --- a/tmva/sofie/inc/TMVA/ROperator_LSTM.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_LSTM.hxx @@ -390,45 +390,62 @@ std::string ROperator_LSTM::GenerateSessionMembersCode(std::string opName) size_t batch_size = (fAttrLayout == 0) ? 
fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + struct Block { + std::string name; + size_t size; + }; + + std::vector blocks; + + size_t ff_size = seq_length * batch_size * fAttrHiddenSize; + size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize; + + // Layout-dependent buffers if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + blocks.push_back({"input", seq_length * batch_size * input_size}); + blocks.push_back({"initial_hidden_state", num_directions * batch_size * fAttrHiddenSize}); + blocks.push_back({"initial_cell_state", num_directions * batch_size * fAttrHiddenSize}); } - // Set the feedforward - size_t ff_size = seq_length * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_ff_input_gate = std::vector<" << fType << ">(" << ff_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_ff_output_gate = std::vector<" << fType << ">(" << ff_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_ff_cell_gate = std::vector<" << fType << ">(" << ff_size - << ");\n"; + + // Feedforward gates + blocks.push_back({"ff_input_gate", ff_size}); + blocks.push_back({"ff_output_gate", ff_size}); + blocks.push_back({"ff_cell_gate", ff_size}); if (fAttrInputForget == 0) - out << "std::vector<" << fType << "> fVec_" << opName << "_ff_forget_gate = std::vector<" << fType << ">(" - << ff_size << ");\n"; - // gate results - size_t hs_size = seq_length * num_directions * batch_size * 
fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_input_gate = std::vector<" << fType << ">(" << hs_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_output_gate = std::vector<" << fType << ">(" << hs_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_cell_gate = std::vector<" << fType << ">(" << hs_size - << ");\n"; + blocks.push_back({"ff_forget_gate", ff_size}); + + // Gate outputs + blocks.push_back({"input_gate", hs_size}); + blocks.push_back({"output_gate", hs_size}); + blocks.push_back({"cell_gate", hs_size}); if (fAttrInputForget == 0) - out << "std::vector<" << fType << "> fVec_" << opName << "_forget_gate = std::vector<" << fType << ">(" << hs_size - << ");\n"; - // cell state - out << "std::vector<" << fType << "> fVec_" << opName << "_cell_state = std::vector<" << fType << ">(" << hs_size - << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_new_cell_state = std::vector<" << fType << ">(" << hs_size - << ");\n"; - // hiddden state + blocks.push_back({"forget_gate", hs_size}); + + // Cell state + blocks.push_back({"cell_state", hs_size}); + blocks.push_back({"new_cell_state", hs_size}); + + // Hidden state (conditional) if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" - << hs_size << ");\n"; + blocks.push_back({"hidden_state", hs_size}); + } + + // Compute total size + size_t total_size = 0; + for (const auto &b : blocks) { + total_size += b.size; + } + + // Backing storage + out << "std::vector<" << fType << "> fVec_" << opName << "_buffer = std::vector<" << fType << ">(" << total_size + << ");\n"; + + // Emit pointers + std::size_t offset = 0; + for (const auto &b : blocks) { + out << fType << "* fVec_" << opName << "_" << b.name << " = fVec_" << opName << "_buffer.data() + " << offset + << ";\n"; + offset += b.size; } out << "\n"; @@ -452,7 +469,7 @@ 
auto ROperator_LSTM::Generate(std::string OpName) -> std::string out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_input = this->fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = this->fVec_" << OpName << "_input;\n"; else out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "] = {0};\n"; @@ -470,11 +487,11 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string // Set the initial hidden state if (!fNInitial_h.empty()) { if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_initial_hidden_state = " << " tensor_" << fNInitial_h << ";\n"; + out << SP << fType << " const*" << OpName << "_initial_hidden_state = " << " tensor_" << fNInitial_h << ";\n"; } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_initial_hidden_state = this->fVec_" << OpName - << "_initial_hidden_state.data();\n"; + out << SP << fType << " const* " << OpName << "_initial_hidden_state = this->fVec_" << OpName + << "_initial_hidden_state;\n"; else out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "] = {0};\n"; @@ -494,11 +511,11 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string // Set the initial cell state if (!fNInitial_c.empty()) { if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_initial_cell_state = " << " tensor_" << fNInitial_c << ";\n"; + out << SP << fType << " const*" << OpName << "_initial_cell_state = " << " tensor_" << fNInitial_c << ";\n"; } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_initial_cell_state = this->fVec_" << OpName - << "_initial_cell_state.data();\n"; + out << SP << fType << " const* " << OpName << "_initial_cell_state = this->fVec_" << OpName + << "_initial_cell_state;\n"; else out << SP << fType << " " << OpName << 
"_initial_cell_state[" << num_directions * batch_size * fAttrHiddenSize << "] = {0};\n"; @@ -518,12 +535,12 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string // Set the feedforward size_t ff_size = seq_length * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_ff_input_gate = this->fVec_" << OpName << "_ff_input_gate.data();\n"; - out << SP << fType << " * " << OpName << "_ff_output_gate = this->fVec_" << OpName << "_ff_output_gate.data();\n"; - out << SP << fType << " * " << OpName << "_ff_cell_gate = this->fVec_" << OpName << "_ff_cell_gate.data();\n"; + out << SP << fType << " * " << OpName << "_ff_input_gate = this->fVec_" << OpName << "_ff_input_gate;\n"; + out << SP << fType << " * " << OpName << "_ff_output_gate = this->fVec_" << OpName << "_ff_output_gate;\n"; + out << SP << fType << " * " << OpName << "_ff_cell_gate = this->fVec_" << OpName << "_ff_cell_gate;\n"; if (fAttrInputForget == 0) { out << SP << fType << " * " << OpName << "_ff_forget_gate = this->fVec_" << OpName - << "_ff_forget_gate.data();\n"; + << "_ff_forget_gate;\n"; } } else { out << SP << fType << " " << OpName << "_ff_input_gate[" << ff_size << "] = {0};\n"; @@ -536,11 +553,11 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string // Set the gates size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_input_gate = this->fVec_" << OpName << "_input_gate.data();\n"; - out << SP << fType << " * " << OpName << "_output_gate = this->fVec_" << OpName << "_output_gate.data();\n"; - out << SP << fType << " * " << OpName << "_cell_gate = this->fVec_" << OpName << "_cell_gate.data();\n"; + out << SP << fType << " * " << OpName << "_input_gate = this->fVec_" << OpName << "_input_gate;\n"; + out << SP << fType << " * " << OpName << "_output_gate = this->fVec_" << OpName << "_output_gate;\n"; + out << SP << fType << " * " << OpName 
<< "_cell_gate = this->fVec_" << OpName << "_cell_gate;\n"; if (fAttrInputForget == 0) { - out << SP << fType << " * " << OpName << "_forget_gate = this->fVec_" << OpName << "_forget_gate.data();\n"; + out << SP << fType << " * " << OpName << "_forget_gate = this->fVec_" << OpName << "_forget_gate;\n"; } } else { out << SP << fType << " " << OpName << "_input_gate[" << hidden_state_size << "] = {0};\n"; @@ -552,8 +569,8 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string } // Set the cell state and the new cell state = h(cell state) if (fUseSession) { - out << SP << fType << " * " << OpName << "_cell_state = this->fVec_" << OpName << "_cell_state.data();\n"; - out << SP << fType << " * " << OpName << "_new_cell_state = this->fVec_" << OpName << "_new_cell_state.data();\n"; + out << SP << fType << " * " << OpName << "_cell_state = this->fVec_" << OpName << "_cell_state;\n"; + out << SP << fType << " * " << OpName << "_new_cell_state = this->fVec_" << OpName << "_new_cell_state;\n"; } else { out << SP << fType << " " << OpName << "_cell_state[" << hidden_state_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_new_cell_state[" << hidden_state_size << "] = {0};\n"; @@ -564,7 +581,7 @@ auto ROperator_LSTM::Generate(std::string OpName) -> std::string out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_hidden_state = this->fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = this->fVec_" << OpName << "_hidden_state;\n"; } else { out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n"; } diff --git a/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx b/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx index fdf04665e0315..8587035f8d44b 100644 --- a/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_NonZero.hxx @@ -92,8 +92,8 @@ public: 
fShapeY.resize(2); fShapeY[0] = fShapeX.size(); - // flag -1 to define the shape variable in the constructor code and not in the constructor signature - fShapeY[1] = Dim{std::string("v_NonZero_") + fNX, static_cast(-1) }; + // identify as -1 since we will declare maximum as size of input + fShapeY[1] = Dim{std::string("v_NonZero_") + fNX, static_cast(-1)}; model.AddIntermediateTensor(fNY, ETensorType::INT64, fShapeY); if (model.Verbose()) { @@ -101,16 +101,16 @@ public: } } } - std::string GenerateSessionCtorCode() override { + + std::string GenerateSessionMembersCode(std::string /*opName*/) override { if (fIsOutputConstant) return ""; // define output value used as max non zero with max size = input shape * N auto inputLength = ConvertDimShapeToLength(fShapeX); std::stringstream out; - out << SP << "size_t v_NonZero_" << fNX << " = " << inputLength << ";\n"; + out << SP << "size_t fV_NonZero_" << fNX << " = " << inputLength << ";\n"; return out.str(); } - std::string Generate(std::string opName) override { if (fIsOutputConstant) { return ""; diff --git a/tmva/sofie/inc/TMVA/ROperator_RNN.hxx b/tmva/sofie/inc/TMVA/ROperator_RNN.hxx index 0667047eed228..f385a502d4077 100644 --- a/tmva/sofie/inc/TMVA/ROperator_RNN.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_RNN.hxx @@ -308,18 +308,38 @@ std::string ROperator_RNN::GenerateSessionMembersCode(std::string opName) size_t batch_size = (fAttrLayout == 0) ? 
fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + struct Block { + std::string name; + size_t size; + }; + + std::vector blocks; + if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + blocks.push_back({"input", seq_length * batch_size * input_size}); + blocks.push_back({"initial_hidden_state", num_directions * batch_size * fAttrHiddenSize}); } - out << "std::vector<" << fType << "> fVec_" << opName << "_feedforward = std::vector<" << fType << ">(" - << seq_length * batch_size * fAttrHiddenSize << ");\n"; - + blocks.push_back({"feedforward", seq_length * batch_size * fAttrHiddenSize}); if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" - << seq_length * num_directions * batch_size * fAttrHiddenSize << ");\n"; + blocks.push_back({"hidden_state", seq_length * num_directions * batch_size * fAttrHiddenSize}); + } + + // Compute total size + size_t total_size = 0; + for (const auto &b : blocks) { + total_size += b.size; + } + + // Emit backing storage + out << "std::vector<" << fType << "> fVec_" << opName << "_buffer = std::vector<" << fType << ">(" << total_size + << ");\n"; + + // Emit pointers + std::size_t offset = 0; + for (const auto &b : blocks) { + out << fType << "* fVec_" << opName << "_" << b.name << " = fVec_" << opName << "_buffer.data() + " << offset + << ";\n"; + offset += b.size; } out << "\n"; @@ -346,7 +366,7 @@ auto ROperator_RNN::Generate(std::string OpName) -> std::string } } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_input = this->fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = 
this->fVec_" << OpName << "_input;\n"; else out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n"; out << SP << "for(size_t seq = 0; seq < " << seq_length << "; seq++) {\n"; @@ -367,7 +387,7 @@ auto ROperator_RNN::Generate(std::string OpName) -> std::string } else { if (fUseSession) out << SP << fType << " * " << OpName << "_initial_hidden_state = this->fVec_" << OpName - << "_initial_hidden_state.data();\n"; + << "_initial_hidden_state;\n"; else out << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "] = {0};\n"; @@ -385,7 +405,7 @@ auto ROperator_RNN::Generate(std::string OpName) -> std::string } if (fUseSession) - out << SP << fType << " * " << OpName << "_feedforward = this->fVec_" << OpName << "_feedforward.data();\n"; + out << SP << fType << " * " << OpName << "_feedforward = this->fVec_" << OpName << "_feedforward;\n"; else out << SP << fType << " " << OpName << "_feedforward[" << seq_length * batch_size * fAttrHiddenSize << "] = {0};\n"; @@ -395,7 +415,7 @@ auto ROperator_RNN::Generate(std::string OpName) -> std::string out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) - out << SP << fType << " * " << OpName << "_hidden_state = this->fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = this->fVec_" << OpName << "_hidden_state;\n"; else out << SP << fType << " " << OpName << "_hidden_state[" << seq_length * num_directions * batch_size * fAttrHiddenSize << "] = {0};\n"; diff --git a/tmva/sofie/inc/TMVA/SOFIE_common.hxx b/tmva/sofie/inc/TMVA/SOFIE_common.hxx index fddc07a85fc08..8769193080b39 100644 --- a/tmva/sofie/inc/TMVA/SOFIE_common.hxx +++ b/tmva/sofie/inc/TMVA/SOFIE_common.hxx @@ -681,16 +681,6 @@ void col2im(const Dtype* data_col, const int channels, //std::cout << "finishing col2imp" << std::endl; } -// Used at the end of infer() to fill the 
return object. -template -void FillOutput(T const *arr, std::vector &out, std::size_t n) -{ - out.resize(n); - for (std::size_t i = 0; i < n; ++i) { - out[i] = arr[i]; - } -} - } // end namespace UTILITY namespace BLAS{ diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 0e6a7d7cf9bbf..74e672779fcf5 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -11,14 +11,34 @@ #include "TMVA/RModel.hxx" #include "TMVA/SOFIE_common.hxx" -namespace TMVA { -namespace Experimental { -namespace SOFIE { +namespace TMVA::Experimental::SOFIE { namespace { + const std::string SP = " "; + +void ReplaceAll(std::string &str, const std::string &from, const std::string &to) +{ + size_t pos = 0; + while ((pos = str.find(from, pos)) != std::string::npos) { + str.replace(pos, from.length(), to); + pos += to.length(); + } +} + +bool IsIdentifierChar(char c) +{ + return std::isalnum(static_cast(c)) || c == '_'; } +// Get the data member name corresponding to a tensor with a given name. 
+std::string TensorMember(std::string const &name) +{ + return "tensor_" + name; +} + +} // namespace + std::underlying_type_t operator|(Options opA, Options opB) { return static_cast>(opA) | static_cast>(opB); } @@ -26,6 +46,7 @@ std::underlying_type_t operator|(std::underlying_type_t opA, O return opA | static_cast>(opB); } + std::vector RModel::GetTensorShape(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { @@ -356,7 +377,7 @@ std::string RModel::AllocateIntermediateMemory(std::span std::string typeName = ConvertTypeToString(GetTensorType(name)); code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes"; code << "\n" - << typeName << "* tensor_" << name << " = reinterpret_cast<" << typeName + << typeName << "* " << TensorMember(name) << " = reinterpret_cast<" << typeName << "*>(fIntermediateMemoryPool.data() + " << location << ");\n"; }; @@ -714,7 +735,8 @@ std::string GenerateConstantTensorCode(const std::pair fTensor_" << t.first << " = "; if (sameData) @@ -722,7 +744,7 @@ std::string GenerateConstantTensorCode(const std::pair(i); - fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; + fConstantTensorSize += length * sizeof(float); } else if (i.second.type() == ETensorType::INT64) { fGC += GenerateConstantTensorCode(i); - fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 8; + fConstantTensorSize += length * sizeof(int64_t); } } else { // case of tensors which are read from a file - size_t length = ConvertShapeToLength(i.second.shape()); if (i.second.type() == ETensorType::FLOAT) { fGC += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - fWeightsTensorSize += ConvertShapeToLength(i.second.shape()) * 4; + fGC += "float * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + 
fWeightsTensorSize += length * sizeof(float); } } } @@ -774,7 +796,7 @@ void RModel::GenerateIntermediateTensorInfo() { bool is_alias = (IsAliasTensor(i.first)); if (i.second.type == ETensorType::BOOL && !is_alias) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; - tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + tensor_declaration_block += "std::uint8_t * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; continue; } bool is_extended = (fOptimizationLevel == OptimizationLevel::kExtended); @@ -788,22 +810,22 @@ void RModel::GenerateIntermediateTensorInfo() { if (i.second.type == ETensorType::FLOAT) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + tensor_declaration_block += "float * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; fOtherTensorSize += 4 * length; } else if (i.second.type == ETensorType::DOUBLE) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + tensor_declaration_block += "double * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; fOtherTensorSize += 8 * length; } else if (i.second.type == ETensorType::INT64) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + tensor_declaration_block += "int64_t * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; fOtherTensorSize += 8 * length; } } if (is_alias) { - 
tensor_declaration_block += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; + tensor_declaration_block += ConvertTypeToString(i.second.type) + " * " + TensorMember(i.first) + " = nullptr;\n"; } } @@ -816,7 +838,7 @@ void RModel::GenerateIntermediateTensorInfo() { if (!fDynamicTensorInfos.empty()) { fGC += "//--- declare the dynamic tensors\n"; for (auto &i : fDynamicTensorInfos) { - fGC += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; + fGC += ConvertTypeToString(i.second.type) + " * " + TensorMember(i.first) + " = nullptr;\n"; } fGC += "//--- dynamic tensors pool\n"; fGC += "std::vector fDynamicMemoryPool;\n"; @@ -862,9 +884,10 @@ void RModel::GenerateDynamicTensorInfo() auto op_ptr = op.get(); std::cout << "Looping on operator " << op_index << " " << typeid(*op_ptr).name() << std::endl; } - // check if is a dynamic tensor and not an alias tensor + // check if is a dynamic tensor and not an alias tensor or output tensor std::string name = std::string(it); - if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name)) { + if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name) + && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end()) { auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); auto type = GetTensorType(name); size_t type_size = GetTypeSize(type); @@ -901,6 +924,7 @@ void RModel::GenerateDynamicTensorInfo() bool missingTensor = false; for (auto &i : fDynamicTensorInfos) { if (IsAliasTensor(i.first)) continue; + if (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) != fOutputTensorNames.end()) continue; if (std::find(tensors.begin(), tensors.end(), std::pair{i.first, i.second.type}) == tensors.end()) { std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl; missingTensor = true; @@ -912,6 +936,83 @@ 
void RModel::GenerateDynamicTensorInfo() fGC += out.str(); } +/// Check if a given parameter is used for the shape of an input tensor. +bool RModel::IsInputTensorShapeParam(std::string const ¶mName) const +{ + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + if (d.param == paramName) + return true; + } + } + } + return false; +} + +/// Collects all identifiers starting with "tensor_" in the input code, +/// provided that the occurrence is not immediately preceded by a +/// character that is valid in a C++ identifier. Excludes input and output tensor names. +/// Returns a deduplicated std::vector. +std::vector RModel::CollectTensorMemberNames(const std::string &input) +{ + const std::string target = "tensor_"; + + std::vector result; + + for (size_t i = 0; i < input.size();) { + + bool doCollect = false; + + if (i + target.size() <= input.size() && input.compare(i, target.size(), target) == 0 && + (i == 0 || !IsIdentifierChar(input[i - 1]))) { + + doCollect = true; + + std::size_t j = i + target.size(); + + // Extend to full identifier + while (j < input.size() && IsIdentifierChar(input[j])) + ++j; + + std::string fullName = input.substr(i, j - i); + + // Exclude input tensor names + for (std::string const &name : fInputTensorNames) { + if (fullName == target + name) { + doCollect = false; + break; + } + } + + // Exclude output tensor names + if (doCollect) { + for (std::string const &name : fOutputTensorNames) { + if (fullName == target + name) { + doCollect = false; + break; + } + } + } + + if (doCollect) { + result.push_back(fullName); + } + + i = j; // advance past the identifier + } else { + ++i; + } + } + + // Deduplicate (order not preserved) + std::sort(result.begin(), result.end()); + result.erase(std::unique(result.begin(), result.end()), result.end()); + + return result; +} + std::string RModel::GenerateInferSignature(bool isdecl) { // generate the infer 
signature given the inputs: eg. "float * tensor1, float * tensor2" // if (decl = false) generate only calling signature (tensor1,tensor2,....) @@ -1004,8 +1105,24 @@ void RModel::GenerateOutput() if (!doInferArgs.empty()) doInferArgs += ","; for (std::string const &name : fOutputTensorNames) { - fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > output_tensor_" + name + ";\n"; - doInferArgs += " output_tensor_" + name + ","; + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + std::string n; + if(!isDynamic) { + n = std::to_string(ConvertShapeToLength(GetTensorShape(name))); + } else { + n = memberNameForDimShape(ConvertDimShapeToLength(GetDynamicTensorShape(name))); + } + std::string outputName = "output_tensor_" + name; + fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > " + outputName + "(" + n + ");\n"; + doInferArgs += " " + outputName + ".data(),"; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param)) { + fGC += SP + "size_t " + dim.param + " = 0;\n"; + doInferArgs += " " + dim.param + ","; + } + } + } } if (!doInferArgs.empty()) doInferArgs.back() = ' '; @@ -1031,7 +1148,21 @@ void RModel::GenerateOutput() } } - fGC += SP + "doInfer(" + doInferArgs + ");\n"; + if (fUseSession) { + fGC += SP + "doInfer(*this, " + doInferArgs + ");\n"; + } else { + fGC += SP + "doInfer(" + doInferArgs + ");\n"; + } + + // If the output tensors have dynamic sizes, now is the time to set them + for (std::string const &name : fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + if (isDynamic) { + std::string outputName = "output_tensor_" + name; + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + fGC += " " + outputName + ".resize(" + tensor_size + ");\n"; + } + } fGC += SP + "return {"; for (size_t i = 0; i < fOutputTensorNames.size(); i++) { @@ -1045,23 +1176,43 @@ void RModel::GenerateOutput() void 
RModel::GenerateSessionCode() { + std::string sessionName = !fIsSubGraph ? "Session" : "Session_" + fName; + + if (fUseSession && !fIsGNNComponent) { + // forward declare session struct + fGC += "struct " + sessionName + ";\n"; + } + // Determine the signature of the actual inference function std::string doInferSignature = GenerateInferSignature(); if (!doInferSignature.empty()) doInferSignature += ", "; for (auto const &name : fOutputTensorNames) { - doInferSignature += " std::vector<" + typeForOutput(GetTensorType(name)) + "> &output_tensor_" + name + ","; + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + doInferSignature += typeForOutput(GetTensorType(name)) + " *tensor_" + name + ","; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param)) + doInferSignature += " size_t &" + dim.param + "_output,"; + } + } } doInferSignature.back() = ' '; - doInferSignature = "void doInfer(" + doInferSignature + ")"; + if (fUseSession) { + doInferSignature = sessionName + " const &session, " + doInferSignature; + } + + doInferSignature = "inline void doInfer(" + doInferSignature + ")"; + + if (!fIsGNNComponent) { + // forward declare inference implementation + fGC += doInferSignature + ";\n"; + } // define the Session struct (for GNN this is generated in RModel_GNN) if (fUseSession && !fIsGNNComponent) { - if (!fIsSubGraph) - fGC += "struct Session {\n"; - else - fGC += "struct Session_" + fName + " {\n"; + fGC += "struct " + sessionName + " {\n"; } // generate code for declaring the initialized tensors @@ -1115,9 +1266,6 @@ void RModel::GenerateSessionCode() // Generate code for Session constructor if (fUseSession) { - std::string sessionName = "Session"; - if (fIsSubGraph) - sessionName += "_" + fName; // add here specific operator code that needs to define session data members fGC += "\n"; for (size_t id = 0; id < fOperators.size(); id++) { @@ -1152,11 +1300,6 @@ void 
RModel::GenerateSessionCode() } fGC += ") {\n"; - // add some code required in session constructor - for (size_t id = 0; id < fOperators.size(); id++) { - fGC += fOperators[id]->GenerateSessionCtorCode(); - } - // initializing dynamic parameters if (!fDimShapeNames.empty()) { fGC += "\n\n"; @@ -1184,7 +1327,15 @@ void RModel::GenerateSessionCode() fGC += "}\n\n"; } - fGC += doInferSignature + "{\n"; + // generate the inference overload that returns an output struct + GenerateOutput(); + + // end of session + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n\n"; + } + + fGC += doInferSignature + " {\n"; fGC += "\n"; // generate the inference code @@ -1194,32 +1345,47 @@ void RModel::GenerateSessionCode() if (fOutputTensorNames.size() == 0) throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + std::string allOperatorCode; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { if (fVerbose) std::cout << "Generating code for operator .... " << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + std::string operatorCode = fOperators[op_idx]->Generate(std::to_string(op_idx)); + allOperatorCode += operatorCode; } - fGC += SP + "using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; + // If the generated code users members of the session struct, use the + // local variable name that we're using for the session: + ReplaceAll(allOperatorCode, "this->", "session."); - for (std::string const &name : fOutputTensorNames) { - // need to check is size is the same (don't want to return a vector with - // larger size) in that case better to copy - bool isIntermediate = fIntermediateTensorInfos.count(name) > 0; - std::string n = isIntermediate ? 
std::to_string(ConvertShapeToLength(GetTensorShape(name))) - : ConvertDimShapeToLength(GetDimTensorShape(name)); - fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + if (fUseSession && !fIsGNNComponent) { + // Collect all "tensor_*" data members that are not input or output tensors + std::vector tensorMemberNames = CollectTensorMemberNames(allOperatorCode); + for (auto const& name: tensorMemberNames) { + fGC += " auto &" + name + " = session." + name + ";\n"; + } + fGC += "\n"; } - fGC += "}\n\n"; - - // generate the inference overload that returns an output struct - GenerateOutput(); + fGC += allOperatorCode; - // end of session - if (fUseSession && !fIsGNNComponent) { - fGC += "}; // end of Session\n\n"; + for (auto const& name: fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param)) + fGC += " " + dim.param + "_output = " + dim.param + ";\n"; + } + } + if(IsConstantTensor(name)) { + std::string t = "session.tensor_" + name; + size_t length = ConvertShapeToLength(fInitializedTensors[name].shape()); + fGC += " std::copy(" + t + ", " + t + " + " + std::to_string(length) + ", tensor_" + name + ");\n"; + } } + fGC += "\n"; + + fGC += "}\n"; } void RModel::Generate(std::underlying_type_t options, int batchSize, long pos, bool verbose) @@ -1653,6 +1819,4 @@ void RModel::Streamer(TBuffer &R__b) { } } -}//SOFIE -}//Experimental -}//TMVA +} // namespace TMVA::Experimental::SOFIE diff --git a/tmva/sofie/test/CMakeLists.txt b/tmva/sofie/test/CMakeLists.txt index 1a9295237cff1..120b5800b2f8d 100644 --- a/tmva/sofie/test/CMakeLists.txt +++ b/tmva/sofie/test/CMakeLists.txt @@ -48,14 +48,20 @@ ROOTTEST_ADD_TEST(SofieCompileModels_ONNX # Creating a Google Test if (BLAS_FOUND) # we need BLAS for compiling the models ROOT_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx - 
LIBRARIES - Core - GTest::gtest - GTest::gtest_main + LIBRARIES Core GTest::gtest GTest::gtest_main ) ROOTTEST_ADD_TEST(TestCustomModelsFromONNX EXEC ./TestCustomModelsFromONNX FIXTURES_REQUIRED sofie-compile-models-onnx) + + if (clad) + ROOT_EXECUTABLE(TestCladAutodiff TestCladAutodiff.cxx + LIBRARIES Core GTest::gtest GTest::gtest_main + ) + ROOTTEST_ADD_TEST(TestCladAutodiff + EXEC ./TestCladAutodiff + FIXTURES_REQUIRED sofie-compile-models-onnx) + endif() endif() # For testing serialisation of RModel object diff --git a/tmva/sofie/test/TestCladAutodiff.cxx b/tmva/sofie/test/TestCladAutodiff.cxx new file mode 100644 index 0000000000000..bdd040e4ea03f --- /dev/null +++ b/tmva/sofie/test/TestCladAutodiff.cxx @@ -0,0 +1,111 @@ +constexpr auto modelHeaderSuffix = "_FromONNX.hxx"; +constexpr auto modelDataSuffix = "_FromONNX.dat"; +#include "test_helpers.h" + +#include "input_models/references/Linear_16.ref.hxx" + +#include "gtest/gtest.h" + +// Test differentiating a fully-connected neural network with Clad. 
+// Extension of the ONNX.Linear16 test in TestCustomModelsFromONNX.cxx +TEST(ONNXClad, Linear16) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(1600); + std::fill_n(input.data(), input.size(), 1.0f); + + ASSERT_INCLUDE_AND_RUN(std::vector, "Linear_16", input); + + gInterpreter->Declare(R"( +#include + +float Linear_16_wrapper(TMVA_SOFIE_Linear_16::Session const &session, float const *input) +{ + float out[160]{}; + float output_sum = 0.0; + + TMVA_SOFIE_Linear_16::doInfer(session, input, out); + + for (std::size_t i = 0; i < std::size(out); ++i) { + output_sum += out[i]; + } + return output_sum; +} + +float Linear_16_outer_wrapper(TMVA_SOFIE_Linear_16::Session const &session, float const *input) +{ + return Linear_16_wrapper(session, input); +} + +float Linear_16_wrapper_num_diff(TMVA_SOFIE_Linear_16::Session const &session, float *input, std::size_t i) +{ + const float origVal = input[i]; + + const float eps = 1e-3; + input[i] = origVal - eps; + float funcValDown = Linear_16_wrapper(session, input); + input[i] = origVal + eps; + float funcValUp = Linear_16_wrapper(session, input); + input[i] = origVal; + + return (funcValUp - funcValDown) / (2 * eps); +} + )"); + + auto inputInterp = toInterpreter(input, "std::vector", true); + + // Why do we have two wrappers, the <>_wrapper and the <>_outer_wrapper? + // This is because we are not interested in the created gradient function. + // We are interested in the more low-level *pullback* function, which takes + // also the data structures for the reverse pass as function arguments. Like + // this, we can initialize the session for the backward pass once and re-use + // it. The trick to get the wrapper pullback is to create another wrapper + // around the wrapper, and creating the gradient for the outer wrapper + // implicitly creates the pullback for the inner wrapper. 
+ gInterpreter->ProcessLine("clad::gradient(Linear_16_outer_wrapper, \"input\");"); + + // Create two session data structures: one for the forward, and one for the backward pass + gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::Session session_linear_16{\"Linear_16_FromONNX.dat\"};"); + gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::Session _d_session_linear_16{\"Linear_16_FromONNX.dat\"};"); + + gInterpreter->ProcessLine("float grad_output[1600]{};"); + gInterpreter->ProcessLine( + ("Linear_16_wrapper_pullback(session_linear_16, " + inputInterp + ", 1, &_d_session_linear_16, grad_output)") + .c_str()); + + // If you want to see the gradient code: + // gInterpreter->ProcessLine("static_cast(Linear_16_outer_wrapper_grad_1)"); gInterpreter->ProcessLine("Linear_16_wrapper_pullback"); + // gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::doInfer_reverse_forw"); + // gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::doInfer_pullback"); + + auto retVal = gInterpreter->ProcessLine((R"( + double maxDiff = 0; + for (std::size_t i = 0; i < std::size(grad_output); ++i) { + double val = grad_output[i]; + double ref = Linear_16_wrapper_num_diff(session_linear_16, )" + + inputInterp + R"(, i); + if (val != ref) { + maxDiff = std::max(std::abs(val - ref), maxDiff); + } + } + double tol = 0.0025; + // the "return" value + (maxDiff < tol); + )") + .c_str()); + + EXPECT_EQ(retVal, 1) << "The gradient from Clad and the numeric gradient didn't match within tolerance."; + + // Checking output size + EXPECT_EQ(output.size(), sizeof(Linear_16_ExpectedOutput::all_ones) / sizeof(float)); + + float *correct = Linear_16_ExpectedOutput::all_ones; + + // Checking every output value, one by one + for (size_t i = 0; i < output.size(); ++i) { + EXPECT_LE(std::abs(output[i] - correct[i]), TOLERANCE); + } +}