//Code generated automatically by TMVA for Inference of Model file [core] at [Sun Nov 16 02:57:27 2025] 

#ifndef ROOT_TMVA_SOFIE_CORE
#define ROOT_TMVA_SOFIE_CORE

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "TMVA/SOFIE_common.hxx"

namespace TMVA_SOFIE_core{
// Declarations of the single-precision BLAS routines referenced by the
// generated code. They use C linkage, carry a trailing underscore and take
// every argument (including scalars) by pointer, so the definitions must come
// from whatever BLAS library the final binary is linked against.
namespace BLAS{
	// saxpy_: applied in the layer-normalization step to add the broadcast
	// offset tensor (x, scaled by alpha) onto the normalized output (y).
	extern "C" void saxpy_(const int * n, const float * alpha, const float * x,
	                         const int * incx, float * y, const int * incy);
	// sgemv_: matrix-vector product.
	// NOTE(review): Y is declared `const float *` here although reference BLAS
	// writes its result into Y. Left untouched because another declaration of
	// the same C symbol elsewhere could conflict if the types differ -- confirm
	// against the code generator before changing.
	extern "C" void sgemv_(const char * trans, const int * m, const int * n, const float * alpha, const float * A,
	                       const int * lda, const float * X, const int * incx, const float * beta, const float * Y, const int * incy);
	// sgemm_: matrix-matrix product, result written to C.
	extern "C" void sgemm_(const char * transa, const char * transb, const int * m, const int * n, const int * k,
	                       const float * alpha, const float * A, const int * lda, const float * B, const int * ldb,
	                       const float * beta, float * C, const int * ldc);
}//BLAS


namespace Edge_Update{
struct Session {

//--------- GNN_Update_Function---edge_update
// Storage for the edge-update block.
//
// Every tensor is declared as a pair: an owning std::vector<float> (fTensor_*)
// immediately followed by a raw float* (tensor_*) that points at the vector's
// storage. Members are initialised in declaration order, so each vector must
// stay declared before its pointer.
//
// NOTE(review): the implicitly generated copy operations copy the raw pointer
// values, so a copied Session would keep pointing into the source object's
// vectors -- confirm that Session objects are never copied.

// initialized tensors: fixed-size parameters, filled from the weight file by the constructor
// layer-normalization scale (100 values)
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normscale0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normscale0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normscale0.data();
// weights (10000 values) and bias (100 values) used by the fourth Gemm
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0.data();
// layer-normalization offset (100 values); broadcast per edge by the constructor
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0.data();
// weights (10000 values) and bias (100 values) used by the third Gemm
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0.data();
// weights of the second Gemm (10000 values)
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1w0.data();
// weights of the first Gemm (80000 values, applied to the 800-wide concatenated input)
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0w0 = std::vector<float>(80000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0w0.data();
// biases of the second and first Gemm (100 values each)
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0.data();

// --- Positioning intermediate tensor memory --//--- declare the dynamic tensors
// These start empty with a null pointer. The constructor resizes each vector
// from its num_edges argument and then points the matching tensor_* at it.
// layer-normalization offset replicated for every edge
std::vector<float> fTensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0;
float * tensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0 = nullptr;
// per-edge inverse standard deviation and mean computed by the layer normalization
std::vector<float> fTensor_InvStdDevedge_updateRelu4;
float * tensor_InvStdDevedge_updateRelu4 = nullptr;
std::vector<float> fTensor_Meanedge_updateRelu4;
float * tensor_Meanedge_updateRelu4 = nullptr;
// final (layer-normalized) result that doInfer hands to FillOutput
std::vector<float> fTensor_edge_updateRelu4output;
float * tensor_edge_updateRelu4output = nullptr;
// Gemm / ReLU intermediates and the per-edge broadcast Gemm biases (*bcast)
std::vector<float> fTensor_edge_updateGemm0;
float * tensor_edge_updateGemm0 = nullptr;
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0bcast;
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0bcast = nullptr;
std::vector<float> fTensor_edge_updateRelu0;
float * tensor_edge_updateRelu0 = nullptr;
std::vector<float> fTensor_edge_updateRelu2;
float * tensor_edge_updateRelu2 = nullptr;
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0bcast;
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0bcast = nullptr;
std::vector<float> fTensor_edge_updateGemm4;
float * tensor_edge_updateGemm4 = nullptr;
std::vector<float> fTensor_edge_updateGemm1;
float * tensor_edge_updateGemm1 = nullptr;
std::vector<float> fTensor_edge_updateRelu1;
float * tensor_edge_updateRelu1 = nullptr;
std::vector<float> fTensor_edge_updateGemm2;
float * tensor_edge_updateGemm2 = nullptr;
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0bcast;
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0bcast = nullptr;
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0bcast;
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0bcast = nullptr;
// row-wise concatenation of the four doInfer inputs (800 values per edge)
std::vector<float> fTensor_edge_updateInputConcat;
float * tensor_edge_updateInputConcat = nullptr;
// output of the last ReLU, i.e. the input of the layer normalization
std::vector<float> fTensor_edge_updateRelu4;
float * tensor_edge_updateRelu4 = nullptr;


/// Builds an edge-update session: loads the parameters from `filename`, sizes
/// every intermediate buffer for `num_edges` edges and pre-computes the
/// per-edge copies of the Gemm biases and of the layer-normalization offset.
///
/// \param filename   weight file; the parameters are read from offset 0
/// \param num_edges  number of edges the intermediate buffers are sized for
/// \throws std::runtime_error if the weight file cannot be opened
Session(std::string filename ="core.dat",
        size_t num_edges = 300) {

   // ---- load the parameters, in the order they are stored in the file ----
   std::ifstream weightFile(filename);
   if (!weightFile.is_open()) {
      throw std::runtime_error("tmva-sofie failed to open file " + filename + " for input weights");
   }
   weightFile.seekg(0);
   using TMVA::Experimental::SOFIE::ReadTensorFromStream;
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normscale0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normscale0", 100);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3w0", 10000);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0", 100);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0", 100);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2w0", 10000);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0", 100);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1w0", 10000);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0w0", 80000);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0", 100);
   ReadTensorFromStream(weightFile, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0", 100);
   weightFile.close();

   // ---- size the intermediate buffers ----
   // Resizes `storage` to `count` floats and points `view` at it; a zero
   // count leaves the vector empty and the pointer null.
   const auto allocate = [](std::vector<float> & storage, float *& view, size_t count) {
      if (count > 0) {
         storage.resize(count);
         view = storage.data();
      }
   };
   const size_t perEdge   = num_edges;        // one value per edge
   const size_t perHidden = num_edges * 100;  // 100 values per edge
   const size_t perConcat = num_edges * 800;  // 800 values per edge

   allocate(fTensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0, tensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0, perHidden);
   allocate(fTensor_InvStdDevedge_updateRelu4, tensor_InvStdDevedge_updateRelu4, perEdge);
   allocate(fTensor_Meanedge_updateRelu4, tensor_Meanedge_updateRelu4, perEdge);
   allocate(fTensor_edge_updateRelu4output, tensor_edge_updateRelu4output, perHidden);
   allocate(fTensor_edge_updateGemm0, tensor_edge_updateGemm0, perHidden);
   allocate(fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0bcast, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0bcast, perHidden);
   allocate(fTensor_edge_updateRelu0, tensor_edge_updateRelu0, perHidden);
   allocate(fTensor_edge_updateRelu2, tensor_edge_updateRelu2, perHidden);
   allocate(fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0bcast, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0bcast, perHidden);
   allocate(fTensor_edge_updateGemm4, tensor_edge_updateGemm4, perHidden);
   allocate(fTensor_edge_updateGemm1, tensor_edge_updateGemm1, perHidden);
   allocate(fTensor_edge_updateRelu1, tensor_edge_updateRelu1, perHidden);
   allocate(fTensor_edge_updateGemm2, tensor_edge_updateGemm2, perHidden);
   allocate(fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0bcast, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0bcast, perHidden);
   allocate(fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0bcast, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0bcast, perHidden);
   allocate(fTensor_edge_updateInputConcat, tensor_edge_updateInputConcat, perConcat);
   allocate(fTensor_edge_updateRelu4, tensor_edge_updateRelu4, perHidden);

   // ---- replicate the 100-value bias vectors for every edge ----
   // UnidirectionalBroadcast hands back a new[]-allocated array, which is
   // copied into the session buffer and released again.
   const auto replicate = [num_edges](float * source, float * target) {
      float * expanded = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(source, { 100 }, { num_edges , 100 });
      std::copy(expanded, expanded + num_edges * 100, target);
      delete [] expanded;
   };
   // biases of the four Gemm ops
   replicate(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0bcast);
   replicate(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0bcast);
   replicate(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0bcast);
   replicate(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0bcast);
   // bias of the LayerNormalization op
   replicate(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0, tensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0);
}

void doInfer(size_t num_edges,float const* tensor_edge,float const* tensor_receiver,float const* tensor_sender,float const* tensor_global,  std::vector<float> &output_tensor_edge_updateRelu4output ){


//--------- Concat op_0 --> { num_edges , 800 }
   for (size_t i0 = 0; i0 < num_edges; ++i0) {
         int idxOut = 800*i0;
         int idxIn0 = 200*i0;
         for (size_t iC = 0; iC < 200; ++iC) {
            tensor_edge_updateInputConcat[idxOut+iC] = tensor_edge[idxIn0+iC];
         }
         idxOut += 200;
         int idxIn1 = 200*i0;
         for (size_t iC = 0; iC < 200; ++iC) {
            tensor_edge_updateInputConcat[idxOut+iC] = tensor_receiver[idxIn1+iC];
         }
         idxOut += 200;
         int idxIn2 = 200*i0;
         for (size_t iC = 0; iC < 200; ++iC) {
            tensor_edge_updateInputConcat[idxOut+iC] = tensor_sender[idxIn2+iC];
         }
         idxOut += 200;
         int idxIn3 = 200*i0;
         for (size_t iC = 0; iC < 200; ++iC) {
            tensor_edge_updateInputConcat[idxOut+iC] = tensor_global[idxIn3+iC];
         }
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_edge_updateGemm0, false, false, 100, num_edges, 800, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0w0, tensor_edge_updateInputConcat, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_0b0bcast);

//------ RELU
   for (int id = 0; id < num_edges * 100 ; id++){
      tensor_edge_updateRelu0[id] = ((tensor_edge_updateGemm0[id] > 0 )? tensor_edge_updateGemm0[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_edge_updateGemm1, false, false, 100, num_edges, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1w0, tensor_edge_updateRelu0, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_1b0bcast);

//------ RELU
   for (int id = 0; id < num_edges * 100 ; id++){
      tensor_edge_updateRelu1[id] = ((tensor_edge_updateGemm1[id] > 0 )? tensor_edge_updateGemm1[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_edge_updateGemm2, false, false, 100, num_edges, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2w0, tensor_edge_updateRelu1, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_2b0bcast);

//------ RELU
   for (int id = 0; id < num_edges * 100 ; id++){
      tensor_edge_updateRelu2[id] = ((tensor_edge_updateGemm2[id] > 0 )? tensor_edge_updateGemm2[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_edge_updateGemm4, false, false, 100, num_edges, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3w0, tensor_edge_updateRelu2, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blockmlplinear_3b0bcast);

//------ RELU
   for (int id = 0; id < num_edges * 100 ; id++){
      tensor_edge_updateRelu4[id] = ((tensor_edge_updateGemm4[id] > 0 )? tensor_edge_updateGemm4[id] : 0);
   }
//---- Layer Normalization  operator op_9
   // Compute the mean
   for (size_t axis_0 = 0; axis_0 < num_edges; axis_0++) {
      float sum = 0.;
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++) {
         sum += tensor_edge_updateRelu4[axis_0 * 100 + axis_1 * 1];
      }
      tensor_Meanedge_updateRelu4[axis_0 * 1] = sum / float(100);
   }
   // Compute the inverse Standard Deviation
   for (size_t axis_0 = 0; axis_0 < num_edges; axis_0++){
      float sum = 0.;
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++){
         float tmp = tensor_edge_updateRelu4[axis_0 * 100 + axis_1 * 1] - tensor_Meanedge_updateRelu4[axis_0 * 1];
         sum += tmp*tmp;
      }
      tensor_InvStdDevedge_updateRelu4[axis_0 * 1] = 1 / std::sqrt(sum / float(100) + 1e-05);
   }
   // Y = Scale o InvStdDev (X - Mean)
   for (size_t axis_0 = 0; axis_0 < num_edges; axis_0++){
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++){
         tensor_edge_updateRelu4output[axis_0 * 100 + axis_1 * 1] = tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normscale0[axis_1 * 1] * tensor_InvStdDevedge_updateRelu4[axis_0 * 1] * (tensor_edge_updateRelu4[axis_0 * 100 + axis_1 * 1] - tensor_Meanedge_updateRelu4[axis_0 * 1]);
      }
   }
   // Add the bias to Y
   int op_9_n = num_edges * 100;
   float op_9_alpha = 1.;
   int op_9_inc = 1;
   BLAS::saxpy_(&op_9_n, &op_9_alpha, tensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networkedge_blocklayer_normoffset0, &op_9_inc, tensor_edge_updateRelu4output, &op_9_inc);
   using TMVA::Experimental::SOFIE::UTILITY::FillOutput;

   FillOutput(tensor_edge_updateRelu4output, output_tensor_edge_updateRelu4output, num_edges * 100);
}



/// Convenience wrapper around doInfer that returns the result by value.
///
/// The arguments are forwarded unchanged to doInfer. The local vector is
/// returned by name so it can be elided or moved; the previous
/// `return {vector};` form copy-constructed the result from an lvalue.
///
/// \return the vector filled by doInfer
std::vector<float> infer(size_t num_edges,float const* tensor_edge,float const* tensor_receiver,float const* tensor_sender,float const* tensor_global){
   std::vector<float> output_tensor_edge_updateRelu4output;
   doInfer(num_edges, tensor_edge, tensor_receiver, tensor_sender, tensor_global, output_tensor_edge_updateRelu4output);
   return output_tensor_edge_updateRelu4output;
}
};
}


namespace Node_Update{
struct Session {

//--------- GNN_Update_Function---node_update
// Storage for the node-update block.
//
// Every tensor is declared as a pair: an owning std::vector<float> (fTensor_*)
// immediately followed by a raw float* (tensor_*) that points at the vector's
// storage. Members are initialised in declaration order, so each vector must
// stay declared before its pointer.
//
// NOTE(review): the implicitly generated copy operations copy the raw pointer
// values, so a copied Session would keep pointing into the source object's
// vectors -- confirm that Session objects are never copied.

// initialized tensors: fixed-size parameters, filled from the weight file by the constructor
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normscale0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normscale0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normscale0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0.data();
// layer-normalization offset (100 values); broadcast per node by the constructor
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0.data();
// largest parameter tensor of this block (50000 values)
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0w0 = std::vector<float>(50000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0.data();

// --- Positioning intermediate tensor memory --//--- declare the dynamic tensors
// These start empty with a null pointer. The constructor resizes each vector
// from its num_nodes argument and then points the matching tensor_* at it.
// one value per node
std::vector<float> fTensor_Meannode_updateRelu4;
float * tensor_Meannode_updateRelu4 = nullptr;
// *bcast tensors hold a Gemm bias replicated for every node (100 values per node)
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0bcast;
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0bcast = nullptr;
std::vector<float> fTensor_node_updateGemm0;
float * tensor_node_updateGemm0 = nullptr;
std::vector<float> fTensor_node_updateRelu1;
float * tensor_node_updateRelu1 = nullptr;
std::vector<float> fTensor_node_updateRelu0;
float * tensor_node_updateRelu0 = nullptr;
// layer-normalization offset replicated for every node
std::vector<float> fTensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0;
float * tensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0 = nullptr;
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0bcast;
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0bcast = nullptr;
std::vector<float> fTensor_node_updateGemm1;
float * tensor_node_updateGemm1 = nullptr;
// one value per node
std::vector<float> fTensor_InvStdDevnode_updateRelu4;
float * tensor_InvStdDevnode_updateRelu4 = nullptr;
std::vector<float> fTensor_node_updateRelu4output;
float * tensor_node_updateRelu4output = nullptr;
std::vector<float> fTensor_node_updateGemm2;
float * tensor_node_updateGemm2 = nullptr;
// concatenated doInfer inputs (500 values per node)
std::vector<float> fTensor_node_updateInputConcat;
float * tensor_node_updateInputConcat = nullptr;
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0bcast;
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0bcast = nullptr;
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0bcast;
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0bcast = nullptr;
std::vector<float> fTensor_node_updateRelu2;
float * tensor_node_updateRelu2 = nullptr;
std::vector<float> fTensor_node_updateGemm4;
float * tensor_node_updateGemm4 = nullptr;
std::vector<float> fTensor_node_updateRelu4;
float * tensor_node_updateRelu4 = nullptr;


Session(std::string filename ="core.dat",
        size_t num_nodes = 100) {

//--- reading weights from file
   std::ifstream f;
   f.open(filename);
   if (!f.is_open()) {
      throw std::runtime_error("tmva-sofie failed to open file " + filename + " for input weights");
   }
   f.seekg(1489556);
   using TMVA::Experimental::SOFIE::ReadTensorFromStream;
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normscale0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normscale0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2w0", 10000);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1w0", 10000);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3w0", 10000);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0w0", 50000);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0", 100);
   f.close();

   if (num_nodes > 0) {
      fTensor_Meannode_updateRelu4.resize(num_nodes);
      tensor_Meannode_updateRelu4 = fTensor_Meannode_updateRelu4.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0bcast.resize(num_nodes * 100);
      tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0bcast = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0bcast.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateGemm0.resize(num_nodes * 100);
      tensor_node_updateGemm0 = fTensor_node_updateGemm0.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateRelu1.resize(num_nodes * 100);
      tensor_node_updateRelu1 = fTensor_node_updateRelu1.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateRelu0.resize(num_nodes * 100);
      tensor_node_updateRelu0 = fTensor_node_updateRelu0.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0.resize(num_nodes * 100);
      tensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0 = fTensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0bcast.resize(num_nodes * 100);
      tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0bcast = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0bcast.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateGemm1.resize(num_nodes * 100);
      tensor_node_updateGemm1 = fTensor_node_updateGemm1.data();
   }
   if (num_nodes > 0) {
      fTensor_InvStdDevnode_updateRelu4.resize(num_nodes);
      tensor_InvStdDevnode_updateRelu4 = fTensor_InvStdDevnode_updateRelu4.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateRelu4output.resize(num_nodes * 100);
      tensor_node_updateRelu4output = fTensor_node_updateRelu4output.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateGemm2.resize(num_nodes * 100);
      tensor_node_updateGemm2 = fTensor_node_updateGemm2.data();
   }
   if (num_nodes * 500 > 0) {
      fTensor_node_updateInputConcat.resize(num_nodes * 500);
      tensor_node_updateInputConcat = fTensor_node_updateInputConcat.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0bcast.resize(num_nodes * 100);
      tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0bcast = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0bcast.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0bcast.resize(num_nodes * 100);
      tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0bcast = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0bcast.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateRelu2.resize(num_nodes * 100);
      tensor_node_updateRelu2 = fTensor_node_updateRelu2.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateGemm4.resize(num_nodes * 100);
      tensor_node_updateGemm4 = fTensor_node_updateGemm4.data();
   }
   if (num_nodes * 100 > 0) {
      fTensor_node_updateRelu4.resize(num_nodes * 100);
      tensor_node_updateRelu4 = fTensor_node_updateRelu4.data();
   }
//--- broadcast bias tensor EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0for Gemm op
   {
      float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0,{ 100 }, { num_nodes , 100 });
      std::copy(data, data + num_nodes * 100, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0bcast);
      delete [] data;
   }
//--- broadcast bias tensor EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0for Gemm op
   {
      float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0,{ 100 }, { num_nodes , 100 });
      std::copy(data, data + num_nodes * 100, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0bcast);
      delete [] data;
   }
//--- broadcast bias tensor EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0for Gemm op
   {
      float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0,{ 100 }, { num_nodes , 100 });
      std::copy(data, data + num_nodes * 100, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0bcast);
      delete [] data;
   }
//--- broadcast bias tensor EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0for Gemm op
   {
      float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0,{ 100 }, { num_nodes , 100 });
      std::copy(data, data + num_nodes * 100, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0bcast);
      delete [] data;
   }
   // Broadcasting the bias of LayerNormalization op
   {
      float* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0, { 100 }, { num_nodes , 100 });
   std::copy(data, data + num_nodes * 100, tensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0);
   delete[] data;
   }
}

/// Forward pass of the node-update block for `num_nodes` nodes.
///
/// Steps: concatenate the three inputs row by row, run four Gemm + ReLU
/// layers, layer-normalize every row of 100 activations, add the broadcast
/// normalization offset, then pass the result to `FillOutput`.
///
/// @param num_nodes      Number of rows to process.
/// @param tensor_edge    Read as num_nodes rows of 100 floats.
/// @param tensor_node    Read as num_nodes rows of 200 floats.
/// @param tensor_global  Read as num_nodes rows of 200 floats.
/// @param output_tensor_node_updateRelu4output  Handed to `FillOutput`
///        together with the num_nodes * 100 result values.
///
/// NOTE(review): every member buffer written here is assumed to hold at least
/// `num_nodes` rows; the allocation happens outside this function, so confirm
/// callers never pass a larger `num_nodes` than the session was sized for.
void doInfer(size_t num_nodes,float const* tensor_edge,float const* tensor_node,float const* tensor_global,  std::vector<float> &output_tensor_node_updateRelu4output ){

   // Element count of each { num_nodes , 100 } activation tensor. Kept as size_t
   // so the loops below never compare a signed counter with an unsigned bound.
   const size_t n_activations = num_nodes * 100;

//--------- Concat op_0 --> { num_nodes , 500 }
   // Output row layout: [ 100 of tensor_edge | 200 of tensor_node | 200 of tensor_global ]
   for (size_t i0 = 0; i0 < num_nodes; ++i0) {
      float * out_row = tensor_node_updateInputConcat + 500 * i0;
      std::copy(tensor_edge + 100 * i0, tensor_edge + 100 * (i0 + 1), out_row);
      std::copy(tensor_node + 200 * i0, tensor_node + 200 * (i0 + 1), out_row + 100);
      std::copy(tensor_global + 200 * i0, tensor_global + 200 * (i0 + 1), out_row + 300);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_node_updateGemm0, false, false, 100, num_nodes, 500, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0w0, tensor_node_updateInputConcat, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_0b0bcast);

//------ RELU
   for (size_t id = 0; id < n_activations; ++id){
      tensor_node_updateRelu0[id] = ((tensor_node_updateGemm0[id] > 0 )? tensor_node_updateGemm0[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_node_updateGemm1, false, false, 100, num_nodes, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1w0, tensor_node_updateRelu0, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_1b0bcast);

//------ RELU
   for (size_t id = 0; id < n_activations; ++id){
      tensor_node_updateRelu1[id] = ((tensor_node_updateGemm1[id] > 0 )? tensor_node_updateGemm1[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_node_updateGemm2, false, false, 100, num_nodes, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2w0, tensor_node_updateRelu1, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_2b0bcast);

//------ RELU
   for (size_t id = 0; id < n_activations; ++id){
      tensor_node_updateRelu2[id] = ((tensor_node_updateGemm2[id] > 0 )? tensor_node_updateGemm2[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_node_updateGemm4, false, false, 100, num_nodes, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3w0, tensor_node_updateRelu2, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blockmlplinear_3b0bcast);

//------ RELU
   for (size_t id = 0; id < n_activations; ++id){
      tensor_node_updateRelu4[id] = ((tensor_node_updateGemm4[id] > 0 )? tensor_node_updateGemm4[id] : 0);
   }
//---- Layer Normalization  operator op_9
   // Compute the mean of every row
   for (size_t axis_0 = 0; axis_0 < num_nodes; axis_0++) {
      const float * in_row = tensor_node_updateRelu4 + axis_0 * 100;
      float sum = 0.;
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++) {
         sum += in_row[axis_1];
      }
      tensor_Meannode_updateRelu4[axis_0] = sum / float(100);
   }
   // Compute the inverse Standard Deviation of every row
   for (size_t axis_0 = 0; axis_0 < num_nodes; axis_0++){
      const float * in_row = tensor_node_updateRelu4 + axis_0 * 100;
      const float mean = tensor_Meannode_updateRelu4[axis_0];
      float sum = 0.;
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++){
         float tmp = in_row[axis_1] - mean;
         sum += tmp*tmp;
      }
      // The double literal 1e-05 is intentional: it keeps the square root in
      // double precision exactly as before, so results stay bit-identical.
      tensor_InvStdDevnode_updateRelu4[axis_0] = 1 / std::sqrt(sum / float(100) + 1e-05);
   }
   // Y = Scale o InvStdDev (X - Mean)
   for (size_t axis_0 = 0; axis_0 < num_nodes; axis_0++){
      const float * in_row = tensor_node_updateRelu4 + axis_0 * 100;
      float * out_row = tensor_node_updateRelu4output + axis_0 * 100;
      const float mean = tensor_Meannode_updateRelu4[axis_0];
      const float inv_std = tensor_InvStdDevnode_updateRelu4[axis_0];
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++){
         out_row[axis_1] = tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normscale0[axis_1] * inv_std * (in_row[axis_1] - mean);
      }
   }
   // Add the bias to Y (saxpy_ takes its length as int, hence the explicit cast)
   int op_9_n = static_cast<int>(n_activations);
   float op_9_alpha = 1.;
   int op_9_inc = 1;
   BLAS::saxpy_(&op_9_n, &op_9_alpha, tensor_BroadcastedEncodeProcessDecodeMLPGraphNetworkgraph_networknode_blocklayer_normoffset0, &op_9_inc, tensor_node_updateRelu4output, &op_9_inc);
   using TMVA::Experimental::SOFIE::UTILITY::FillOutput;

   FillOutput(tensor_node_updateRelu4output, output_tensor_node_updateRelu4output, n_activations);
}



/// Convenience wrapper around doInfer() that returns the result by value.
///
/// @param num_nodes      Number of nodes, forwarded to doInfer().
/// @param tensor_edge    Forwarded to doInfer().
/// @param tensor_node    Forwarded to doInfer().
/// @param tensor_global  Forwarded to doInfer().
/// @return The vector filled by doInfer().
std::vector<float> infer(size_t num_nodes,float const* tensor_edge,float const* tensor_node,float const* tensor_global){
   std::vector<float > output_tensor_node_updateRelu4output;
   doInfer(num_nodes,tensor_edge,tensor_node,tensor_global, output_tensor_node_updateRelu4output );
   // Return the local by name: `return {vec};` would copy-construct the result
   // from an lvalue, whereas this form allows NRVO / an implicit move.
   return output_tensor_node_updateRelu4output;
}
};
}


namespace Global_Update{
/// Inference session for the global-update block: Concat -> 4 x (Gemm + ReLU)
/// -> LayerNormalization, operating on a single row.
///
/// NOTE: every raw `tensor_*` pointer below is initialized from a vector (or
/// from the memory pool) of the object being constructed, so the declaration
/// order of the members matters. No copy/move operations are declared, so the
/// compiler-generated ones copy those pointers verbatim: a copied Session
/// keeps pointing into the storage of the object it was copied from. Prefer
/// passing sessions by reference.
struct Session {

//--------- GNN_Update_Function---global_update
// initialized tensors
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3b0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normscale0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normscale0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normscale0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normoffset0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normoffset0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normoffset0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2b0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1w0 = std::vector<float>(10000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1b0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0w0 = std::vector<float>(40000);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0w0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0w0.data();
std::vector<float> fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0b0 = std::vector<float>(100);
float * tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0b0 = fTensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0b0.data();

//--- Allocating session memory pool to be used for allocating intermediate tensors
// 5200 bytes = 1600 (concat input) + 9 x 400 (one slot per intermediate below).
std::vector<char> fIntermediateMemoryPool = std::vector<char>(5200);


// --- Positioning intermediate tensor memory --
// Each pointer is a fixed byte offset into the pool; the slots do not overlap.
 // Allocating memory for intermediate tensor global_updateInputConcat with size 1600 bytes
float* tensor_global_updateInputConcat = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 0);

 // Allocating memory for intermediate tensor global_updateGemm0 with size 400 bytes
float* tensor_global_updateGemm0 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 1600);

 // Allocating memory for intermediate tensor global_updateRelu0 with size 400 bytes
float* tensor_global_updateRelu0 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 2000);

 // Allocating memory for intermediate tensor global_updateGemm1 with size 400 bytes
float* tensor_global_updateGemm1 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 2400);

 // Allocating memory for intermediate tensor global_updateRelu1 with size 400 bytes
float* tensor_global_updateRelu1 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 2800);

 // Allocating memory for intermediate tensor global_updateGemm2 with size 400 bytes
float* tensor_global_updateGemm2 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 3200);

 // Allocating memory for intermediate tensor global_updateRelu2 with size 400 bytes
float* tensor_global_updateRelu2 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 3600);

 // Allocating memory for intermediate tensor global_updateGemm4 with size 400 bytes
float* tensor_global_updateGemm4 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 4000);

 // Allocating memory for intermediate tensor global_updateRelu4 with size 400 bytes
float* tensor_global_updateRelu4 = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 4400);

 // Allocating memory for intermediate tensor global_updateRelu4output with size 400 bytes
float* tensor_global_updateRelu4output = reinterpret_cast<float*>(fIntermediateMemoryPool.data() + 4800);

//--- declare and allocate the intermediate tensors
// Per-row statistics of the layer normalization (a single row here).
std::vector<float> fTensor_Meanglobal_updateRelu4 = std::vector<float>(1);
float * tensor_Meanglobal_updateRelu4 = fTensor_Meanglobal_updateRelu4.data();
std::vector<float> fTensor_InvStdDevglobal_updateRelu4 = std::vector<float>(1);
float * tensor_InvStdDevglobal_updateRelu4 = fTensor_InvStdDevglobal_updateRelu4.data();


/// Load the weights of the global-update block.
///
/// @param filename  Weight file to read; defaults to "core.dat".
/// @throws std::runtime_error if the file cannot be opened.
///
/// Reading starts at stream position 2566957 and the tensors are consumed in
/// the exact order listed below, so neither the offset nor the order may be
/// changed independently of the weight file.
/// NOTE(review): ReadTensorFromStream is defined outside this file; it is
/// presumably what reports a name or length mismatch — confirm.
Session(std::string filename ="core.dat") {

//--- reading weights from file
   std::ifstream f;
   f.open(filename);
   if (!f.is_open()) {
      throw std::runtime_error("tmva-sofie failed to open file " + filename + " for input weights");
   }
   f.seekg(2566957);
   using TMVA::Experimental::SOFIE::ReadTensorFromStream;
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3w0", 10000);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3b0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normscale0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normscale0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2w0", 10000);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normoffset0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normoffset0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2b0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1w0", 10000);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1b0", 100);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0w0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0w0", 40000);
   ReadTensorFromStream(f, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0b0, "tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0b0", 100);
   f.close();

}

/// Forward pass of the global-update block.
///
/// @param tensor_edge    100 floats are read.
/// @param tensor_node    100 floats are read.
/// @param tensor_global  200 floats are read.
/// @param output_tensor_global_updateRelu4output  Handed to `FillOutput`
///        together with the 100 result values.
void doInfer(float const* tensor_edge,float const* tensor_node,float const* tensor_global,  std::vector<float> &output_tensor_global_updateRelu4output ){


//--------- Concat op_0 --> { 1 , 400 }
   // Layout: [ 100 of tensor_edge | 100 of tensor_node | 200 of tensor_global ]
   std::copy(tensor_edge, tensor_edge+100, tensor_global_updateInputConcat);
   std::copy(tensor_node, tensor_node+100, tensor_global_updateInputConcat + 100);
   std::copy(tensor_global, tensor_global+200, tensor_global_updateInputConcat + 100 + 100);

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_global_updateGemm0, false, false, 100, 1, 400, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0w0, tensor_global_updateInputConcat, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_0b0);

//------ RELU
   for (int id = 0; id < 100 ; id++){
      tensor_global_updateRelu0[id] = ((tensor_global_updateGemm0[id] > 0 )? tensor_global_updateGemm0[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_global_updateGemm1, false, false, 100, 1, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1w0, tensor_global_updateRelu0, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_1b0);

//------ RELU
   for (int id = 0; id < 100 ; id++){
      tensor_global_updateRelu1[id] = ((tensor_global_updateGemm1[id] > 0 )? tensor_global_updateGemm1[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_global_updateGemm2, false, false, 100, 1, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2w0, tensor_global_updateRelu1, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_2b0);

//------ RELU
   for (int id = 0; id < 100 ; id++){
      tensor_global_updateRelu2[id] = ((tensor_global_updateGemm2[id] > 0 )? tensor_global_updateGemm2[id] : 0);
   }

//--------- Gemm
   TMVA::Experimental::SOFIE::Gemm_Call(tensor_global_updateGemm4, false, false, 100, 1, 100, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3w0, tensor_global_updateRelu2, 1,tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blockmlplinear_3b0);

//------ RELU
   for (int id = 0; id < 100 ; id++){
      tensor_global_updateRelu4[id] = ((tensor_global_updateGemm4[id] > 0 )? tensor_global_updateGemm4[id] : 0);
   }
//---- Layer Normalization  operator op_9
   // Compute the mean
   for (size_t axis_0 = 0; axis_0 < 1; axis_0++) {
      float sum = 0.;
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++) {
         sum += tensor_global_updateRelu4[axis_0 * 100 + axis_1 * 1];
      }
      tensor_Meanglobal_updateRelu4[axis_0 * 1] = sum / float(100);
   }
   // Compute the inverse Standard Deviation
   for (size_t axis_0 = 0; axis_0 < 1; axis_0++){
      float sum = 0.;
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++){
         float tmp = tensor_global_updateRelu4[axis_0 * 100 + axis_1 * 1] - tensor_Meanglobal_updateRelu4[axis_0 * 1];
         sum += tmp*tmp;
      }
      // 1e-05 is a double literal, so the square root is evaluated in double
      // precision before the result is narrowed back to float.
      tensor_InvStdDevglobal_updateRelu4[axis_0 * 1] = 1 / std::sqrt(sum / float(100) + 1e-05);
   }
   // Y = Scale o InvStdDev (X - Mean)
   for (size_t axis_0 = 0; axis_0 < 1; axis_0++){
      for (size_t axis_1 = 0; axis_1 < 100; axis_1++){
         tensor_global_updateRelu4output[axis_0 * 100 + axis_1 * 1] = tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normscale0[axis_1 * 1] * tensor_InvStdDevglobal_updateRelu4[axis_0 * 1] * (tensor_global_updateRelu4[axis_0 * 100 + axis_1 * 1] - tensor_Meanglobal_updateRelu4[axis_0 * 1]);
      }
   }
   // Add the bias to Y
   int op_9_n = 100;
   float op_9_alpha = 1.;
   int op_9_inc = 1;
   BLAS::saxpy_(&op_9_n, &op_9_alpha, tensor_EncodeProcessDecodeMLPGraphNetworkgraph_networkglobal_blocklayer_normoffset0, &op_9_inc, tensor_global_updateRelu4output, &op_9_inc);
   using TMVA::Experimental::SOFIE::UTILITY::FillOutput;

   FillOutput(tensor_global_updateRelu4output, output_tensor_global_updateRelu4output, 100);
}



/// Convenience wrapper around doInfer() that returns the result by value.
///
/// @param tensor_edge    Forwarded to doInfer().
/// @param tensor_node    Forwarded to doInfer().
/// @param tensor_global  Forwarded to doInfer().
/// @return The vector filled by doInfer().
std::vector<float> infer(float const* tensor_edge,float const* tensor_node,float const* tensor_global){
   std::vector<float > output_tensor_global_updateRelu4output;
   doInfer(tensor_edge,tensor_node,tensor_global, output_tensor_global_updateRelu4output );
   // Return the local by name: `return {vec};` would copy-construct the result
   // from an lvalue, whereas this form allows NRVO / an implicit move.
   return output_tensor_global_updateRelu4output;
}
};
}

//--------- GNN_Aggregate_Function---Aggregate_by_Sum
/// Element-wise sum of several feature vectors.
///
/// @param num_features  Number of floats read from every input pointer; also
///                      the length of the returned vector.
/// @param inputs        Pointers to the vectors to add up. Each one must be
///                      readable for `num_features` floats.
/// @return A vector of `num_features` sums; all zeros when `inputs` is empty.
///
/// Declared `inline` because this definition lives in a header: without it,
/// including the header from more than one translation unit produces
/// duplicate definitions at link time.
inline std::vector<float> Aggregate_by_Sum(const int& num_features, const std::vector<float*>& inputs){
	std::vector<float> result(num_features,0);
	for (const float * features : inputs) {
		// Inputs are accumulated in order, so the rounding matches a plain
		// left-to-right sum. A lambda avoids depending on <functional>.
		std::transform(result.begin(), result.end(), features, result.begin(),
		               [](float acc, float value) { return acc + value; });
	}
	return result;
}

/// Complete graph-network session: applies the edge, node and global update
/// blocks, in that order, to a graph and writes the results back in place.
///
/// The staging buffers below are preallocated for at most 300 edges and
/// 100 nodes; infer() rejects larger graphs.
struct Session {

// Instantiating session objects for graph components
Edge_Update::Session edge_update;
Node_Update::Session node_update;
Global_Update::Session global_update;

// Output of the edge block, 100 values per edge
std::vector<float> fEdgeUpdates = std::vector<float>(300*100);


// Output of the node block, 100 values per node
std::vector<float> fNodeUpdates = std::vector<float>(100*100);

// input vectors for edge update
std::vector<float> fEdgeInputs = std::vector<float>(300*200);
std::vector<float> fRecNodeInputs = std::vector<float>(300*200);
std::vector<float> fSndNodeInputs = std::vector<float>(300*200);
std::vector<float> fGlobInputs = std::vector<float>(300*200);


// input vectors for node update
std::vector<float> fNodeInputs = std::vector<float>(100*200);
std::vector<float> fNodeEdgeAggregate = std::vector<float>(100*200, 0);
std::vector<float> fNodeAggregateTemp;

/// Run one full update pass over `input_graph`, modifying it in place.
///
/// On return edge_data is {n_edges, 100}, node_data is {n_nodes, 100} and
/// global_data is {100}.
///
/// @param input_graph  Graph to update. Edge, node and global rows are read
///                     with a stride of 200 floats; `edge_index` is read as
///                     n_edges receiver indices followed by n_edges sender
///                     indices.
/// @throws std::runtime_error if the graph has more than 300 edges or more
///         than 100 nodes.
///
/// NOTE(review): receiver/sender indices are used unchecked to address
/// node_data; callers are assumed to pass indices below the node count.
void infer(TMVA::Experimental::SOFIE::GNN_Data& input_graph){

// --- Edge Update ---
size_t n_edges = input_graph.edge_data.GetShape()[0];
if (n_edges > 300)
   throw std::runtime_error("Number of input edges larger than 300" );

// The node staging buffers hold 100 rows. Check the node count up front, like
// the edge count, so an oversized graph is rejected before anything is modified
// instead of overrunning fNodeInputs / fNodeEdgeAggregate later on.
size_t n_nodes = input_graph.node_data.GetShape()[0];
if (n_nodes > 100)
   throw std::runtime_error("Number of input nodes larger than 100" );

auto receivers = input_graph.edge_index.GetData();
auto senders = input_graph.edge_index.GetData() + n_edges;
for (size_t k = 0; k < n_edges; k++) { 
   std::copy(input_graph.edge_data.GetData() + k * 200, input_graph.edge_data.GetData() + (k + 1) * 200, fEdgeInputs.begin() + k * 200);
   std::copy(input_graph.node_data.GetData() + receivers[k] * 200, input_graph.node_data.GetData() + (receivers[k] + 1) * 200, fRecNodeInputs.begin() + k * 200);
   std::copy(input_graph.node_data.GetData() + senders[k] * 200, input_graph.node_data.GetData() + (senders[k] + 1) * 200, fSndNodeInputs.begin() + k * 200);
   std::copy(input_graph.global_data.GetData(), input_graph.global_data.GetData() + 200, fGlobInputs.begin() + k * 200);
}
fEdgeUpdates = edge_update.infer(n_edges,fEdgeInputs.data(), fRecNodeInputs.data(), fSndNodeInputs.data(), fGlobInputs.data());

//  resize edge graph data since output feature size is not equal to input size
input_graph.edge_data = input_graph.edge_data.Resize({n_edges, 100});

for (size_t k = 0; k < n_edges; k++) { 
   std::copy(fEdgeUpdates.begin()+ k * 100, fEdgeUpdates.begin()+ (k+1) * 100,input_graph.edge_data.GetData() + k * 100);
}



// --- Node Update ---
for (size_t k = 0; k < n_nodes; k++) { 
   std::copy(input_graph.node_data.GetData() + k * 200, input_graph.node_data.GetData() + (k + 1) * 200, fNodeInputs.begin() + k * 200);
}

std::fill(fNodeEdgeAggregate.begin(), fNodeEdgeAggregate.end(), 0.);

// make sure the global feature rows cover every node
if (n_nodes > n_edges) {
   // Grow only: shrinking here would leave fewer than the 300 rows that a
   // later call with more edges writes into.
   if (fGlobInputs.size() < n_nodes * 200)
      fGlobInputs.resize( n_nodes * 200);
   // Fill the missing rows straight from the graph (global_data is still
   // unmodified at this point). Copying from row 0 would propagate stale
   // values when the graph has no edges, because row 0 is then never written.
   for (size_t k = n_edges; k < n_nodes; k++)
      std::copy(input_graph.global_data.GetData(), input_graph.global_data.GetData() + 200, fGlobInputs.begin() + k * 200);
}

// aggregate edges going to a node
for (size_t j = 0; j < n_nodes; j++) {
   std::vector<float *> edgesData; edgesData.reserve( int(n_edges/n_nodes) +1);
   for (size_t k = 0; k < n_edges; k++) {
      if (receivers[k] == j) 
         edgesData.emplace_back(input_graph.edge_data.GetData() + k * 100);
   }
   fNodeAggregateTemp = Aggregate_by_Sum(100,edgesData);
   std::copy(fNodeAggregateTemp.begin(), fNodeAggregateTemp.end(), fNodeEdgeAggregate.begin() + 100 * j);
}

fNodeUpdates = node_update.infer(n_nodes,fNodeEdgeAggregate.data(),fNodeInputs.data(),fGlobInputs.data());

//  resize node graph data since output feature size is not equal to input size
input_graph.node_data = input_graph.node_data.Resize({n_nodes, 100});

for (size_t k = 0; k < n_nodes; k++) { 
   std::copy(fNodeUpdates.begin()+ k * 100, fNodeUpdates.begin() + (k+1) * 100,input_graph.node_data.GetData() + k * 100);
}

std::vector<float *> allEdgesData; allEdgesData.reserve(n_edges);
for (size_t k = 0; k < n_edges; k++) {
   allEdgesData.emplace_back(input_graph.edge_data.GetData() + k * 100);
}
std::vector<float *> allNodesData; allNodesData.reserve(n_nodes);
for (size_t k = 0; k < n_nodes; k++) {
   allNodesData.emplace_back(input_graph.node_data.GetData() + k * 100);
}

// --- Global Update ---
// The global block consumes the sums of all updated edge rows and all updated node rows.
std::vector<float> Edge_Global_Aggregate = Aggregate_by_Sum(100,allEdgesData);
std::vector<float> Node_Global_Aggregate = Aggregate_by_Sum(100,allNodesData);
std::vector<float> Global_Data = global_update.infer(Edge_Global_Aggregate.data(),Node_Global_Aggregate.data(),input_graph.global_data.GetData());
//  resize global graph data since output feature size is not equal to input size
input_graph.global_data = input_graph.global_data.Resize({100});

std::copy(Global_Data.begin(), Global_Data.end(), input_graph.global_data.GetData());
}
};
} //TMVA_SOFIE_core

#endif  // ROOT_TMVA_SOFIE_CORE
