godot/thirdparty/oidn/core/network.cpp

// ======================================================================== //
// Copyright 2009-2019 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //

#include "upsample.h"
#include "weights_reorder.h"
#include "network.h"
// -- GODOT start -- 
#include <cstring>
// -- GODOT end --

namespace oidn {

  // Creates an empty network for the given device.
  // The engine is created as engine::cpu with index 0, `sm` is constructed
  // from that engine, and the weight map member is initialized from the
  // passed-in map.
  template<int K>
  Network<K>::Network(const Ref<Device>& device,
                      const std::map<std::string, Tensor>& weightMap)
    : device(device)
    , eng(engine::cpu, 0)
    , sm(eng)
    , weightMap(weightMap)
  {}

  // Runs every node of the network in order.
  // If a progress callback is set, it is invoked once before the first node
  // and once after each node with (taskIndex + fraction of nodes done) /
  // taskCount. A callback returning false aborts the run by throwing
  // Exception(Error::Cancelled).
  template<int K>
  void Network<K>::execute(const Progress& progress, int taskIndex)
  {
    // taskFraction is the share of this task's nodes that already ran
    const auto reportProgress = [&](double taskFraction)
    {
      if (!progress.func)
        return;

      const double value = (double(taskIndex) + taskFraction) / double(progress.taskCount);
      const bool keepGoing = progress.func(progress.userPtr, value);
      if (!keepGoing)
        throw Exception(Error::Cancelled, "execution was cancelled");
    };

    reportProgress(0.0);

    for (size_t nodeIdx = 0; nodeIdx < nodes.size(); ++nodeIdx)
    {
      nodes[nodeIdx]->execute(sm);
      reportProgress(double(nodeIdx + 1) / double(nodes.size()));
    }
  }

  // Creates an f32 memory object with the given dimensions.
  // format_tag::any is resolved from the rank: 4-D -> BlockedFormat<K>::nChwKc,
  // 1-D -> format_tag::x (any other rank trips an assert).
  // When `data` is null the storage is allocated by the memory object and the
  // byte counters are updated; otherwise the memory object wraps `data`.
  template<int K>
  std::shared_ptr<memory> Network<K>::allocTensor(const memory::dims& dims,
                                                  memory::format_tag format,
                                                  void* data)
  {
    if (format == memory::format_tag::any)
    {
      switch (dims.size())
      {
      case 4:
        format = BlockedFormat<K>::nChwKc;
        break;
      case 1:
        format = memory::format_tag::x;
        break;
      default:
        assert(0);
        break;
      }
    }

    memory::desc tensorDesc(dims, memory::data_type::f32, format);

    // Wrapping existing storage does not count towards the statistics
    if (data != nullptr)
      return std::make_shared<memory>(tensorDesc, eng, data);

    const size_t numBytes = getTensorSize(dims) * sizeof(float);
    totalAllocBytes += numBytes;
    if (format == BlockedFormat<K>::nChwKc)
      activationAllocBytes += numBytes;

    return std::make_shared<memory>(tensorDesc, eng);
  }

  // Creates a memory object of shape `dims` that aliases the storage of `src`,
  // starting `srcOffset` floats past the beginning of its data handle.
  // format_tag::any is resolved from the rank exactly as in allocTensor().
  // No new storage is allocated; asserts check that `src` is f32 and large
  // enough to hold the view.
  template<int K>
  std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
                                                 const std::shared_ptr<memory>& src,
                                                 size_t srcOffset,
                                                 memory::format_tag format)
  {
    const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
    MAYBE_UNUSED(srcDesc);
    assert(srcDesc.data_type == memory::data_type::f32);
    assert(getTensorSize(src) >= srcOffset + getTensorSize(dims));

    if (format == memory::format_tag::any)
    {
      switch (dims.size())
      {
      case 4:
        format = BlockedFormat<K>::nChwKc;
        break;
      case 1:
        format = memory::format_tag::x;
        break;
      default:
        assert(0);
        break;
      }
    }

    memory::desc viewDesc(dims, memory::data_type::f32, format);

    // The offset is expressed in floats, not bytes
    float* const basePtr = (float*)src->get_data_handle();
    float* viewPtr = basePtr + srcOffset;
    return std::make_shared<memory>(viewDesc, eng, viewPtr);
  }

  // Convenience overload: the offset is given as a dims vector and converted
  // to an element offset with getTensorSize() before forwarding to the
  // offset-based castTensor().
  template<int K>
  std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
                                                 const std::shared_ptr<memory>& src,
                                                 const memory::dims& srcOffset)
  {
    const auto elemOffset = getTensorSize(srcOffset);
    return castTensor(dims, src, elemOffset);
  }

  // Fills the whole storage of an f32 tensor with zero bytes.
  template<int K>
  void Network<K>::zeroTensor(const std::shared_ptr<memory>& dst)
  {
    assert(getTensorType(dst) == memory::data_type::f32);

    void* const storage = dst->get_data_handle();
    const size_t numBytes = getTensorSize(dst)*sizeof(float);
    memset(storage, 0, numBytes);
  }

  // Returns the dimensions of the input reorder destination: index 1 (C) is
  // padded with getPadded<K>(), indices 2 and 3 (H, W) are rounded up to a
  // multiple of `alignment`. Index 0 is copied unchanged.
  template<int K>
  memory::dims Network<K>::getInputReorderDims(const memory::dims& srcDims, int alignment)
  {
    const memory::dim spatialAlign = memory::dim(alignment);

    memory::dims padded(srcDims);
    padded[1] = getPadded<K>(srcDims[1]);          // C
    padded[2] = roundUp(srcDims[2], spatialAlign); // H
    padded[3] = roundUp(srcDims[3], spatialAlign); // W
    return padded;
  }

  // Appends an input reorder node that reads the color image and the optional
  // albedo/normal images and writes into `userDst`, or into a freshly allocated
  // padded tensor when `userDst` is null.
  // The node type is selected from the dynamic type of `transferFunc`
  // (linear, gamma, log or PQX); any other type trips an assert.
  template<int K>
  std::shared_ptr<Node> Network<K>::addInputReorder(const Image& color,
                                                    const Image& albedo,
                                                    const Image& normal,
                                                    const std::shared_ptr<TransferFunction>& transferFunc,
                                                    int alignment,
                                                    const std::shared_ptr<memory>& userDst)
  {
    assert(color);

    // Three channels for color, plus three for each auxiliary image present
    int inputC = 3 + (albedo ? 3 : 0) + (normal ? 3 : 0);

    memory::dims srcDims = {1, inputC, color.height, color.width};
    memory::dims dstDims = getInputReorderDims(srcDims, alignment);

    // Use the caller's destination if there is one, otherwise allocate padded memory
    std::shared_ptr<memory> dst = userDst ? userDst : allocTensor(dstDims);

    // Pick the node specialization matching the transfer function
    std::shared_ptr<Node> node;

    if (auto linearTf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
    {
      node = std::make_shared<InputReorderNode<K, LinearTransferFunction>>(color, albedo, normal, dst, linearTf);
    }
    else if (auto gammaTf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
    {
      node = std::make_shared<InputReorderNode<K, GammaTransferFunction>>(color, albedo, normal, dst, gammaTf);
    }
    else if (auto logTf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
    {
      node = std::make_shared<InputReorderNode<K, LogTransferFunction>>(color, albedo, normal, dst, logTf);
    }
    else if (auto pqxTf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
    {
      node = std::make_shared<InputReorderNode<K, PQXTransferFunction>>(color, albedo, normal, dst, pqxTf);
    }
    else
    {
      assert(0);
    }

    nodes.push_back(node);
    return node;
  }

  // Appends an output reorder node that reads `src` and writes into the
  // `output` image. The node type is selected from the dynamic type of
  // `transferFunc` (linear, gamma, log or PQX); any other type trips an assert.
  // An assert also checks that `src` has exactly K channels.
  template<int K>
  std::shared_ptr<Node> Network<K>::addOutputReorder(const std::shared_ptr<memory>& src,
                                                     const std::shared_ptr<TransferFunction>& transferFunc,
                                                     const Image& output)
  {
    memory::dims inDims = getTensorDims(src);
    assert(inDims[1] == K);

    // Pick the node specialization matching the transfer function
    std::shared_ptr<Node> node;

    if (auto linearTf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
    {
      node = std::make_shared<OutputReorderNode<K, LinearTransferFunction>>(src, output, linearTf);
    }
    else if (auto gammaTf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
    {
      node = std::make_shared<OutputReorderNode<K, GammaTransferFunction>>(src, output, gammaTf);
    }
    else if (auto logTf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
    {
      node = std::make_shared<OutputReorderNode<K, LogTransferFunction>>(src, output, logTf);
    }
    else if (auto pqxTf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
    {
      node = std::make_shared<OutputReorderNode<K, PQXTransferFunction>>(src, output, pqxTf);
    }
    else
    {
      assert(0);
    }

    nodes.push_back(node);
    return node;
  }

  // Returns the output dimensions of the convolution called `name` applied to
  // a tensor of shape `srcDims`: identical to `srcDims` except that index 1 (C)
  // becomes the padded number of output channels, taken from the length of the
  // bias tensor stored under `name + "/b"`.
  // Throws Exception(Error::InvalidOperation) if the bias tensor is not 1-D,
  // mirroring the validation performed in addConv().
  template<int K>
  memory::dims Network<K>::getConvDims(const std::string& name, const memory::dims& srcDims)
  {
    // Bind by reference (as addConv() does) instead of copying the tensor
    // just to read a single dimension
    const auto& b = weightMap[name + "/b"];

    // Validate before indexing dims[0].
    // NOTE(review): this presumably also rejects a missing entry, assuming
    // weightMap is a std::map whose operator[] default-constructs a Tensor
    // with zero dimensions - confirm against Tensor's definition.
    if (b.ndims() != 1)
      throw Exception(Error::InvalidOperation, "invalid convolution biases");

    memory::dims dstDims = srcDims;
    dstDims[1] = getPadded<K>(b.dims[0]); // dstDims[C] = getPadded(OC)
    return dstDims;
  }

  // Appends a convolution node to the network and returns it.
  //
  // name    - prefix of the entries in weightMap; the weights are read from
  //           name + "/W" and the biases from name + "/b"
  // src     - input tensor
  // userDst - optional destination; if null a new tensor is allocated, if its
  //           dims differ from the computed output dims it is reinterpreted
  //           through castTensor()
  // relu    - when true, an eltwise ReLU post-op is attached to the convolution
  //
  // The convolution uses stride {1, 1} and padding {1, 1}. Input and output
  // channel counts of the weights are padded with getPadded<K>(), and the
  // biases are padded to the same output channel count.
  // Throws Exception(Error::InvalidOperation) if the weights are not 4-D with
  // format "oihw" or if the biases are not 1-D.
  template<int K>
  std::shared_ptr<Node> Network<K>::addConv(const std::string& name,
                                            const std::shared_ptr<memory>& src,
                                            const std::shared_ptr<memory>& userDst,
                                            bool relu)
  {
    const memory::dims strides = {1, 1};
    const memory::dims padding = {1, 1};

    memory::dims srcDims = getTensorDims(src);

    // Get the weights
    const auto& W = weightMap[name + "/W"];
    if (W.ndims() != 4 || W.format != "oihw")
      throw Exception(Error::InvalidOperation, "invalid convolution weights");
    memory::dims weightsDims = W.dims;
    // Wraps W.data directly (non-null data pointer => no allocation in allocTensor)
    auto userWeights = allocTensor(weightsDims, memory::format_tag::oihw, W.data);

    // Pad the weights
    memory::dims weightsPadDims = weightsDims;
    weightsPadDims[1] = getPadded<K>(weightsDims[1]); // IC
    weightsPadDims[0] = getPadded<K>(weightsDims[0]); // OC
    assert(srcDims[1] == weightsPadDims[1]); // srcDims[C] == weightsPadDims[IC]
    auto weightsPad = allocTensor(weightsPadDims, memory::format_tag::oihw);
    // Executed immediately on the stream; this temporary node is not added to `nodes`
    WeightsReorderNode<K>(userWeights, weightsPad).execute(sm);

    // Get the biases
    const auto& b = weightMap[name + "/b"];
    if (b.ndims() != 1)
      throw Exception(Error::InvalidOperation, "invalid convolution biases");
    memory::dims biasDims = b.dims;

    // Copy/pad the biases
    memory::dims biasPadDims = {getPadded<K>(biasDims[0])};
    auto bias = allocTensor(biasPadDims);
    // Zero the whole buffer only when padding exists; the copy below then
    // overwrites the first biasDims[0] entries
    if (biasDims[0] != biasPadDims[0])
      memset(bias->get_data_handle(), 0, biasPadDims[0]*sizeof(float));
    memcpy(bias->get_data_handle(), b.data, biasDims[0]*sizeof(float));

    // Allocate memory for destination
    memory::dims dstDims = srcDims;
    dstDims[1] = weightsPadDims[0]; // dstDims[C] = weightsPadDims[OC]

    // Three cases: no user destination (allocate), matching dims (use as is),
    // different dims (view of the user's storage with the required dims)
    std::shared_ptr<memory> dst;
    if (!userDst)
      dst = allocTensor(dstDims);
    else if (getTensorDims(userDst) == dstDims)
      dst = userDst;
    else
      dst = castTensor(dstDims, userDst);

    // Create a convolution
    // Let the convolution primitive choose the weights format
    auto weightsDesc = memory::desc({ weightsPadDims }, memory::data_type::f32, memory::format_tag::any);

    // Winograd is selected only for the K == 16 instantiation, direct otherwise
    auto convAlgo = (K == 16) ? convolution_winograd : convolution_direct;
    auto convDesc = convolution_forward::desc(
      prop_kind::forward_inference, convAlgo,
      src->get_desc(),
      weightsDesc,
      bias->get_desc(),
      dst->get_desc(),
      strides, padding, padding, padding_kind::zero);

    // Incorporate relu
    mkldnn::primitive_attr convAttr;
    if (relu)
    {
      mkldnn::post_ops ops;
      ops.append_eltwise(
        1.f,   // scale factor, not used
        algorithm::eltwise_relu,
        0.f,   // max with
        0.f    // unused
      );
      convAttr.set_post_ops(ops);
    }
    // User-managed scratchpad: finalize() allocates one buffer and hands it to every node
    convAttr.set_scratchpad_mode(scratchpad_mode_user);

    auto convPrimDesc = convolution_forward::primitive_desc(convDesc, convAttr, eng);

    // Reorder the weights to the final format, if necessary
    auto weights = weightsPad;
    if (convPrimDesc.weights_desc() != weightsPad->get_desc())
    {
      weights = std::make_shared<memory>(convPrimDesc.weights_desc(), eng);
      // Like the padding reorder above, this runs right away and is not stored in `nodes`
      ReorderNode(weightsPad, weights).execute(sm);
    }

    // Create convolution node and add it to the net
    auto node = std::make_shared<ConvNode>(convPrimDesc, src, weights, bias, dst);
    nodes.push_back(node);
    return node;
  }

  // Returns the output dimensions of a pooling node: `srcDims` with indices
  // 2 and 3 (H, W) halved using integer division.
  template<int K>
  memory::dims Network<K>::getPoolDims(const memory::dims& srcDims)
  {
    memory::dims pooled(srcDims);
    pooled[2] = pooled[2] / 2; // H/2
    pooled[3] = pooled[3] / 2; // W/2
    return pooled;
  }

  // Appends a max pooling node (kernel {2, 2}, stride {2, 2}, padding {0, 0})
  // to the network and returns it.
  // The destination is `userDst` when its dims match the pooled dims, a
  // castTensor() view of it when they differ, or a newly allocated tensor when
  // `userDst` is null.
  template<int K>
  std::shared_ptr<Node> Network<K>::addPool(const std::shared_ptr<memory>& src,
                                            const std::shared_ptr<memory>& userDst)
  {
    const memory::dims kernel  = {2, 2};
    const memory::dims strides = {2, 2};
    const memory::dims padding = {0, 0};

    memory::dims inDims  = getTensorDims(src);
    memory::dims outDims = getPoolDims(inDims);

    // Resolve the destination tensor
    std::shared_ptr<memory> dst;
    if (userDst)
    {
      if (getTensorDims(userDst) == outDims)
        dst = userDst;
      else
        dst = castTensor(outDims, userDst);
    }
    else
    {
      dst = allocTensor(outDims);
    }

    auto poolDesc = pooling_forward::desc(
      prop_kind::forward_inference, pooling_max,
      src->get_desc(),
      dst->get_desc(),
      strides, kernel, padding, padding, padding_kind::zero);

    // The scratchpad is supplied by the network, see finalize()
    mkldnn::primitive_attr poolAttr;
    poolAttr.set_scratchpad_mode(scratchpad_mode_user);

    auto poolPrimDesc = pooling_forward::primitive_desc(poolDesc, poolAttr, eng);

    std::shared_ptr<PoolNode> poolNode = std::make_shared<PoolNode>(poolPrimDesc, src, dst);
    nodes.push_back(poolNode);
    return poolNode;
  }

  // Returns the output dimensions of an upsampling node: `srcDims` with
  // indices 2 and 3 (H, W) doubled.
  template<int K>
  memory::dims Network<K>::getUpsampleDims(const memory::dims& srcDims)
  {
    memory::dims upsampled(srcDims);
    upsampled[2] = upsampled[2] * 2; // H*2
    upsampled[3] = upsampled[3] * 2; // W*2
    return upsampled;
  }

  // Appends an upsampling node to the network and returns it.
  // The destination is `userDst` when its dims match the upsampled dims, a
  // castTensor() view of it when they differ, or a newly allocated tensor when
  // `userDst` is null.
  template<int K>
  std::shared_ptr<Node> Network<K>::addUpsample(const std::shared_ptr<memory>& src,
                                                const std::shared_ptr<memory>& userDst)
  {
    memory::dims inDims  = getTensorDims(src);
    memory::dims outDims = getUpsampleDims(inDims);

    // Resolve the destination tensor
    std::shared_ptr<memory> dst;
    if (userDst)
    {
      if (getTensorDims(userDst) == outDims)
        dst = userDst;
      else
        dst = castTensor(outDims, userDst);
    }
    else
    {
      dst = allocTensor(outDims);
    }

    std::shared_ptr<UpsampleNode<K>> upsampleNode = std::make_shared<UpsampleNode<K>>(src, dst);
    nodes.push_back(upsampleNode);
    return upsampleNode;
  }

  // Returns the dimensions of two tensors concatenated along index 1 (C).
  // Asserts check that the remaining dimensions (N, H, W) of both inputs agree.
  template<int K>
  memory::dims Network<K>::getConcatDims(const memory::dims& src1Dims, const memory::dims& src2Dims)
  {
    assert(src1Dims[0] == src2Dims[0]); // N
    assert(src1Dims[2] == src2Dims[2]); // H
    assert(src1Dims[3] == src2Dims[3]); // W

    memory::dims merged(src1Dims);
    merged[1] = merged[1] + src2Dims[1]; // C1 + C2
    return merged;
  }

  // Appends an autoexposure node built from the color image and the HDR
  // transfer function, and returns it.
  template<int K>
  std::shared_ptr<Node> Network<K>::addAutoexposure(const Image& color,
                                                    const std::shared_ptr<HDRTransferFunction>& transferFunc)
  {
    std::shared_ptr<AutoexposureNode> autoexposure =
      std::make_shared<AutoexposureNode>(color, transferFunc);

    nodes.push_back(autoexposure);
    return autoexposure;
  }

  // Completes the construction of the network:
  //  - allocates one u8 scratchpad as large as the biggest per-node request
  //    and assigns it to every node,
  //  - adds its size to the allocation counters,
  //  - clears the weight map,
  //  - prints memory statistics when the device reports verbosity level 2.
  template <int K>
  void Network<K>::finalize()
  {
    // The shared scratchpad must satisfy the largest request among the nodes
    size_t scratchpadSize = 0;
    for (size_t i = 0; i < nodes.size(); ++i)
      scratchpadSize = max(scratchpadSize, nodes[i]->getScratchpadSize());

    // One flat byte buffer
    memory::dims scratchpadDims = { memory::dim(scratchpadSize) };
    memory::desc scratchpadDesc(scratchpadDims, memory::data_type::u8, memory::format_tag::x);
    auto scratchpad = std::make_shared<memory>(scratchpadDesc, eng);

    totalAllocBytes += scratchpadSize;
    activationAllocBytes += scratchpadSize;

    // Every node shares the same buffer
    for (size_t i = 0; i < nodes.size(); ++i)
      nodes[i]->setScratchpad(scratchpad);

    // Free the weights
    weightMap.clear();

    // Print statistics
    if (!device->isVerbose(2))
      return;

    std::cout << "Activation bytes: " << activationAllocBytes << std::endl;
    std::cout << "Scratchpad bytes: " << scratchpadSize << std::endl;
    std::cout << "Total bytes     : " << totalAllocBytes << std::endl;
  }

  // Explicit instantiations: the member definitions above live in this
  // translation unit, so the two K values instantiated here (8 and 16) are the
  // ones this file emits code for.
  template class Network<8>;
  template class Network<16>;

} // namespace oidn
New lightmapper -Added LocalVector (needed it) -Added stb_rect_pack (It's pretty cool, we could probably use it for other stuff too) -Fixes and changes all around the place -Added library for 128 bits fixed point (required for Delaunay3D) 2020-05-01 14:34:23 +02:00			`// ======================================================================== //`
			`// Copyright 2009-2019 Intel Corporation //`
			`// //`
			`// Licensed under the Apache License, Version 2.0 (the "License"); //`
			`// you may not use this file except in compliance with the License. //`
			`// You may obtain a copy of the License at //`
			`// //`
			`// http://www.apache.org/licenses/LICENSE-2.0 //`
			`// //`
			`// Unless required by applicable law or agreed to in writing, software //`
			`// distributed under the License is distributed on an "AS IS" BASIS, //`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //`
			`// See the License for the specific language governing permissions and //`
			`// limitations under the License. //`
			`// ======================================================================== //`

			`#include "upsample.h"`
			`#include "weights_reorder.h"`
Upgrade OpenImageDenoise to v1.1.0 Upgrade OIDN to 1.1.0, the latest stable version that doesn't need the ISPC compiler. Documented the changes made during the removal of TBB and added a patch file for them. 2020-06-06 19:03:16 +02:00			`#include "network.h"`
			`// -- GODOT start --`
New lightmapper -Added LocalVector (needed it) -Added stb_rect_pack (It's pretty cool, we could probably use it for other stuff too) -Fixes and changes all around the place -Added library for 128 bits fixed point (required for Delaunay3D) 2020-05-01 14:34:23 +02:00			`#include <cstring>`
Upgrade OpenImageDenoise to v1.1.0 Upgrade OIDN to 1.1.0, the latest stable version that doesn't need the ISPC compiler. Documented the changes made during the removal of TBB and added a patch file for them. 2020-06-06 19:03:16 +02:00			`// -- GODOT end --`
New lightmapper -Added LocalVector (needed it) -Added stb_rect_pack (It's pretty cool, we could probably use it for other stuff too) -Fixes and changes all around the place -Added library for 128 bits fixed point (required for Delaunay3D) 2020-05-01 14:34:23 +02:00
			`namespace oidn {`

			`template<int K>`
			`Network<K>::Network(const Ref<Device>& device, const std::map<std::string, Tensor>& weightMap)`
			`: device(device),`
			`eng(engine::cpu, 0),`
			`sm(eng),`
			`weightMap(weightMap)`
			`{`
			`}`

			`template<int K>`
			`void Network<K>::execute(const Progress& progress, int taskIndex)`
			`{`
			`if (progress.func)`
			`{`
			`const double value = double(taskIndex) / double(progress.taskCount);`
			`if (!progress.func(progress.userPtr, value))`
			`throw Exception(Error::Cancelled, "execution was cancelled");`
			`}`

			`for (size_t i = 0; i < nodes.size(); ++i)`
			`{`
			`nodes[i]->execute(sm);`

			`if (progress.func)`
			`{`
			`const double value = (double(taskIndex) + double(i+1) / double(nodes.size())) / double(progress.taskCount);`
			`if (!progress.func(progress.userPtr, value))`
			`throw Exception(Error::Cancelled, "execution was cancelled");`
			`}`
			`}`
			`}`

			`template<int K>`
			`std::shared_ptr<memory> Network<K>::allocTensor(const memory::dims& dims,`
			`memory::format_tag format,`
			`void* data)`
			`{`
			`if (format == memory::format_tag::any)`
			`{`
			`if (dims.size() == 4)`
			`format = BlockedFormat<K>::nChwKc;`
			`else if (dims.size() == 1)`
			`format = memory::format_tag::x;`
			`else`
			`assert(0);`
			`}`
			`memory::desc desc(dims, memory::data_type::f32, format);`
			`if (data == nullptr)`
			`{`
			`const size_t bytes = getTensorSize(dims) * sizeof(float);`
			`if (format == BlockedFormat<K>::nChwKc)`
			`activationAllocBytes += bytes;`
			`totalAllocBytes += bytes;`

			`return std::make_shared<memory>(desc, eng);`
			`}`
			`else`
			`{`
			`return std::make_shared<memory>(desc, eng, data);`
			`}`
			`}`

			`template<int K>`
			`std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,`
			`const std::shared_ptr<memory>& src,`
			`size_t srcOffset,`
			`memory::format_tag format)`
			`{`
			`const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;`
			`MAYBE_UNUSED(srcDesc);`
			`assert(srcDesc.data_type == memory::data_type::f32);`
			`assert(getTensorSize(src) >= srcOffset + getTensorSize(dims));`

			`if (format == memory::format_tag::any)`
			`{`
			`if (dims.size() == 4)`
			`format = BlockedFormat<K>::nChwKc;`
			`else if (dims.size() == 1)`
			`format = memory::format_tag::x;`
			`else`
			`assert(0);`
			`}`
			`memory::desc desc(dims, memory::data_type::f32, format);`
			`float* srcPtr = (float*)src->get_data_handle() + srcOffset;`
			`return std::make_shared<memory>(desc, eng, srcPtr);`
			`}`

			`template<int K>`
			`std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,`
			`const std::shared_ptr<memory>& src,`
			`const memory::dims& srcOffset)`
			`{`
			`return castTensor(dims, src, getTensorSize(srcOffset));`
			`}`

			`template<int K>`
			`void Network<K>::zeroTensor(const std::shared_ptr<memory>& dst)`
			`{`
			`assert(getTensorType(dst) == memory::data_type::f32);`
			`memset(dst->get_data_handle(), 0, getTensorSize(dst)*sizeof(float));`
			`}`

			`template<int K>`
			`memory::dims Network<K>::getInputReorderDims(const memory::dims& srcDims, int alignment)`
			`{`
			`memory::dims dstDims = srcDims;`
			`dstDims[1] = getPadded<K>(srcDims[1]); // round up C`
			`dstDims[2] = roundUp(srcDims[2], memory::dim(alignment)); // round up H`
			`dstDims[3] = roundUp(srcDims[3], memory::dim(alignment)); // round up W`
			`return dstDims;`
			`}`

			`template<int K>`
			`std::shared_ptr<Node> Network<K>::addInputReorder(const Image& color,`
			`const Image& albedo,`
			`const Image& normal,`
			`const std::shared_ptr<TransferFunction>& transferFunc,`
			`int alignment,`
			`const std::shared_ptr<memory>& userDst)`
			`{`
			`assert(color);`
			`int inputC = 3;`
			`if (albedo) inputC += 3;`
			`if (normal) inputC += 3;`

			`memory::dims srcDims = {1, inputC, color.height, color.width};`
			`memory::dims dstDims = getInputReorderDims(srcDims, alignment);`

			`// Allocate padded memory`
			`auto dst = userDst;`
			`if (!dst)`
			`dst = allocTensor(dstDims);`

			`// Push node`
			`std::shared_ptr<Node> node;`

			`if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))`
			`node = std::make_shared<InputReorderNode<K, LinearTransferFunction>>(color, albedo, normal, dst, tf);`
			`else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))`
			`node = std::make_shared<InputReorderNode<K, GammaTransferFunction>>(color, albedo, normal, dst, tf);`
			`else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))`
			`node = std::make_shared<InputReorderNode<K, LogTransferFunction>>(color, albedo, normal, dst, tf);`
			`else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))`
			`node = std::make_shared<InputReorderNode<K, PQXTransferFunction>>(color, albedo, normal, dst, tf);`
			`else`
			`assert(0);`

			`nodes.push_back(node);`
			`return node;`
			`}`

			`template<int K>`
			`std::shared_ptr<Node> Network<K>::addOutputReorder(const std::shared_ptr<memory>& src,`
			`const std::shared_ptr<TransferFunction>& transferFunc,`
			`const Image& output)`
			`{`
			`memory::dims srcDims = getTensorDims(src);`
			`assert(srcDims[1] == K);`

			`// Push node`
			`std::shared_ptr<Node> node;`

			`if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))`
			`node = std::make_shared<OutputReorderNode<K, LinearTransferFunction>>(src, output, tf);`
			`else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))`
			`node = std::make_shared<OutputReorderNode<K, GammaTransferFunction>>(src, output, tf);`
			`else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))`
			`node = std::make_shared<OutputReorderNode<K, LogTransferFunction>>(src, output, tf);`
			`else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))`
			`node = std::make_shared<OutputReorderNode<K, PQXTransferFunction>>(src, output, tf);`
			`else`
			`assert(0);`

			`nodes.push_back(node);`
			`return node;`
			`}`

			`template<int K>`
			`memory::dims Network<K>::getConvDims(const std::string& name, const memory::dims& srcDims)`
			`{`
			`auto b = weightMap[name + "/b"];`
			`memory::dims dstDims = srcDims;`
			`dstDims[1] = getPadded<K>(b.dims[0]); // dstDims[C] = getPadded(OC)`
			`return dstDims;`
			`}`

			`template<int K>`
			`std::shared_ptr<Node> Network<K>::addConv(const std::string& name,`
			`const std::shared_ptr<memory>& src,`
			`const std::shared_ptr<memory>& userDst,`
			`bool relu)`
			`{`
			`const memory::dims strides = {1, 1};`
			`const memory::dims padding = {1, 1};`

			`memory::dims srcDims = getTensorDims(src);`

			`// Get the weights`
			`const auto& W = weightMap[name + "/W"];`
			`if (W.ndims() != 4 || W.format != "oihw")`
			`throw Exception(Error::InvalidOperation, "invalid convolution weights");`
			`memory::dims weightsDims = W.dims;`
			`auto userWeights = allocTensor(weightsDims, memory::format_tag::oihw, W.data);`

			`// Pad the weights`
			`memory::dims weightsPadDims = weightsDims;`
			`weightsPadDims[1] = getPadded<K>(weightsDims[1]); // IC`
			`weightsPadDims[0] = getPadded<K>(weightsDims[0]); // OC`
			`assert(srcDims[1] == weightsPadDims[1]); // srcDims[C] == weightsPadDims[IC]`
			`auto weightsPad = allocTensor(weightsPadDims, memory::format_tag::oihw);`
			`WeightsReorderNode<K>(userWeights, weightsPad).execute(sm);`

			`// Get the biases`
			`const auto& b = weightMap[name + "/b"];`
			`if (b.ndims() != 1)`
			`throw Exception(Error::InvalidOperation, "invalid convolution biases");`
			`memory::dims biasDims = b.dims;`

			`// Copy/pad the biases`
			`memory::dims biasPadDims = {getPadded<K>(biasDims[0])};`
			`auto bias = allocTensor(biasPadDims);`
			`if (biasDims[0] != biasPadDims[0])`
			`memset(bias->get_data_handle(), 0, biasPadDims[0]*sizeof(float));`
			`memcpy(bias->get_data_handle(), b.data, biasDims[0]*sizeof(float));`

			`// Allocate memory for destination`
			`memory::dims dstDims = srcDims;`
			`dstDims[1] = weightsPadDims[0]; // dstDims[C] = weightsPadDims[OC]`

			`std::shared_ptr<memory> dst;`
			`if (!userDst)`
			`dst = allocTensor(dstDims);`
			`else if (getTensorDims(userDst) == dstDims)`
			`dst = userDst;`
			`else`
			`dst = castTensor(dstDims, userDst);`

			`// Create a convolution`
			`// Let the convolution primitive choose the weights format`
			`auto weightsDesc = memory::desc({ weightsPadDims }, memory::data_type::f32, memory::format_tag::any);`

			`auto convAlgo = (K == 16) ? convolution_winograd : convolution_direct;`
			`auto convDesc = convolution_forward::desc(`
			`prop_kind::forward_inference, convAlgo,`
			`src->get_desc(),`
			`weightsDesc,`
			`bias->get_desc(),`
			`dst->get_desc(),`
			`strides, padding, padding, padding_kind::zero);`

			`// Incorporate relu`
			`mkldnn::primitive_attr convAttr;`
			`if (relu)`
			`{`
			`mkldnn::post_ops ops;`
			`ops.append_eltwise(`
			`1.f, // scale factor, not used`
			`algorithm::eltwise_relu,`
			`0.f, // max with`
			`0.f // unused`
			`);`
			`convAttr.set_post_ops(ops);`
			`}`
			`convAttr.set_scratchpad_mode(scratchpad_mode_user);`

			`auto convPrimDesc = convolution_forward::primitive_desc(convDesc, convAttr, eng);`

			`// Reorder the weights to the final format, if necessary`
			`auto weights = weightsPad;`
			`if (convPrimDesc.weights_desc() != weightsPad->get_desc())`
			`{`
			`weights = std::make_shared<memory>(convPrimDesc.weights_desc(), eng);`
			`ReorderNode(weightsPad, weights).execute(sm);`
			`}`

			`// Create convolution node and add it to the net`
			`auto node = std::make_shared<ConvNode>(convPrimDesc, src, weights, bias, dst);`
			`nodes.push_back(node);`
			`return node;`
			`}`

			`template<int K>`
			`memory::dims Network<K>::getPoolDims(const memory::dims& srcDims)`
			`{`
			`memory::dims dstDims = srcDims;`
			`dstDims[2] /= 2; // H/2`
			`dstDims[3] /= 2; // W/2`
			`return dstDims;`
			`}`

			`template<int K>`
			`std::shared_ptr<Node> Network<K>::addPool(const std::shared_ptr<memory>& src,`
			`const std::shared_ptr<memory>& userDst)`
			`{`
			`const memory::dims kernel = {2, 2};`
			`const memory::dims strides = {2, 2};`
			`const memory::dims padding = {0, 0};`

			`memory::dims srcDims = getTensorDims(src);`
			`memory::dims dstDims = getPoolDims(srcDims);`

			`std::shared_ptr<memory> dst;`
			`if (!userDst)`
			`dst = allocTensor(dstDims);`
			`else if (getTensorDims(userDst) == dstDims)`
			`dst = userDst;`
			`else`
			`dst = castTensor(dstDims, userDst);`

			`auto poolDesc = pooling_forward::desc(`
			`prop_kind::forward_inference, pooling_max,`
			`src->get_desc(),`
			`dst->get_desc(),`
			`strides, kernel, padding, padding, padding_kind::zero);`

			`mkldnn::primitive_attr poolAttr;`
			`poolAttr.set_scratchpad_mode(scratchpad_mode_user);`

			`auto poolPrimDesc = pooling_forward::primitive_desc(poolDesc, poolAttr, eng);`

			`auto node = std::make_shared<PoolNode>(poolPrimDesc, src, dst);`
			`nodes.push_back(node);`
			`return node;`
			`}`

			`template<int K>`
			`memory::dims Network<K>::getUpsampleDims(const memory::dims& srcDims)`
			`{`
			`memory::dims dstDims = srcDims;`
			`dstDims[2] *= 2; // H*2`
			`dstDims[3] *= 2; // W*2`
			`return dstDims;`
			`}`

			`template<int K>`
			`std::shared_ptr<Node> Network<K>::addUpsample(const std::shared_ptr<memory>& src,`
			`const std::shared_ptr<memory>& userDst)`
			`{`
			`memory::dims srcDims = getTensorDims(src);`
			`memory::dims dstDims = getUpsampleDims(srcDims);`

			`std::shared_ptr<memory> dst;`
			`if (!userDst)`
			`dst = allocTensor(dstDims);`
			`else if (getTensorDims(userDst) == dstDims)`
			`dst = userDst;`
			`else`
			`dst = castTensor(dstDims, userDst);`

			`// Create upsampling node and add it to net`
			`auto node = std::make_shared<UpsampleNode<K>>(src, dst);`
			`nodes.push_back(node);`
			`return node;`
			`}`

			`template<int K>`
			`memory::dims Network<K>::getConcatDims(const memory::dims& src1Dims, const memory::dims& src2Dims)`
			`{`
			`assert(src1Dims[0] == src2Dims[0]); // N`
			`assert(src1Dims[2] == src2Dims[2]); // H`
			`assert(src1Dims[3] == src2Dims[3]); // W`

			`memory::dims dstDims = src1Dims;`
			`dstDims[1] += src2Dims[1]; // C`
			`return dstDims;`
			`}`

			`template<int K>`
			`std::shared_ptr<Node> Network<K>::addAutoexposure(const Image& color,`
			`const std::shared_ptr<HDRTransferFunction>& transferFunc)`
			`{`
			`auto node = std::make_shared<AutoexposureNode>(color, transferFunc);`
			`nodes.push_back(node);`
			`return node;`
			`}`

			`template <int K>`
			`void Network<K>::finalize()`
			`{`
			`// Compute the size of the scratchpad`
			`size_t scratchpadSize = 0;`
			`for (const auto& node : nodes)`
			`scratchpadSize = max(scratchpadSize, node->getScratchpadSize());`

			`// Allocate the scratchpad`
			`memory::dims scratchpadDims = { memory::dim(scratchpadSize) };`
			`memory::desc scratchpadDesc(scratchpadDims, memory::data_type::u8, memory::format_tag::x);`
			`auto scratchpad = std::make_shared<memory>(scratchpadDesc, eng);`
			`activationAllocBytes += scratchpadSize;`
			`totalAllocBytes += scratchpadSize;`

			`// Set the scratchpad for the nodes`
			`for (auto& node : nodes)`
			`node->setScratchpad(scratchpad);`

			`// Free the weights`
			`weightMap.clear();`

			`// Print statistics`
			`if (device->isVerbose(2))`
			`{`
			`std::cout << "Activation bytes: " << activationAllocBytes << std::endl;`
			`std::cout << "Scratchpad bytes: " << scratchpadSize << std::endl;`
			`std::cout << "Total bytes : " << totalAllocBytes << std::endl;`
			`}`
			`}`

			`template class Network<8>;`
			`template class Network<16>;`

			`} // namespace oidn`