godot/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
JFonS 1cfed0d583
Switch to embree-aarch64
(cherry picked from commit 73e2ccd603)
2021-05-05 15:11:13 +02:00

216 lines
9.8 KiB
C++

// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "node_intersector.h"
namespace embree
{
namespace isa
{
//////////////////////////////////////////////////////////////////////////////////////
// Ray packet structure used in stream traversal
//////////////////////////////////////////////////////////////////////////////////////
template<int K, bool robust>
struct TravRayKStream;
/* Fast variant */
template<int K>
struct TravRayKStream<K, false>
{
__forceinline TravRayKStream() {}
__forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
{
init(ray_org, ray_dir);
tnear = ray_tnear;
tfar = ray_tfar;
}
__forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
{
rdir = rcp_safe(ray_dir);
#if defined(__aarch64__)
neg_org_rdir = -(ray_org * rdir);
#else
org_rdir = ray_org * rdir;
#endif
}
Vec3vf<K> rdir;
#if defined(__aarch64__)
Vec3vf<K> neg_org_rdir;
#else
Vec3vf<K> org_rdir;
#endif
vfloat<K> tnear;
vfloat<K> tfar;
};
template<int K>
using TravRayKStreamFast = TravRayKStream<K, false>;
/* Robust variant */
template<int K>
struct TravRayKStream<K, true>
{
__forceinline TravRayKStream() {}
__forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
{
init(ray_org, ray_dir);
tnear = ray_tnear;
tfar = ray_tfar;
}
__forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
{
rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir));
org = ray_org;
}
Vec3vf<K> rdir;
Vec3vf<K> org;
vfloat<K> tnear;
vfloat<K> tfar;
};
template<int K>
using TravRayKStreamRobust = TravRayKStream<K, true>;
//////////////////////////////////////////////////////////////////////////////////////
// Fast AABBNode intersection
//////////////////////////////////////////////////////////////////////////////////////
template<int N, int Nx, int K>
__forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf)
{
const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
#if defined (__aarch64__)
const vfloat<Nx> rminX = madd(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
const vfloat<Nx> rminY = madd(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
const vfloat<Nx> rminZ = madd(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
const vfloat<Nx> rmaxX = madd(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
const vfloat<Nx> rmaxY = madd(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
const vfloat<Nx> rmaxZ = madd(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
#else
const vfloat<Nx> rminX = msub(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
const vfloat<Nx> rminY = msub(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
const vfloat<Nx> rminZ = msub(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
const vfloat<Nx> rmaxX = msub(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
const vfloat<Nx> rmaxY = msub(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
const vfloat<Nx> rmaxZ = msub(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
#endif
const vfloat<Nx> rmin = maxi(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k]));
const vfloat<Nx> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k]));
const vbool<Nx> vmask_first_hit = rmin <= rmax;
return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
}
template<int N, int K>
__forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf)
{
char* ptr = (char*)&node->lower_x + i*sizeof(float);
const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
#if defined (__aarch64__)
const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
#else
const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
#endif
const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear);
const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
const vbool<K> vmask_first_hit = rmin <= rmax;
return movemask(vmask_first_hit);
}
//////////////////////////////////////////////////////////////////////////////////////
// Robust AABBNode intersection
//////////////////////////////////////////////////////////////////////////////////////
template<int N, int Nx, int K>
__forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf)
{
const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
const vfloat<Nx> rminX = (bminX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]);
const vfloat<Nx> rminY = (bminY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]);
const vfloat<Nx> rminZ = (bminZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]);
const vfloat<Nx> rmaxX = (bmaxX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]);
const vfloat<Nx> rmaxY = (bmaxY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]);
const vfloat<Nx> rmaxZ = (bmaxZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]);
const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
const vfloat<Nx> rmin = max(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k]));
const vfloat<Nx> rmax = round_up *min(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k]));
const vbool<Nx> vmask_first_hit = rmin <= rmax;
return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
}
template<int N, int K>
__forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf)
{
char *ptr = (char*)&node->lower_x + i*sizeof(float);
const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x;
const vfloat<K> rminY = (bminY - ray.org.y) * ray.rdir.y;
const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z;
const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x;
const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y;
const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z;
const float round_up = 1.0f+3.0f*float(ulp);
const vfloat<K> rmin = max(rminX, rminY, rminZ, vfloat<K>(ray.tnear));
const vfloat<K> rmax = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar));
const vbool<K> vmask_first_hit = rmin <= rmax;
return movemask(vmask_first_hit);
}
}
}