From 595cbacdf1c4fe9a75f55d9e5b78e49765b28c86 Mon Sep 17 00:00:00 2001 From: Joan Fons Date: Mon, 13 Sep 2021 15:05:21 +0200 Subject: [PATCH] Upgrade Embree and enable ray packets Minor patch upgrade. Enabling ray packets results in faster processing of ray streams (i.e. occlusion culling buffer updates) at the cost of slightly larger binary sizes. --- modules/raycast/SCsub | 3 + modules/raycast/godot_update_embree.py | 12 +- .../embree/include/embree3/rtcore_config.h | 6 +- .../kernels/bvh/bvh_intersector_hybrid.cpp | 917 ++++++++++++++++++ .../bvh/bvh_intersector_hybrid4_bvh4.cpp | 59 ++ .../kernels/bvh/bvh_intersector_stream.cpp | 528 ++++++++++ .../bvh/bvh_intersector_stream_bvh4.cpp | 36 + .../bvh/bvh_intersector_stream_filters.cpp | 657 +++++++++++++ thirdparty/embree/kernels/config.h | 2 +- thirdparty/embree/kernels/hash.h | 2 +- 10 files changed, 2216 insertions(+), 6 deletions(-) create mode 100644 thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp create mode 100644 thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp create mode 100644 thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp create mode 100644 thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp create mode 100644 thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub index 6e7b3e7b8d..1fdc8fe1b3 100644 --- a/modules/raycast/SCsub +++ b/modules/raycast/SCsub @@ -55,6 +55,9 @@ if env["builtin_embree"]: "kernels/bvh/bvh_builder_sah_mb.cpp", "kernels/bvh/bvh_builder_twolevel.cpp", "kernels/bvh/bvh_intersector1_bvh4.cpp", + "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_filters.cpp", ] thirdparty_sources = [thirdparty_dir + file for file in embree_src] diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py index 31a25a318f..e31d88b741 100644 --- a/modules/raycast/godot_update_embree.py +++ b/modules/raycast/godot_update_embree.py @@ -61,6 +61,11 @@ cpp_files = [ "kernels/bvh/bvh_builder_twolevel.cpp", "kernels/bvh/bvh_intersector1.cpp", "kernels/bvh/bvh_intersector1_bvh4.cpp", + "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_filters.cpp", + "kernels/bvh/bvh_intersector_hybrid.cpp", + "kernels/bvh/bvh_intersector_stream.cpp", ] os.chdir("../../thirdparty") @@ -117,7 +122,7 @@ with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file: /* #undef EMBREE_GEOMETRY_INSTANCE */ /* #undef EMBREE_GEOMETRY_GRID */ /* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ +#define EMBREE_RAY_PACKETS /* #undef EMBREE_COMPACT_POLYS */ #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 @@ -249,3 +254,8 @@ with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as con os.chdir("..") shutil.rmtree("embree-tmp") + +subprocess.run(["git", "restore", "embree/patches"]) + +for patch in os.listdir("embree/patches"): + subprocess.run(["git", "apply", "embree/patches/" + patch]) diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h index 3a9819c9f1..62b7b6f4dc 100644 --- a/thirdparty/embree/include/embree3/rtcore_config.h +++ b/thirdparty/embree/include/embree3/rtcore_config.h @@ -6,9 +6,9 @@ #define RTC_VERSION_MAJOR 3 #define RTC_VERSION_MINOR 13 -#define RTC_VERSION_PATCH 0 -#define RTC_VERSION 31300 -#define RTC_VERSION_STRING "3.13.0" +#define RTC_VERSION_PATCH 1 +#define RTC_VERSION 31301 +#define RTC_VERSION_STRING "3.13.1" #define RTC_MAX_INSTANCE_LEVEL_COUNT 1 diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp new file mode 100644 index 0000000000..6e9a5a538e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp @@ -0,0 +1,917 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_hybrid.h" +#include "bvh_traverser1.h" +#include "node_intersector1.h" +#include "node_intersector_packet.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/curveNv_intersector.h" +#include "../geometry/curveNi_intersector.h" +#include "../geometry/curveNi_mb_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" +#include "../geometry/subgrid_intersector.h" +#include "../geometry/subgrid_mb_intersector.h" +#include "../geometry/curve_intersector_virtual.h" + +#define SWITCH_DURING_DOWN_TRAVERSAL 1 +#define FORCE_SINGLE_MODE 0 + +#define ENABLE_FAST_COHERENT_CODEPATHS 1 + +namespace embree +{ + namespace isa + { + template + void BVHNIntersectorKHybrid::intersect1(Accel::Intersectors* This, + const BVH* bvh, + NodeRef root, + size_t k, + Precalculations& pre, + RayHitK& ray, + const TravRayK& tray, + IntersectContext* context) + { + /* stack state */ + StackItemT stack[stackSizeSingle]; // stack of nodes + StackItemT* stackPtr = stack + 1; // current stack pointer + StackItemT* stackEnd = stack + stackSizeSingle; + stack[0].ptr = root; + stack[0].dist = neg_inf; + + /* load the ray into SIMD registers */ + TravRay tray1; + tray1.template init(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > ray.tfar[k])) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat tNear; + STAT3(normal.trav_nodes, 1, 1, 1); + bool nodeIntersected = BVHNNodeIntersector1::intersect(cur, tray1, ray.time()[k], tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + BVHNNodeTraverser1Hit::traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(This, pre, ray, k, context, prim, num, tray1, lazy_node); + + tray1.tfar = ray.tfar[k]; + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + } + + template + void BVHNIntersectorKHybrid::intersect(vint* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayHitK& __restrict__ ray, + IntersectContext* __restrict__ context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + +#if ENABLE_FAST_COHERENT_CODEPATHS == 1 + assert(context); + if (unlikely(types == BVH_AN1 && context->user && context->isCoherent())) + { + intersectCoherent(valid_i, This, ray, context); + return; + } +#endif + + /* filter out invalid rays */ + vbool valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(movemask(valid))], 1, 1, 1); +#endif + + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK tray(ray.org, ray.dir, single ? N : 0); + const vfloat org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat org_ray_tfar = max(ray.tfar , 0.0f); + + if (single) + { + tray.tnear = select(valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(valid, org_ray_tfar , vfloat(neg_inf)); + + for (; valid_bits!=0; ) { + const size_t i = bscf(valid_bits); + intersect1(This, bvh, bvh->root, i, pre, ray, tray, context); + } + return; + } + + /* determine switch threshold based on flags */ + const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent; + + vint octant = ray.octant(); + octant = select(valid, octant, vint(0xffffffff)); + + /* test whether we have ray with opposing direction signs in the packet */ + bool split = false; + { + size_t bits = valid_bits; + vbool vsplit( false ); + do + { + const size_t valid_index = bsf(bits); + vbool octant_valid = octant[valid_index] == octant; + bits &= ~(size_t)movemask(octant_valid); + vsplit |= vint(octant[valid_index]) == (octant^vint(0x7)); + } while (bits); + if (any(vsplit)) split = true; + } + + do + { + const size_t valid_index = bsf(valid_bits); + const vint diff_octant = vint(octant[valid_index])^octant; + const vint count_diff_octant = \ + ((diff_octant >> 2) & 1) + + ((diff_octant >> 1) & 1) + + ((diff_octant >> 0) & 1); + + vbool octant_valid = (count_diff_octant <= 1) & (octant != vint(0xffffffff)); + if (!single || !split) octant_valid = valid; // deactivate octant sorting in pure chunk mode, otherwise instance traversal performance goes down + + + octant = select(octant_valid,vint(0xffffffff),octant); + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar , vfloat(neg_inf)); + + /* allocate stack and push root node */ + vfloat stack_near[stackSizeChunk]; + NodeRef stack_node[stackSizeChunk]; + stack_node[0] = BVH::invalidNode; + stack_near[0] = inf; + stack_node[1] = bvh->root; + stack_near[1] = tray.tnear; + NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk; + NodeRef* __restrict__ sptr_node = stack_node + 2; + vfloat* __restrict__ sptr_near = stack_near + 2; + + while (1) pop: + { + /* pop next node from stack */ + assert(sptr_node > stack_node); + sptr_node--; + sptr_near--; + NodeRef cur = *sptr_node; + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* cull node if behind closest hit point */ + vfloat curDist = *sptr_near; + const vbool active = curDist < tray.tfar; + if (unlikely(none(active))) + continue; + + /* switch to single ray traversal */ +#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if FORCE_SINGLE_MODE == 0 + if (single) +#endif + { + size_t bits = movemask(active); +#if FORCE_SINGLE_MODE == 0 + if (unlikely(popcnt(bits) <= switchThreshold)) +#endif + { + for (; bits!=0; ) { + const size_t i = bscf(bits); + intersect1(This, bvh, cur, i, pre, ray, tray, context); + } + tray.tfar = min(tray.tfar, ray.tfar); + continue; + } + } +#endif + while (likely(!cur.isLeaf())) + { + /* process nodes */ + const vbool valid_node = tray.tfar > curDist; + STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const BaseNode* __restrict__ const node = nodeRef.baseNode(); + + /* set cur to invalid */ + cur = BVH::emptyNode; + curDist = pos_inf; + + size_t num_child_hits = 0; + + for (unsigned i = 0; i < N; i++) + { + const NodeRef child = node->children[i]; + if (unlikely(child == BVH::emptyNode)) break; + vfloat lnearP; + vbool lhit = valid_node; + BVHNNodeIntersectorK::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + /* if we hit the child we choose to continue with that child if it + is closer than the current next child, or we push it onto the stack */ + if (likely(any(lhit))) + { + assert(sptr_node < stackEnd); + assert(child != BVH::emptyNode); + const vfloat childDist = select(lhit, lnearP, inf); + /* push cur node onto stack and continue with hit child */ + if (any(childDist < curDist)) + { + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + *sptr_node = cur; sptr_node++; + *sptr_near = curDist; sptr_near++; + } + curDist = childDist; + cur = child; + } + + /* push hit child onto stack */ + else { + num_child_hits++; + *sptr_node = child; sptr_node++; + *sptr_near = childDist; sptr_near++; + } + } + } + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[num_child_hits], 1, 1, 1); +#endif + + if (unlikely(cur == BVH::emptyNode)) + goto pop; + + /* improved distance sorting for 3 or more hits */ + if (unlikely(num_child_hits >= 2)) + { + if (any(sptr_near[-2] < sptr_near[-1])) + { + std::swap(sptr_near[-2],sptr_near[-1]); + std::swap(sptr_node[-2],sptr_node[-1]); + } + if (unlikely(num_child_hits >= 3)) + { + if (any(sptr_near[-3] < sptr_near[-1])) + { + std::swap(sptr_near[-3],sptr_near[-1]); + std::swap(sptr_node[-3],sptr_node[-1]); + } + if (any(sptr_near[-3] < sptr_near[-2])) + { + std::swap(sptr_near[-3],sptr_near[-2]); + std::swap(sptr_node[-3],sptr_node[-2]); + } + } + } + +#if SWITCH_DURING_DOWN_TRAVERSAL == 1 + if (single) + { + // seems to be the best place for testing utilization + if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold)) + { + *sptr_node++ = cur; + *sptr_near++ = curDist; + goto pop; + } + } +#endif + } + + /* return if stack is empty */ + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* intersect leaf */ + assert(cur != BVH::emptyNode); + const vbool valid_leaf = tray.tfar > curDist; + STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node); + tray.tfar = select(valid_leaf, ray.tfar, tray.tfar); + + if (unlikely(lazy_node)) { + *sptr_node = lazy_node; sptr_node++; + *sptr_near = neg_inf; sptr_near++; + } + } + } while(valid_bits); + } + + + template + void BVHNIntersectorKHybrid::intersectCoherent(vint* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayHitK& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* filter out invalid rays */ + vbool valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK tray(ray.org, ray.dir, single ? N : 0); + const vfloat org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat org_ray_tfar = max(ray.tfar , 0.0f); + + vint octant = ray.octant(); + octant = select(valid, octant, vint(0xffffffff)); + + do + { + const size_t valid_index = bsf(valid_bits); + const vbool octant_valid = octant[valid_index] == octant; + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar , vfloat(neg_inf)); + + Frustum frustum; + frustum.template init(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N); + + StackItemT stack[stackSizeSingle]; // stack of nodes + StackItemT* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + while (1) pop: + { + /* pop next node from stack */ + if (unlikely(stackPtr == stack)) break; + + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* cull node if behind closest hit point */ + vfloat curDist = *(float*)&stackPtr->dist; + const vbool active = curDist < tray.tfar; + if (unlikely(none(active))) continue; + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const AABBNode* __restrict__ const node = nodeRef.getAABBNode(); + + vfloat fmin; + size_t m_frustum_node = intersectNodeFrustum(node, frustum, fmin); + + if (unlikely(!m_frustum_node)) goto pop; + cur = BVH::emptyNode; + curDist = pos_inf; + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1); +#endif + size_t num_child_hits = 0; + do { + const size_t i = bscf(m_frustum_node); + vfloat lnearP; + vbool lhit = false; // motion blur is not supported, so the initial value will be ignored + STAT3(normal.trav_nodes, 1, 1, 1); + BVHNNodeIntersectorK::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + if (likely(any(lhit))) + { + const vfloat childDist = fmin[i]; + const NodeRef child = node->child(i); + BVHN::prefetch(child); + if (any(childDist < curDist)) + { + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + stackPtr->ptr = cur; + *(float*)&stackPtr->dist = toScalar(curDist); + stackPtr++; + } + curDist = childDist; + cur = child; + } + /* push hit child onto stack */ + else { + num_child_hits++; + stackPtr->ptr = child; + *(float*)&stackPtr->dist = toScalar(childDist); + stackPtr++; + } + } + } while(m_frustum_node); + + if (unlikely(cur == BVH::emptyNode)) goto pop; + + /* improved distance sorting for 3 or more hits */ + if (unlikely(num_child_hits >= 2)) + { + if (stackPtr[-2].dist < stackPtr[-1].dist) + std::swap(stackPtr[-2],stackPtr[-1]); + if (unlikely(num_child_hits >= 3)) + { + if (stackPtr[-3].dist < stackPtr[-1].dist) + std::swap(stackPtr[-3],stackPtr[-1]); + if (stackPtr[-3].dist < stackPtr[-2].dist) + std::swap(stackPtr[-3],stackPtr[-2]); + } + } + } + + /* intersect leaf */ + assert(cur != BVH::invalidNode); + assert(cur != BVH::emptyNode); + const vbool valid_leaf = tray.tfar > curDist; + STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node); + + /* reduce max distance interval on successful intersection */ + if (likely(any((ray.tfar < tray.tfar) & valid_leaf))) + { + tray.tfar = select(valid_leaf, ray.tfar, tray.tfar); + frustum.template updateMaxDist(tray.tfar); + } + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + + } while(valid_bits); + } + + // =================================================================================================================================================================== + // =================================================================================================================================================================== + // =================================================================================================================================================================== + + template + bool BVHNIntersectorKHybrid::occluded1(Accel::Intersectors* This, + const BVH* bvh, + NodeRef root, + size_t k, + Precalculations& pre, + RayK& ray, + const TravRayK& tray, + IntersectContext* context) + { + /* stack state */ + NodeRef stack[stackSizeSingle]; // stack of nodes that still need to get traversed + NodeRef* stackPtr = stack+1; // current stack pointer + NodeRef* stackEnd = stack+stackSizeSingle; + stack[0] = root; + + /* load the ray into SIMD registers */ + TravRay tray1; + tray1.template init(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = (NodeRef)*stackPtr; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat tNear; + STAT3(shadow.trav_nodes, 1, 1, 1); + bool nodeIntersected = BVHNNodeIntersector1::intersect(cur, tray1, ray.time()[k], tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + BVHNNodeTraverser1Hit::traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves, 1, 1, 1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + + size_t lazy_node = 0; + if (PrimitiveIntersectorK::occluded(This, pre, ray, k, context, prim, num, tray1, lazy_node)) { + ray.tfar[k] = neg_inf; + return true; + } + + if (unlikely(lazy_node)) { + *stackPtr = lazy_node; + stackPtr++; + } + } + return false; + } + + template + void BVHNIntersectorKHybrid::occluded(vint* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayK& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + +#if ENABLE_FAST_COHERENT_CODEPATHS == 1 + assert(context); + if (unlikely(types == BVH_AN1 && context->user && context->isCoherent())) + { + occludedCoherent(valid_i, This, ray, context); + return; + } +#endif + + /* filter out already occluded and invalid rays */ + vbool valid = (*valid_i == -1) & (ray.tfar >= 0.0f); +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + const size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK tray(ray.org, ray.dir, single ? N : 0); + const vfloat org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat org_ray_tfar = max(ray.tfar , 0.0f); + + tray.tnear = select(valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(valid, org_ray_tfar , vfloat(neg_inf)); + + vbool terminated = !valid; + const vfloat inf = vfloat(pos_inf); + + /* determine switch threshold based on flags */ + const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent; + + /* allocate stack and push root node */ + vfloat stack_near[stackSizeChunk]; + NodeRef stack_node[stackSizeChunk]; + stack_node[0] = BVH::invalidNode; + stack_near[0] = inf; + stack_node[1] = bvh->root; + stack_near[1] = tray.tnear; + NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk; + NodeRef* __restrict__ sptr_node = stack_node + 2; + vfloat* __restrict__ sptr_near = stack_near + 2; + + while (1) pop: + { + /* pop next node from stack */ + assert(sptr_node > stack_node); + sptr_node--; + sptr_near--; + NodeRef cur = *sptr_node; + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* cull node if behind closest hit point */ + vfloat curDist = *sptr_near; + const vbool active = curDist < tray.tfar; + if (unlikely(none(active))) + continue; + + /* switch to single ray traversal */ +#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if FORCE_SINGLE_MODE == 0 + if (single) +#endif + { + size_t bits = movemask(active); +#if FORCE_SINGLE_MODE == 0 + if (unlikely(popcnt(bits) <= switchThreshold)) +#endif + { + for (; bits!=0; ) { + const size_t i = bscf(bits); + if (occluded1(This, bvh, cur, i, pre, ray, tray, context)) + set(terminated, i); + } + if (all(terminated)) break; + tray.tfar = select(terminated, vfloat(neg_inf), tray.tfar); + continue; + } + } +#endif + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + const vbool valid_node = tray.tfar > curDist; + STAT3(shadow.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const BaseNode* __restrict__ const node = nodeRef.baseNode(); + + /* set cur to invalid */ + cur = BVH::emptyNode; + curDist = pos_inf; + + for (unsigned i = 0; i < N; i++) + { + const NodeRef child = node->children[i]; + if (unlikely(child == BVH::emptyNode)) break; + vfloat lnearP; + vbool lhit = valid_node; + BVHNNodeIntersectorK::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + /* if we hit the child we push the previously hit node onto the stack, and continue with the currently hit child */ + if (likely(any(lhit))) + { + assert(sptr_node < stackEnd); + assert(child != BVH::emptyNode); + const vfloat childDist = select(lhit, lnearP, inf); + + /* push 'cur' node onto stack and continue with hit child */ + if (likely(cur != BVH::emptyNode)) { + *sptr_node = cur; sptr_node++; + *sptr_near = curDist; sptr_near++; + } + curDist = childDist; + cur = child; + } + } + if (unlikely(cur == BVH::emptyNode)) + goto pop; + +#if SWITCH_DURING_DOWN_TRAVERSAL == 1 + if (single) + { + // seems to be the best place for testing utilization + if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold)) + { + *sptr_node++ = cur; + *sptr_near++ = curDist; + goto pop; + } + } +#endif + } + + /* return if stack is empty */ + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + + /* intersect leaf */ + assert(cur != BVH::emptyNode); + const vbool valid_leaf = tray.tfar > curDist; + STAT3(shadow.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*) cur.leaf(items); + + size_t lazy_node = 0; + terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node); + if (all(terminated)) break; + tray.tfar = select(terminated, vfloat(neg_inf), tray.tfar); // ignore node intersections for terminated rays + + if (unlikely(lazy_node)) { + *sptr_node = lazy_node; sptr_node++; + *sptr_near = neg_inf; sptr_near++; + } + } + + vfloat::store(valid & terminated, &ray.tfar, neg_inf); + } + + + template + void BVHNIntersectorKHybrid::occludedCoherent(vint* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayK& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* filter out invalid rays */ + vbool valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid,ray); + + /* load ray */ + TravRayK tray(ray.org, ray.dir, single ? N : 0); + const vfloat org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat org_ray_tfar = max(ray.tfar , 0.0f); + + vbool terminated = !valid; + + vint octant = ray.octant(); + octant = select(valid, octant, vint(0xffffffff)); + + do + { + const size_t valid_index = bsf(valid_bits); + vbool octant_valid = octant[valid_index] == octant; + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar, vfloat(neg_inf)); + + Frustum frustum; + frustum.template init(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N); + + StackItemMaskT stack[stackSizeSingle]; // stack of nodes + StackItemMaskT* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].mask = movemask(octant_valid); + + while (1) pop: + { + /* pop next node from stack */ + if (unlikely(stackPtr == stack)) break; + + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* cull node of active rays have already been terminated */ + size_t m_active = (size_t)stackPtr->mask & (~(size_t)movemask(terminated)); + + if (unlikely(m_active == 0)) continue; + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const AABBNode* __restrict__ const node = nodeRef.getAABBNode(); + + vfloat fmin; + size_t m_frustum_node = intersectNodeFrustum(node, frustum, fmin); + + if (unlikely(!m_frustum_node)) goto pop; + cur = BVH::emptyNode; + m_active = 0; + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1); +#endif + size_t num_child_hits = 0; + do { + const size_t i = bscf(m_frustum_node); + vfloat lnearP; + vbool lhit = false; // motion blur is not supported, so the initial value will be ignored + STAT3(normal.trav_nodes, 1, 1, 1); + BVHNNodeIntersectorK::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + if (likely(any(lhit))) + { + const NodeRef child = node->child(i); + assert(child != BVH::emptyNode); + BVHN::prefetch(child); + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + stackPtr->ptr = cur; + stackPtr->mask = m_active; + stackPtr++; + } + cur = child; + m_active = movemask(lhit); + } + } while(m_frustum_node); + + if (unlikely(cur == BVH::emptyNode)) goto pop; + } + + /* intersect leaf */ + assert(cur != BVH::invalidNode); + assert(cur != BVH::emptyNode); +#if defined(__AVX__) + STAT3(normal.trav_leaves, 1, popcnt(m_active), K); +#endif + if (unlikely(!m_active)) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node); + octant_valid &= !terminated; + if (unlikely(none(octant_valid))) break; + tray.tfar = select(terminated, vfloat(neg_inf), tray.tfar); // ignore node intersections for terminated rays + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->mask = movemask(octant_valid); + stackPtr++; + } + } + } while(valid_bits); + + vfloat::store(valid & terminated, &ray.tfar, neg_inf); + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp new file mode 100644 index 0000000000..2137da6a25 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp @@ -0,0 +1,59 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_hybrid.cpp" + +namespace embree +{ + namespace isa + { + //////////////////////////////////////////////////////////////////////////////// + /// BVH4Intersector4 Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoellerNoFilter, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller <4 COMMA 4 COMMA false> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMvIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoellerNoFilter,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA false> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridPluecker,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersectorK<4> >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersectorK<4> >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersectorK<4> >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersectorK<4> >)); + + //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>)); + //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>)); + + IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4> >)); + IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4MB> >)); + + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorK<4>> >)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorKMB<4>> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >)); + //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersectorKPluecker <4 COMMA 4 COMMA true> >)); + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersectorKPluecker <4 COMMA 4 COMMA true> >)); + + } +} + diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp new file mode 100644 index 0000000000..4a74d8468d --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp @@ -0,0 +1,528 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" + +#include "../common/scene.h" +#include + +namespace embree +{ + namespace isa + { + __aligned(64) static const int shiftTable[32] = { + (int)1 << 0, (int)1 << 1, (int)1 << 2, (int)1 << 3, (int)1 << 4, (int)1 << 5, (int)1 << 6, (int)1 << 7, + (int)1 << 8, (int)1 << 9, (int)1 << 10, (int)1 << 11, (int)1 << 12, (int)1 << 13, (int)1 << 14, (int)1 << 15, + (int)1 << 16, (int)1 << 17, (int)1 << 18, (int)1 << 19, (int)1 << 20, (int)1 << 21, (int)1 << 22, (int)1 << 23, + (int)1 << 24, (int)1 << 25, (int)1 << 26, (int)1 << 27, (int)1 << 28, (int)1 << 29, (int)1 << 30, (int)1 << 31 + }; + + template + __forceinline void BVHNIntersectorStream::intersect(Accel::Intersectors* __restrict__ This, + RayHitN** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + /* we may traverse an empty BVH in case all geometry was invalid */ + BVH* __restrict__ bvh = (BVH*) This->ptr; + if (bvh->root == BVH::emptyNode) + return; + + // Only the coherent code path is implemented + assert(context->isCoherent()); + intersectCoherent(This, (RayHitK**)inputPackets, numOctantRays, context); + } + + template + template + __forceinline void BVHNIntersectorStream::intersectCoherent(Accel::Intersectors* __restrict__ This, + RayHitK** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(context->isCoherent()); + + BVH* __restrict__ bvh = (BVH*) This->ptr; + __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; // stack of nodes + assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE); + + __aligned(64) TravRayKStream packets[MAX_INTERNAL_STREAM_SIZE/K]; + __aligned(64) Frustum frustum; + + bool commonOctant = true; + const size_t m_active = initPacketsAndFrustum((RayK**)inputPackets, numOctantRays, packets, frustum, commonOctant); + if (unlikely(m_active == 0)) return; + + /* case of non-common origin */ + if (unlikely(!commonOctant)) + { + const size_t numPackets = (numOctantRays+K-1)/K; + for (size_t i = 0; i < numPackets; i++) + This->intersect(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context); + return; + } + + stack[0].mask = m_active; + stack[0].parent = 0; + stack[0].child = bvh->root; + + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + + StackItemMaskCoherent* stackPtr = stack + 1; + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + + STAT3(normal.trav_stack_pop,1,1,1); + stackPtr--; + /*! pop next node */ + NodeRef cur = NodeRef(stackPtr->child); + size_t m_trav_active = stackPtr->mask; + assert(m_trav_active); + NodeRef parent = stackPtr->parent; + + while (1) + { + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + parent = cur; + + __aligned(64) size_t maskK[N]; + for (size_t i = 0; i < N; i++) + maskK[i] = m_trav_active; + vfloat dist; + const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist); + if (unlikely(m_node_hit == 0)) goto pop; + + BVHNNodeTraverserStreamHitCoherent::traverseClosestHit(cur, m_trav_active, vbool((int)m_node_hit), dist, (size_t*)maskK, stackPtr); + assert(m_trav_active); + } + + /* non-root and leaf => full culling test for all rays */ + if (unlikely(parent != 0 && cur.isLeaf())) + { + const AABBNode* __restrict__ const node = parent.getAABBNode(); + size_t boxID = 0xff; + for (size_t i = 0; i < N; i++) + if (node->child(i) == cur) { boxID = i; break; } + assert(boxID < N); + assert(cur == node->child(boxID)); + m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf); + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; PrimitiveK* prim = (PrimitiveK*)cur.leaf(num); + + size_t bits = m_trav_active; + + /*! intersect stream of rays with all primitives */ + size_t lazy_node = 0; +#if defined(__SSE4_2__) + STAT_USER(1,(popcnt(bits)+K-1)/K*4); +#endif + while(bits) + { + size_t i = bsf(bits) / K; + const size_t m_isec = ((((size_t)1 << K)-1) << (i*K)); + assert(m_isec & bits); + bits &= ~m_isec; + + TravRayKStream& p = packets[i]; + vbool m_valid = p.tnear <= p.tfar; + PrimitiveIntersectorK::intersectK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node); + p.tfar = min(p.tfar, inputPackets[i]->tfar); + }; + + } // traversal + intersection + } + + template + __forceinline void BVHNIntersectorStream::occluded(Accel::Intersectors* __restrict__ This, + RayN** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + /* we may traverse an empty BVH in case all geometry was invalid */ + BVH* __restrict__ bvh = (BVH*) This->ptr; + if (bvh->root == BVH::emptyNode) + return; + + if (unlikely(context->isCoherent())) + occludedCoherent(This, (RayK**)inputPackets, numOctantRays, context); + else + occludedIncoherent(This, (RayK**)inputPackets, numOctantRays, context); + } + + template + template + __noinline void BVHNIntersectorStream::occludedCoherent(Accel::Intersectors* __restrict__ This, + RayK** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(context->isCoherent()); + + BVH* __restrict__ bvh = (BVH*)This->ptr; + __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; // stack of nodes + assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE); + + /* inactive rays should have been filtered out before */ + __aligned(64) TravRayKStream packets[MAX_INTERNAL_STREAM_SIZE/K]; + __aligned(64) Frustum frustum; + + bool commonOctant = true; + size_t m_active = initPacketsAndFrustum(inputPackets, numOctantRays, packets, frustum, commonOctant); + + /* valid rays */ + if (unlikely(m_active == 0)) return; + + /* case of non-common origin */ + if (unlikely(!commonOctant)) + { + const size_t numPackets = (numOctantRays+K-1)/K; + for (size_t i = 0; i < numPackets; i++) + This->occluded(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context); + return; + } + + stack[0].mask = m_active; + stack[0].parent = 0; + stack[0].child = bvh->root; + + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + + StackItemMaskCoherent* stackPtr = stack + 1; + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + + STAT3(normal.trav_stack_pop,1,1,1); + stackPtr--; + /*! pop next node */ + NodeRef cur = NodeRef(stackPtr->child); + size_t m_trav_active = stackPtr->mask & m_active; + if (unlikely(!m_trav_active)) continue; + assert(m_trav_active); + NodeRef parent = stackPtr->parent; + + while (1) + { + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + parent = cur; + + __aligned(64) size_t maskK[N]; + for (size_t i = 0; i < N; i++) + maskK[i] = m_trav_active; + + vfloat dist; + const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist); + if (unlikely(m_node_hit == 0)) goto pop; + + BVHNNodeTraverserStreamHitCoherent::traverseAnyHit(cur, m_trav_active, vbool((int)m_node_hit), (size_t*)maskK, stackPtr); + assert(m_trav_active); + } + + /* non-root and leaf => full culling test for all rays */ + if (unlikely(parent != 0 && cur.isLeaf())) + { + const AABBNode* __restrict__ const node = parent.getAABBNode(); + size_t boxID = 0xff; + for (size_t i = 0; i < N; i++) + if (node->child(i) == cur) { boxID = i; break; } + assert(boxID < N); + assert(cur == node->child(boxID)); + m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf); + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; PrimitiveK* prim = (PrimitiveK*)cur.leaf(num); + + size_t bits = m_trav_active & m_active; + /*! intersect stream of rays with all primitives */ + size_t lazy_node = 0; +#if defined(__SSE4_2__) + STAT_USER(1,(popcnt(bits)+K-1)/K*4); +#endif + while (bits) + { + size_t i = bsf(bits) / K; + const size_t m_isec = ((((size_t)1 << K)-1) << (i*K)); + assert(m_isec & bits); + bits &= ~m_isec; + TravRayKStream& p = packets[i]; + vbool m_valid = p.tnear <= p.tfar; + vbool m_hit = PrimitiveIntersectorK::occludedK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node); + inputPackets[i]->tfar = select(m_hit & m_valid, vfloat(neg_inf), inputPackets[i]->tfar); + m_active &= ~((size_t)movemask(m_hit) << (i*K)); + } + + } // traversal + intersection + } + + + template + template + __forceinline void BVHNIntersectorStream::occludedIncoherent(Accel::Intersectors* __restrict__ This, + RayK** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(!context->isCoherent()); + assert(types & BVH_FLAG_ALIGNED_NODE); + + __aligned(64) TravRayKStream packet[MAX_INTERNAL_STREAM_SIZE/K]; + + assert(numOctantRays <= 32); + const size_t numPackets = (numOctantRays+K-1)/K; + size_t m_active = 0; + for (size_t i = 0; i < numPackets; i++) + { + const vfloat tnear = inputPackets[i]->tnear(); + const vfloat tfar = inputPackets[i]->tfar; + vbool m_valid = (tnear <= tfar) & (tnear >= 0.0f); + m_active |= (size_t)movemask(m_valid) << (K*i); + const Vec3vf& org = inputPackets[i]->org; + const Vec3vf& dir = inputPackets[i]->dir; + vfloat packet_min_dist = max(tnear, 0.0f); + vfloat packet_max_dist = select(m_valid, tfar, neg_inf); + new (&packet[i]) TravRayKStream(org, dir, packet_min_dist, packet_max_dist); + } + + BVH* __restrict__ bvh = (BVH*)This->ptr; + + StackItemMaskT stack[stackSizeSingle]; // stack of nodes + StackItemMaskT* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].mask = m_active; + + size_t terminated = ~m_active; + + /* near/far offsets based on first ray */ + const NearFarPrecalculations nf(Vec3fa(packet[0].rdir.x[0], packet[0].rdir.y[0], packet[0].rdir.z[0]), N); + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + STAT3(shadow.trav_stack_pop,1,1,1); + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + size_t cur_mask = stackPtr->mask & (~terminated); + if (unlikely(cur_mask == 0)) continue; + + while (true) + { + /*! stop if we found a leaf node */ + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + + const vint vmask = traverseIncoherentStream(cur_mask, packet, node, nf, shiftTable); + + size_t mask = movemask(vmask != vint(zero)); + if (unlikely(mask == 0)) goto pop; + + __aligned(64) unsigned int child_mask[N]; + vint::storeu(child_mask, vmask); // this explicit store here causes much better code generation + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + assert(r < N); + cur = node->child(r); + BVHN::prefetch(cur,types); + cur_mask = child_mask[r]; + + /* simple in order sequence */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) continue; + stackPtr->ptr = cur; + stackPtr->mask = cur_mask; + stackPtr++; + + for (; ;) + { + r = bscf(mask); + assert(r < N); + + cur = node->child(r); + BVHN::prefetch(cur,types); + cur_mask = child_mask[r]; + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) break; + stackPtr->ptr = cur; + stackPtr->mask = cur_mask; + stackPtr++; + } + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves,1,1,1); + size_t num; PrimitiveK* prim = (PrimitiveK*)cur.leaf(num); + + size_t bits = cur_mask; + size_t lazy_node = 0; + + for (; bits != 0;) + { + const size_t rayID = bscf(bits); + + RayK &ray = *inputPackets[rayID / K]; + const size_t k = rayID % K; + if (PrimitiveIntersectorK::occluded(This, ray, k, context, prim, num, lazy_node)) + { + ray.tfar[k] = neg_inf; + terminated |= (size_t)1 << rayID; + } + + /* lazy node */ + if (unlikely(lazy_node)) + { + stackPtr->ptr = lazy_node; + stackPtr->mask = cur_mask; + stackPtr++; + } + } + + if (unlikely(terminated == (size_t)-1)) break; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// ArrayIntersectorKStream Definitions + //////////////////////////////////////////////////////////////////////////////// + + template + struct Triangle4IntersectorStreamMoeller { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Triangle4vIntersectorStreamPluecker { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Triangle4iIntersectorStreamMoeller { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Triangle4iIntersectorStreamPluecker { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Quad4vIntersectorStreamMoeller { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Quad4iIntersectorStreamMoeller { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Quad4vIntersectorStreamPluecker { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Quad4iIntersectorStreamPluecker { + template using Type = ArrayIntersectorKStream>; + }; + + struct ObjectIntersectorStream { + template using Type = ArrayIntersectorKStream>; + }; + + struct InstanceIntersectorStream { + template using Type = ArrayIntersectorKStream>; + }; + + // ===================================================================================================== + // ===================================================================================================== + // ===================================================================================================== + + template + void BVHNIntersectorStreamPacketFallback::intersect(Accel::Intersectors* __restrict__ This, + RayHitN** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + if (unlikely(context->isCoherent())) + intersectK(This, (RayHitK**)inputRays, numTotalRays, context); + else + intersectK(This, (RayHitK**)inputRays, numTotalRays, context); + } + + template + void BVHNIntersectorStreamPacketFallback::occluded(Accel::Intersectors* __restrict__ This, + RayN** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + if (unlikely(context->isCoherent())) + occludedK(This, (RayK**)inputRays, numTotalRays, context); + else + occludedK(This, (RayK**)inputRays, numTotalRays, context); + } + + template + template + __noinline void BVHNIntersectorStreamPacketFallback::intersectK(Accel::Intersectors* __restrict__ This, + RayHitK** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + /* fallback to packets */ + for (size_t i = 0; i < numTotalRays; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(numTotalRays)); + RayHitK& ray = *(inputRays[i / K]); + valid &= ray.tnear() <= ray.tfar; + This->intersect(valid, ray, context); + } + } + + template + template + __noinline void BVHNIntersectorStreamPacketFallback::occludedK(Accel::Intersectors* __restrict__ This, + RayK** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + /* fallback to packets */ + for (size_t i = 0; i < numTotalRays; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(numTotalRays)); + RayK& ray = *(inputRays[i / K]); + valid &= ray.tnear() <= ray.tfar; + This->occluded(valid, ray, context); + } + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp new file mode 100644 index 0000000000..c3e5f137b8 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp @@ -0,0 +1,36 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream.cpp" + +namespace embree +{ + namespace isa + { + + //////////////////////////////////////////////////////////////////////////////// + /// General BVHIntersectorStreamPacketFallback Intersector + //////////////////////////////////////////////////////////////////////////////// + + DEFINE_INTERSECTORN(BVH4IntersectorStreamPacketFallback,BVHNIntersectorStreamPacketFallback<4>); + + //////////////////////////////////////////////////////////////////////////////// + /// BVH4IntersectorStream Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4iIntersectorStreamMoeller>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4vIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Triangle4vIntersectorStreamPluecker>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Triangle4iIntersectorStreamPluecker>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoellerNoFilter, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller>)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoellerNoFilter,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4iIntersectorStreamMoeller>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Quad4vIntersectorStreamPluecker>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Quad4iIntersectorStreamPluecker>)); + + IF_ENABLED_USER(DEFINE_INTERSECTORN(BVH4VirtualIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA ObjectIntersectorStream>)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTORN(BVH4InstanceIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA InstanceIntersectorStream>)); + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp new file mode 100644 index 0000000000..b858eb163f --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp @@ -0,0 +1,657 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream_filters.h" +#include "bvh_intersector_stream.h" + +namespace embree +{ + namespace isa + { + template + __noinline void RayStreamFilter::filterAOS(Scene* scene, void* _rayN, size_t N, size_t stride, IntersectContext* context) + { + RayStreamAOS rayN(_rayN); + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from AOS to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const vint offset = vij * int(stride); + const size_t packetIndex = j / K; + + RayTypeK ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to AOS */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const vint offset = vij * int(stride); + const size_t packetIndex = j / K; + rayN.setHitByOffset(valid, offset, rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const Ray& ray = rayN.getRayByOffset(inputRayID * stride); + + /* skip invalid rays */ + if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = movemask(vfloat4(Vec3fa(ray.dir)) < 0.0f) & 0x7; + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayIDs = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayIDs[j] * int(stride); + RayK& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayIDs[j] * int(stride); + rayN.setHitByOffset(valid, offset, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(N)); + const vint offset = vi * int(stride); + + RayTypeK ray = rayN.getRayByOffset(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset(valid, offset, ray); + } + } + } + + template + __noinline void RayStreamFilter::filterAOP(Scene* scene, void** _rayN, size_t N, IntersectContext* context) + { + RayStreamAOP rayN(_rayN); + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from AOP to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const size_t packetIndex = j / K; + + RayTypeK ray = rayN.getRayByIndex(valid, vij); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to AOP */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const size_t packetIndex = j / K; + + rayN.setHitByIndex(valid, vij, rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const Ray& ray = rayN.getRayByIndex(inputRayID); + + /* skip invalid rays */ + if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = movemask(lt_mask(ray.dir,Vec3fa(0.0f))); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayIDs = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint index = *(vint*)&rayIDs[j]; + RayK& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByIndex(valid, index); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint index = *(vint*)&rayIDs[j]; + rayN.setHitByIndex(valid, index, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(N)); + + RayTypeK ray = rayN.getRayByIndex(valid, vi); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByIndex(valid, vi, ray); + } + } + } + + template + __noinline void RayStreamFilter::filterSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) + { + const size_t rayDataAlignment = (size_t)rayData % (K*sizeof(float)); + const size_t offsetAlignment = (size_t)stride % (K*sizeof(float)); + + /* fast path for packets with the correct width and data alignment */ + if (likely(N == K && + !rayDataAlignment && + !offsetAlignment)) + { + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + size_t packetIndex = 0; + for (size_t i = 0; i < numPackets; i++) + { + const size_t offset = i * stride; + RayTypeK& ray = *(RayTypeK*)(rayData + offset); + rayPtrs[packetIndex++] = &ray; + + /* trace as stream */ + if (unlikely(packetIndex == MAX_INTERNAL_STREAM_SIZE / K)) + { + const size_t size = packetIndex*K; + scene->intersectors.intersectN(rayPtrs, size, context); + packetIndex = 0; + } + } + + /* flush remaining packets */ + if (unlikely(packetIndex > 0)) + { + const size_t size = packetIndex*K; + scene->intersectors.intersectN(rayPtrs, size, context); + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + RayStreamSOA rayN(rayData, K); + + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N*numPackets;) + { + const size_t offset = (inputRayID / K) * stride + (inputRayID % K) * sizeof(float); + + /* skip invalid rays */ + if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays + #if defined(EMBREE_IGNORE_INVALID_RAYS) + __aligned(64) Ray ray = rayN.getRayByOffset(offset); + if (unlikely(!ray.valid())) { inputRayID++; continue; } + #endif + + const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayOffsets = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayOffsets[j]; + RayK& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayOffsets[j]; + rayN.setHitByOffset(valid, offset, rays[j/K]); + } + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < numPackets; i++) + { + const size_t offset = i * stride; + RayTypeK& ray = *(RayTypeK*)(rayData + offset); + const vbool valid = ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + } + } + } + else + { + /* fallback to packets for arbitrary packet size and alignment */ + for (size_t i = 0; i < numPackets; i++) + { + const size_t offsetN = i * stride; + RayStreamSOA rayN(rayData + offsetN, N); + + for (size_t j = 0; j < N; j += K) + { + const size_t offset = j * sizeof(float); + vbool valid = (vint(int(j)) + vint(step)) < vint(int(N)); + RayTypeK ray = rayN.getRayByOffset(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset(valid, offset, ray); + } + } + } + } + + template + __noinline void RayStreamFilter::filterSOP(Scene* scene, const void* _rayN, size_t N, IntersectContext* context) + { + RayStreamSOP& rayN = *(RayStreamSOP*)_rayN; + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from SOP to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const size_t offset = (i+j) * sizeof(float); + const size_t packetIndex = j / K; + + RayTypeK ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to SOP */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const size_t offset = (i+j) * sizeof(float); + const size_t packetIndex = j / K; + + rayN.setHitByOffset(valid, offset, rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const size_t offset = inputRayID * sizeof(float); + /* skip invalid rays */ + if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + __aligned(64) Ray ray = rayN.getRayByOffset(offset); + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayOffsets = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayOffsets[j]; + RayK& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayOffsets[j]; + rayN.setHitByOffset(valid, offset, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(N)); + const size_t offset = i * sizeof(float); + + RayTypeK ray = rayN.getRayByOffset(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset(valid, offset, ray); + } + } + } + + + void RayStreamFilter::intersectAOS(Scene* scene, RTCRayHit* _rayN, size_t N, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOS(scene, _rayN, N, stride, context); + else + filterAOS(scene, _rayN, N, stride, context); + } + + void RayStreamFilter::occludedAOS(Scene* scene, RTCRay* _rayN, size_t N, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOS(scene, _rayN, N, stride, context); + else + filterAOS(scene, _rayN, N, stride, context); + } + + void RayStreamFilter::intersectAOP(Scene* scene, RTCRayHit** _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOP(scene, (void**)_rayN, N, context); + else + filterAOP(scene, (void**)_rayN, N, context); + } + + void RayStreamFilter::occludedAOP(Scene* scene, RTCRay** _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOP(scene, (void**)_rayN, N, context); + else + filterAOP(scene, (void**)_rayN, N, context); + } + + void RayStreamFilter::intersectSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOA(scene, rayData, N, numPackets, stride, context); + else + filterSOA(scene, rayData, N, numPackets, stride, context); + } + + void RayStreamFilter::occludedSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOA(scene, rayData, N, numPackets, stride, context); + else + filterSOA(scene, rayData, N, numPackets, stride, context); + } + + void RayStreamFilter::intersectSOP(Scene* scene, const RTCRayHitNp* _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOP(scene, _rayN, N, context); + else + filterSOP(scene, _rayN, N, context); + } + + void RayStreamFilter::occludedSOP(Scene* scene, const RTCRayNp* _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOP(scene, _rayN, N, context); + else + filterSOP(scene, _rayN, N, context); + } + + + RayStreamFilterFuncs rayStreamFilterFuncs() { + return RayStreamFilterFuncs(RayStreamFilter::intersectAOS, RayStreamFilter::intersectAOP, RayStreamFilter::intersectSOA, RayStreamFilter::intersectSOP, + RayStreamFilter::occludedAOS, RayStreamFilter::occludedAOP, RayStreamFilter::occludedSOA, RayStreamFilter::occludedSOP); + } + }; +}; diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h index 80a8ab2a56..2bf7e93587 100644 --- a/thirdparty/embree/kernels/config.h +++ b/thirdparty/embree/kernels/config.h @@ -16,7 +16,7 @@ /* #undef EMBREE_GEOMETRY_INSTANCE */ /* #undef EMBREE_GEOMETRY_GRID */ /* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ +#define EMBREE_RAY_PACKETS /* #undef EMBREE_COMPACT_POLYS */ #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h index 10f315cee7..470e15f03e 100644 --- a/thirdparty/embree/kernels/hash.h +++ b/thirdparty/embree/kernels/hash.h @@ -2,4 +2,4 @@ // Copyright 2009-2020 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#define RTC_HASH "7c53133eb21424f7f0ae1e25bf357e358feaf6ab" +#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd"