diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub index 6e7b3e7b8d..1fdc8fe1b3 100644 --- a/modules/raycast/SCsub +++ b/modules/raycast/SCsub @@ -55,6 +55,9 @@ if env["builtin_embree"]: "kernels/bvh/bvh_builder_sah_mb.cpp", "kernels/bvh/bvh_builder_twolevel.cpp", "kernels/bvh/bvh_intersector1_bvh4.cpp", + "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_filters.cpp", ] thirdparty_sources = [thirdparty_dir + file for file in embree_src] diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py index 31a25a318f..e31d88b741 100644 --- a/modules/raycast/godot_update_embree.py +++ b/modules/raycast/godot_update_embree.py @@ -61,6 +61,11 @@ cpp_files = [ "kernels/bvh/bvh_builder_twolevel.cpp", "kernels/bvh/bvh_intersector1.cpp", "kernels/bvh/bvh_intersector1_bvh4.cpp", + "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_bvh4.cpp", + "kernels/bvh/bvh_intersector_stream_filters.cpp", + "kernels/bvh/bvh_intersector_hybrid.cpp", + "kernels/bvh/bvh_intersector_stream.cpp", ] os.chdir("../../thirdparty") @@ -117,7 +122,7 @@ with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file: /* #undef EMBREE_GEOMETRY_INSTANCE */ /* #undef EMBREE_GEOMETRY_GRID */ /* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ +#define EMBREE_RAY_PACKETS /* #undef EMBREE_COMPACT_POLYS */ #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 @@ -249,3 +254,8 @@ with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as con os.chdir("..") shutil.rmtree("embree-tmp") + +subprocess.run(["git", "restore", "embree/patches"]) + +for patch in os.listdir("embree/patches"): + subprocess.run(["git", "apply", "embree/patches/" + patch]) diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h index 3a9819c9f1..62b7b6f4dc 100644 --- a/thirdparty/embree/include/embree3/rtcore_config.h +++ b/thirdparty/embree/include/embree3/rtcore_config.h @@ -6,9 +6,9 @@ #define RTC_VERSION_MAJOR 3 #define RTC_VERSION_MINOR 13 -#define RTC_VERSION_PATCH 0 -#define RTC_VERSION 31300 -#define RTC_VERSION_STRING "3.13.0" +#define RTC_VERSION_PATCH 1 +#define RTC_VERSION 31301 +#define RTC_VERSION_STRING "3.13.1" #define RTC_MAX_INSTANCE_LEVEL_COUNT 1 diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp new file mode 100644 index 0000000000..6e9a5a538e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp @@ -0,0 +1,917 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_hybrid.h" +#include "bvh_traverser1.h" +#include "node_intersector1.h" +#include "node_intersector_packet.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/curveNv_intersector.h" +#include "../geometry/curveNi_intersector.h" +#include "../geometry/curveNi_mb_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" +#include "../geometry/subgrid_intersector.h" +#include "../geometry/subgrid_mb_intersector.h" +#include "../geometry/curve_intersector_virtual.h" + +#define SWITCH_DURING_DOWN_TRAVERSAL 1 +#define FORCE_SINGLE_MODE 0 + +#define ENABLE_FAST_COHERENT_CODEPATHS 1 + +namespace embree +{ + namespace isa + { + template + void BVHNIntersectorKHybrid::intersect1(Accel::Intersectors* This, + const BVH* bvh, + NodeRef root, + size_t k, + Precalculations& pre, + RayHitK& ray, + const TravRayK& tray, + IntersectContext* context) + { + /* stack state */ + StackItemT stack[stackSizeSingle]; // stack of nodes + StackItemT* stackPtr = stack + 1; // current stack pointer + StackItemT* stackEnd = stack + stackSizeSingle; + stack[0].ptr = root; + stack[0].dist = neg_inf; + + /* load the ray into SIMD registers */ + TravRay tray1; + tray1.template init(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > ray.tfar[k])) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat tNear; + STAT3(normal.trav_nodes, 1, 1, 1); + bool nodeIntersected = BVHNNodeIntersector1::intersect(cur, tray1, ray.time()[k], tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + BVHNNodeTraverser1Hit::traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(This, pre, ray, k, context, prim, num, tray1, lazy_node); + + tray1.tfar = ray.tfar[k]; + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + } + + template + void BVHNIntersectorKHybrid::intersect(vint* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayHitK& __restrict__ ray, + IntersectContext* __restrict__ context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + +#if ENABLE_FAST_COHERENT_CODEPATHS == 1 + assert(context); + if (unlikely(types == BVH_AN1 && context->user && context->isCoherent())) + { + intersectCoherent(valid_i, This, ray, context); + return; + } +#endif + + /* filter out invalid rays */ + vbool valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(movemask(valid))], 1, 1, 1); +#endif + + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK tray(ray.org, ray.dir, single ? N : 0); + const vfloat org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat org_ray_tfar = max(ray.tfar , 0.0f); + + if (single) + { + tray.tnear = select(valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(valid, org_ray_tfar , vfloat(neg_inf)); + + for (; valid_bits!=0; ) { + const size_t i = bscf(valid_bits); + intersect1(This, bvh, bvh->root, i, pre, ray, tray, context); + } + return; + } + + /* determine switch threshold based on flags */ + const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent; + + vint octant = ray.octant(); + octant = select(valid, octant, vint(0xffffffff)); + + /* test whether we have ray with opposing direction signs in the packet */ + bool split = false; + { + size_t bits = valid_bits; + vbool vsplit( false ); + do + { + const size_t valid_index = bsf(bits); + vbool octant_valid = octant[valid_index] == octant; + bits &= ~(size_t)movemask(octant_valid); + vsplit |= vint(octant[valid_index]) == (octant^vint(0x7)); + } while (bits); + if (any(vsplit)) split = true; + } + + do + { + const size_t valid_index = bsf(valid_bits); + const vint diff_octant = vint(octant[valid_index])^octant; + const vint count_diff_octant = \ + ((diff_octant >> 2) & 1) + + ((diff_octant >> 1) & 1) + + ((diff_octant >> 0) & 1); + + vbool octant_valid = (count_diff_octant <= 1) & (octant != vint(0xffffffff)); + if (!single || !split) octant_valid = valid; // deactivate octant sorting in pure chunk mode, otherwise instance traversal performance goes down + + + octant = select(octant_valid,vint(0xffffffff),octant); + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar , vfloat(neg_inf)); + + /* allocate stack and push root node */ + vfloat stack_near[stackSizeChunk]; + NodeRef stack_node[stackSizeChunk]; + stack_node[0] = BVH::invalidNode; + stack_near[0] = inf; + stack_node[1] = bvh->root; + stack_near[1] = tray.tnear; + NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk; + NodeRef* __restrict__ sptr_node = stack_node + 2; + vfloat* __restrict__ sptr_near = stack_near + 2; + + while (1) pop: + { + /* pop next node from stack */ + assert(sptr_node > stack_node); + sptr_node--; + sptr_near--; + NodeRef cur = *sptr_node; + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* cull node if behind closest hit point */ + vfloat curDist = *sptr_near; + const vbool active = curDist < tray.tfar; + if (unlikely(none(active))) + continue; + + /* switch to single ray traversal */ +#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if FORCE_SINGLE_MODE == 0 + if (single) +#endif + { + size_t bits = movemask(active); +#if FORCE_SINGLE_MODE == 0 + if (unlikely(popcnt(bits) <= switchThreshold)) +#endif + { + for (; bits!=0; ) { + const size_t i = bscf(bits); + intersect1(This, bvh, cur, i, pre, ray, tray, context); + } + tray.tfar = min(tray.tfar, ray.tfar); + continue; + } + } +#endif + while (likely(!cur.isLeaf())) + { + /* process nodes */ + const vbool valid_node = tray.tfar > curDist; + STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const BaseNode* __restrict__ const node = nodeRef.baseNode(); + + /* set cur to invalid */ + cur = BVH::emptyNode; + curDist = pos_inf; + + size_t num_child_hits = 0; + + for (unsigned i = 0; i < N; i++) + { + const NodeRef child = node->children[i]; + if (unlikely(child == BVH::emptyNode)) break; + vfloat lnearP; + vbool lhit = valid_node; + BVHNNodeIntersectorK::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + /* if we hit the child we choose to continue with that child if it + is closer than the current next child, or we push it onto the stack */ + if (likely(any(lhit))) + { + assert(sptr_node < stackEnd); + assert(child != BVH::emptyNode); + const vfloat childDist = select(lhit, lnearP, inf); + /* push cur node onto stack and continue with hit child */ + if (any(childDist < curDist)) + { + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + *sptr_node = cur; sptr_node++; + *sptr_near = curDist; sptr_near++; + } + curDist = childDist; + cur = child; + } + + /* push hit child onto stack */ + else { + num_child_hits++; + *sptr_node = child; sptr_node++; + *sptr_near = childDist; sptr_near++; + } + } + } + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[num_child_hits], 1, 1, 1); +#endif + + if (unlikely(cur == BVH::emptyNode)) + goto pop; + + /* improved distance sorting for 3 or more hits */ + if (unlikely(num_child_hits >= 2)) + { + if (any(sptr_near[-2] < sptr_near[-1])) + { + std::swap(sptr_near[-2],sptr_near[-1]); + std::swap(sptr_node[-2],sptr_node[-1]); + } + if (unlikely(num_child_hits >= 3)) + { + if (any(sptr_near[-3] < sptr_near[-1])) + { + std::swap(sptr_near[-3],sptr_near[-1]); + std::swap(sptr_node[-3],sptr_node[-1]); + } + if (any(sptr_near[-3] < sptr_near[-2])) + { + std::swap(sptr_near[-3],sptr_near[-2]); + std::swap(sptr_node[-3],sptr_node[-2]); + } + } + } + +#if SWITCH_DURING_DOWN_TRAVERSAL == 1 + if (single) + { + // seems to be the best place for testing utilization + if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold)) + { + *sptr_node++ = cur; + *sptr_near++ = curDist; + goto pop; + } + } +#endif + } + + /* return if stack is empty */ + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* intersect leaf */ + assert(cur != BVH::emptyNode); + const vbool valid_leaf = tray.tfar > curDist; + STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node); + tray.tfar = select(valid_leaf, ray.tfar, tray.tfar); + + if (unlikely(lazy_node)) { + *sptr_node = lazy_node; sptr_node++; + *sptr_near = neg_inf; sptr_near++; + } + } + } while(valid_bits); + } + + + template + void BVHNIntersectorKHybrid::intersectCoherent(vint* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayHitK& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* filter out invalid rays */ + vbool valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK tray(ray.org, ray.dir, single ? N : 0); + const vfloat org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat org_ray_tfar = max(ray.tfar , 0.0f); + + vint octant = ray.octant(); + octant = select(valid, octant, vint(0xffffffff)); + + do + { + const size_t valid_index = bsf(valid_bits); + const vbool octant_valid = octant[valid_index] == octant; + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar , vfloat(neg_inf)); + + Frustum frustum; + frustum.template init(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N); + + StackItemT stack[stackSizeSingle]; // stack of nodes + StackItemT* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + while (1) pop: + { + /* pop next node from stack */ + if (unlikely(stackPtr == stack)) break; + + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* cull node if behind closest hit point */ + vfloat curDist = *(float*)&stackPtr->dist; + const vbool active = curDist < tray.tfar; + if (unlikely(none(active))) continue; + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const AABBNode* __restrict__ const node = nodeRef.getAABBNode(); + + vfloat fmin; + size_t m_frustum_node = intersectNodeFrustum(node, frustum, fmin); + + if (unlikely(!m_frustum_node)) goto pop; + cur = BVH::emptyNode; + curDist = pos_inf; + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1); +#endif + size_t num_child_hits = 0; + do { + const size_t i = bscf(m_frustum_node); + vfloat lnearP; + vbool lhit = false; // motion blur is not supported, so the initial value will be ignored + STAT3(normal.trav_nodes, 1, 1, 1); + BVHNNodeIntersectorK::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + if (likely(any(lhit))) + { + const vfloat childDist = fmin[i]; + const NodeRef child = node->child(i); + BVHN::prefetch(child); + if (any(childDist < curDist)) + { + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + stackPtr->ptr = cur; + *(float*)&stackPtr->dist = toScalar(curDist); + stackPtr++; + } + curDist = childDist; + cur = child; + } + /* push hit child onto stack */ + else { + num_child_hits++; + stackPtr->ptr = child; + *(float*)&stackPtr->dist = toScalar(childDist); + stackPtr++; + } + } + } while(m_frustum_node); + + if (unlikely(cur == BVH::emptyNode)) goto pop; + + /* improved distance sorting for 3 or more hits */ + if (unlikely(num_child_hits >= 2)) + { + if (stackPtr[-2].dist < stackPtr[-1].dist) + std::swap(stackPtr[-2],stackPtr[-1]); + if (unlikely(num_child_hits >= 3)) + { + if (stackPtr[-3].dist < stackPtr[-1].dist) + std::swap(stackPtr[-3],stackPtr[-1]); + if (stackPtr[-3].dist < stackPtr[-2].dist) + std::swap(stackPtr[-3],stackPtr[-2]); + } + } + } + + /* intersect leaf */ + assert(cur != BVH::invalidNode); + assert(cur != BVH::emptyNode); + const vbool valid_leaf = tray.tfar > curDist; + STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node); + + /* reduce max distance interval on successful intersection */ + if (likely(any((ray.tfar < tray.tfar) & valid_leaf))) + { + tray.tfar = select(valid_leaf, ray.tfar, tray.tfar); + frustum.template updateMaxDist(tray.tfar); + } + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + + } while(valid_bits); + } + + // =================================================================================================================================================================== + // =================================================================================================================================================================== + // =================================================================================================================================================================== + + template + bool BVHNIntersectorKHybrid::occluded1(Accel::Intersectors* This, + const BVH* bvh, + NodeRef root, + size_t k, + Precalculations& pre, + RayK& ray, + const TravRayK& tray, + IntersectContext* context) + { + /* stack state */ + NodeRef stack[stackSizeSingle]; // stack of nodes that still need to get traversed + NodeRef* stackPtr = stack+1; // current stack pointer + NodeRef* stackEnd = stack+stackSizeSingle; + stack[0] = root; + + /* load the ray into SIMD registers */ + TravRay tray1; + tray1.template init(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = (NodeRef)*stackPtr; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat tNear; + STAT3(shadow.trav_nodes, 1, 1, 1); + bool nodeIntersected = BVHNNodeIntersector1::intersect(cur, tray1, ray.time()[k], tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + BVHNNodeTraverser1Hit::traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves, 1, 1, 1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + + size_t lazy_node = 0; + if (PrimitiveIntersectorK::occluded(This, pre, ray, k, context, prim, num, tray1, lazy_node)) { + ray.tfar[k] = neg_inf; + return true; + } + + if (unlikely(lazy_node)) { + *stackPtr = lazy_node; + stackPtr++; + } + } + return false; + } + + template + void BVHNIntersectorKHybrid::occluded(vint* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayK& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + +#if ENABLE_FAST_COHERENT_CODEPATHS == 1 + assert(context); + if (unlikely(types == BVH_AN1 && context->user && context->isCoherent())) + { + occludedCoherent(valid_i, This, ray, context); + return; + } +#endif + + /* filter out already occluded and invalid rays */ + vbool valid = (*valid_i == -1) & (ray.tfar >= 0.0f); +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + const size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid, ray); + + /* load ray */ + TravRayK tray(ray.org, ray.dir, single ? N : 0); + const vfloat org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat org_ray_tfar = max(ray.tfar , 0.0f); + + tray.tnear = select(valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(valid, org_ray_tfar , vfloat(neg_inf)); + + vbool terminated = !valid; + const vfloat inf = vfloat(pos_inf); + + /* determine switch threshold based on flags */ + const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent; + + /* allocate stack and push root node */ + vfloat stack_near[stackSizeChunk]; + NodeRef stack_node[stackSizeChunk]; + stack_node[0] = BVH::invalidNode; + stack_near[0] = inf; + stack_node[1] = bvh->root; + stack_near[1] = tray.tnear; + NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk; + NodeRef* __restrict__ sptr_node = stack_node + 2; + vfloat* __restrict__ sptr_near = stack_near + 2; + + while (1) pop: + { + /* pop next node from stack */ + assert(sptr_node > stack_node); + sptr_node--; + sptr_near--; + NodeRef cur = *sptr_node; + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + /* cull node if behind closest hit point */ + vfloat curDist = *sptr_near; + const vbool active = curDist < tray.tfar; + if (unlikely(none(active))) + continue; + + /* switch to single ray traversal */ +#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if FORCE_SINGLE_MODE == 0 + if (single) +#endif + { + size_t bits = movemask(active); +#if FORCE_SINGLE_MODE == 0 + if (unlikely(popcnt(bits) <= switchThreshold)) +#endif + { + for (; bits!=0; ) { + const size_t i = bscf(bits); + if (occluded1(This, bvh, cur, i, pre, ray, tray, context)) + set(terminated, i); + } + if (all(terminated)) break; + tray.tfar = select(terminated, vfloat(neg_inf), tray.tfar); + continue; + } + } +#endif + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + const vbool valid_node = tray.tfar > curDist; + STAT3(shadow.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const BaseNode* __restrict__ const node = nodeRef.baseNode(); + + /* set cur to invalid */ + cur = BVH::emptyNode; + curDist = pos_inf; + + for (unsigned i = 0; i < N; i++) + { + const NodeRef child = node->children[i]; + if (unlikely(child == BVH::emptyNode)) break; + vfloat lnearP; + vbool lhit = valid_node; + BVHNNodeIntersectorK::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + /* if we hit the child we push the previously hit node onto the stack, and continue with the currently hit child */ + if (likely(any(lhit))) + { + assert(sptr_node < stackEnd); + assert(child != BVH::emptyNode); + const vfloat childDist = select(lhit, lnearP, inf); + + /* push 'cur' node onto stack and continue with hit child */ + if (likely(cur != BVH::emptyNode)) { + *sptr_node = cur; sptr_node++; + *sptr_near = curDist; sptr_near++; + } + curDist = childDist; + cur = child; + } + } + if (unlikely(cur == BVH::emptyNode)) + goto pop; + +#if SWITCH_DURING_DOWN_TRAVERSAL == 1 + if (single) + { + // seems to be the best place for testing utilization + if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold)) + { + *sptr_node++ = cur; + *sptr_near++ = curDist; + goto pop; + } + } +#endif + } + + /* return if stack is empty */ + if (unlikely(cur == BVH::invalidNode)) { + assert(sptr_node == stack_node); + break; + } + + + /* intersect leaf */ + assert(cur != BVH::emptyNode); + const vbool valid_leaf = tray.tfar > curDist; + STAT3(shadow.trav_leaves, 1, popcnt(valid_leaf), K); + if (unlikely(none(valid_leaf))) continue; + size_t items; const Primitive* prim = (Primitive*) cur.leaf(items); + + size_t lazy_node = 0; + terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node); + if (all(terminated)) break; + tray.tfar = select(terminated, vfloat(neg_inf), tray.tfar); // ignore node intersections for terminated rays + + if (unlikely(lazy_node)) { + *sptr_node = lazy_node; sptr_node++; + *sptr_near = neg_inf; sptr_near++; + } + } + + vfloat::store(valid & terminated, &ray.tfar, neg_inf); + } + + + template + void BVHNIntersectorKHybrid::occludedCoherent(vint* __restrict__ valid_i, + Accel::Intersectors* __restrict__ This, + RayK& __restrict__ ray, + IntersectContext* context) + { + BVH* __restrict__ bvh = (BVH*)This->ptr; + + /* filter out invalid rays */ + vbool valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) + valid &= ray.valid(); +#endif + + /* return if there are no valid rays */ + size_t valid_bits = movemask(valid); + if (unlikely(valid_bits == 0)) return; + + /* verify correct input */ + assert(all(valid, ray.valid())); + assert(all(valid, ray.tnear() >= 0.0f)); + assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); + Precalculations pre(valid,ray); + + /* load ray */ + TravRayK tray(ray.org, ray.dir, single ? N : 0); + const vfloat org_ray_tnear = max(ray.tnear(), 0.0f); + const vfloat org_ray_tfar = max(ray.tfar , 0.0f); + + vbool terminated = !valid; + + vint octant = ray.octant(); + octant = select(valid, octant, vint(0xffffffff)); + + do + { + const size_t valid_index = bsf(valid_bits); + vbool octant_valid = octant[valid_index] == octant; + valid_bits &= ~(size_t)movemask(octant_valid); + + tray.tnear = select(octant_valid, org_ray_tnear, vfloat(pos_inf)); + tray.tfar = select(octant_valid, org_ray_tfar, vfloat(neg_inf)); + + Frustum frustum; + frustum.template init(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N); + + StackItemMaskT stack[stackSizeSingle]; // stack of nodes + StackItemMaskT* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].mask = movemask(octant_valid); + + while (1) pop: + { + /* pop next node from stack */ + if (unlikely(stackPtr == stack)) break; + + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* cull node of active rays have already been terminated */ + size_t m_active = (size_t)stackPtr->mask & (~(size_t)movemask(terminated)); + + if (unlikely(m_active == 0)) continue; + + while (likely(!cur.isLeaf())) + { + /* process nodes */ + //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); + const NodeRef nodeRef = cur; + const AABBNode* __restrict__ const node = nodeRef.getAABBNode(); + + vfloat fmin; + size_t m_frustum_node = intersectNodeFrustum(node, frustum, fmin); + + if (unlikely(!m_frustum_node)) goto pop; + cur = BVH::emptyNode; + m_active = 0; + +#if defined(__AVX__) + //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1); +#endif + size_t num_child_hits = 0; + do { + const size_t i = bscf(m_frustum_node); + vfloat lnearP; + vbool lhit = false; // motion blur is not supported, so the initial value will be ignored + STAT3(normal.trav_nodes, 1, 1, 1); + BVHNNodeIntersectorK::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + + if (likely(any(lhit))) + { + const NodeRef child = node->child(i); + assert(child != BVH::emptyNode); + BVHN::prefetch(child); + if (likely(cur != BVH::emptyNode)) { + num_child_hits++; + stackPtr->ptr = cur; + stackPtr->mask = m_active; + stackPtr++; + } + cur = child; + m_active = movemask(lhit); + } + } while(m_frustum_node); + + if (unlikely(cur == BVH::emptyNode)) goto pop; + } + + /* intersect leaf */ + assert(cur != BVH::invalidNode); + assert(cur != BVH::emptyNode); +#if defined(__AVX__) + STAT3(normal.trav_leaves, 1, popcnt(m_active), K); +#endif + if (unlikely(!m_active)) continue; + size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + + size_t lazy_node = 0; + terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node); + octant_valid &= !terminated; + if (unlikely(none(octant_valid))) break; + tray.tfar = select(terminated, vfloat(neg_inf), tray.tfar); // ignore node intersections for terminated rays + + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->mask = movemask(octant_valid); + stackPtr++; + } + } + } while(valid_bits); + + vfloat::store(valid & terminated, &ray.tfar, neg_inf); + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp new file mode 100644 index 0000000000..2137da6a25 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp @@ -0,0 +1,59 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_hybrid.cpp" + +namespace embree +{ + namespace isa + { + //////////////////////////////////////////////////////////////////////////////// + /// BVH4Intersector4 Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoellerNoFilter, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller <4 COMMA 4 COMMA false> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMvIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoellerNoFilter,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA false> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridPluecker,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersectorK<4> >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersectorK<4> >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersectorK<4> >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersectorK<4> >)); + + //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>)); + //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>)); + + IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4> >)); + IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4MB> >)); + + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorK<4>> >)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorKMB<4>> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >)); + //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersectorKPluecker <4 COMMA 4 COMMA true> >)); + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersectorKPluecker <4 COMMA 4 COMMA true> >)); + + } +} + diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp new file mode 100644 index 0000000000..4a74d8468d --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp @@ -0,0 +1,528 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" + +#include "../common/scene.h" +#include + +namespace embree +{ + namespace isa + { + __aligned(64) static const int shiftTable[32] = { + (int)1 << 0, (int)1 << 1, (int)1 << 2, (int)1 << 3, (int)1 << 4, (int)1 << 5, (int)1 << 6, (int)1 << 7, + (int)1 << 8, (int)1 << 9, (int)1 << 10, (int)1 << 11, (int)1 << 12, (int)1 << 13, (int)1 << 14, (int)1 << 15, + (int)1 << 16, (int)1 << 17, (int)1 << 18, (int)1 << 19, (int)1 << 20, (int)1 << 21, (int)1 << 22, (int)1 << 23, + (int)1 << 24, (int)1 << 25, (int)1 << 26, (int)1 << 27, (int)1 << 28, (int)1 << 29, (int)1 << 30, (int)1 << 31 + }; + + template + __forceinline void BVHNIntersectorStream::intersect(Accel::Intersectors* __restrict__ This, + RayHitN** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + /* we may traverse an empty BVH in case all geometry was invalid */ + BVH* __restrict__ bvh = (BVH*) This->ptr; + if (bvh->root == BVH::emptyNode) + return; + + // Only the coherent code path is implemented + assert(context->isCoherent()); + intersectCoherent(This, (RayHitK**)inputPackets, numOctantRays, context); + } + + template + template + __forceinline void BVHNIntersectorStream::intersectCoherent(Accel::Intersectors* __restrict__ This, + RayHitK** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(context->isCoherent()); + + BVH* __restrict__ bvh = (BVH*) This->ptr; + __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; // stack of nodes + assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE); + + __aligned(64) TravRayKStream packets[MAX_INTERNAL_STREAM_SIZE/K]; + __aligned(64) Frustum frustum; + + bool commonOctant = true; + const size_t m_active = initPacketsAndFrustum((RayK**)inputPackets, numOctantRays, packets, frustum, commonOctant); + if (unlikely(m_active == 0)) return; + + /* case of non-common origin */ + if (unlikely(!commonOctant)) + { + const size_t numPackets = (numOctantRays+K-1)/K; + for (size_t i = 0; i < numPackets; i++) + This->intersect(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context); + return; + } + + stack[0].mask = m_active; + stack[0].parent = 0; + stack[0].child = bvh->root; + + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + + StackItemMaskCoherent* stackPtr = stack + 1; + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + + STAT3(normal.trav_stack_pop,1,1,1); + stackPtr--; + /*! pop next node */ + NodeRef cur = NodeRef(stackPtr->child); + size_t m_trav_active = stackPtr->mask; + assert(m_trav_active); + NodeRef parent = stackPtr->parent; + + while (1) + { + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + parent = cur; + + __aligned(64) size_t maskK[N]; + for (size_t i = 0; i < N; i++) + maskK[i] = m_trav_active; + vfloat dist; + const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist); + if (unlikely(m_node_hit == 0)) goto pop; + + BVHNNodeTraverserStreamHitCoherent::traverseClosestHit(cur, m_trav_active, vbool((int)m_node_hit), dist, (size_t*)maskK, stackPtr); + assert(m_trav_active); + } + + /* non-root and leaf => full culling test for all rays */ + if (unlikely(parent != 0 && cur.isLeaf())) + { + const AABBNode* __restrict__ const node = parent.getAABBNode(); + size_t boxID = 0xff; + for (size_t i = 0; i < N; i++) + if (node->child(i) == cur) { boxID = i; break; } + assert(boxID < N); + assert(cur == node->child(boxID)); + m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf); + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; PrimitiveK* prim = (PrimitiveK*)cur.leaf(num); + + size_t bits = m_trav_active; + + /*! intersect stream of rays with all primitives */ + size_t lazy_node = 0; +#if defined(__SSE4_2__) + STAT_USER(1,(popcnt(bits)+K-1)/K*4); +#endif + while(bits) + { + size_t i = bsf(bits) / K; + const size_t m_isec = ((((size_t)1 << K)-1) << (i*K)); + assert(m_isec & bits); + bits &= ~m_isec; + + TravRayKStream& p = packets[i]; + vbool m_valid = p.tnear <= p.tfar; + PrimitiveIntersectorK::intersectK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node); + p.tfar = min(p.tfar, inputPackets[i]->tfar); + }; + + } // traversal + intersection + } + + template + __forceinline void BVHNIntersectorStream::occluded(Accel::Intersectors* __restrict__ This, + RayN** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + /* we may traverse an empty BVH in case all geometry was invalid */ + BVH* __restrict__ bvh = (BVH*) This->ptr; + if (bvh->root == BVH::emptyNode) + return; + + if (unlikely(context->isCoherent())) + occludedCoherent(This, (RayK**)inputPackets, numOctantRays, context); + else + occludedIncoherent(This, (RayK**)inputPackets, numOctantRays, context); + } + + template + template + __noinline void BVHNIntersectorStream::occludedCoherent(Accel::Intersectors* __restrict__ This, + RayK** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(context->isCoherent()); + + BVH* __restrict__ bvh = (BVH*)This->ptr; + __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; // stack of nodes + assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE); + + /* inactive rays should have been filtered out before */ + __aligned(64) TravRayKStream packets[MAX_INTERNAL_STREAM_SIZE/K]; + __aligned(64) Frustum frustum; + + bool commonOctant = true; + size_t m_active = initPacketsAndFrustum(inputPackets, numOctantRays, packets, frustum, commonOctant); + + /* valid rays */ + if (unlikely(m_active == 0)) return; + + /* case of non-common origin */ + if (unlikely(!commonOctant)) + { + const size_t numPackets = (numOctantRays+K-1)/K; + for (size_t i = 0; i < numPackets; i++) + This->occluded(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context); + return; + } + + stack[0].mask = m_active; + stack[0].parent = 0; + stack[0].child = bvh->root; + + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// + + StackItemMaskCoherent* stackPtr = stack + 1; + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + + STAT3(normal.trav_stack_pop,1,1,1); + stackPtr--; + /*! pop next node */ + NodeRef cur = NodeRef(stackPtr->child); + size_t m_trav_active = stackPtr->mask & m_active; + if (unlikely(!m_trav_active)) continue; + assert(m_trav_active); + NodeRef parent = stackPtr->parent; + + while (1) + { + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + parent = cur; + + __aligned(64) size_t maskK[N]; + for (size_t i = 0; i < N; i++) + maskK[i] = m_trav_active; + + vfloat dist; + const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist); + if (unlikely(m_node_hit == 0)) goto pop; + + BVHNNodeTraverserStreamHitCoherent::traverseAnyHit(cur, m_trav_active, vbool((int)m_node_hit), (size_t*)maskK, stackPtr); + assert(m_trav_active); + } + + /* non-root and leaf => full culling test for all rays */ + if (unlikely(parent != 0 && cur.isLeaf())) + { + const AABBNode* __restrict__ const node = parent.getAABBNode(); + size_t boxID = 0xff; + for (size_t i = 0; i < N; i++) + if (node->child(i) == cur) { boxID = i; break; } + assert(boxID < N); + assert(cur == node->child(boxID)); + m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf); + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves, 1, 1, 1); + size_t num; PrimitiveK* prim = (PrimitiveK*)cur.leaf(num); + + size_t bits = m_trav_active & m_active; + /*! intersect stream of rays with all primitives */ + size_t lazy_node = 0; +#if defined(__SSE4_2__) + STAT_USER(1,(popcnt(bits)+K-1)/K*4); +#endif + while (bits) + { + size_t i = bsf(bits) / K; + const size_t m_isec = ((((size_t)1 << K)-1) << (i*K)); + assert(m_isec & bits); + bits &= ~m_isec; + TravRayKStream& p = packets[i]; + vbool m_valid = p.tnear <= p.tfar; + vbool m_hit = PrimitiveIntersectorK::occludedK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node); + inputPackets[i]->tfar = select(m_hit & m_valid, vfloat(neg_inf), inputPackets[i]->tfar); + m_active &= ~((size_t)movemask(m_hit) << (i*K)); + } + + } // traversal + intersection + } + + + template + template + __forceinline void BVHNIntersectorStream::occludedIncoherent(Accel::Intersectors* __restrict__ This, + RayK** inputPackets, + size_t numOctantRays, + IntersectContext* context) + { + assert(!context->isCoherent()); + assert(types & BVH_FLAG_ALIGNED_NODE); + + __aligned(64) TravRayKStream packet[MAX_INTERNAL_STREAM_SIZE/K]; + + assert(numOctantRays <= 32); + const size_t numPackets = (numOctantRays+K-1)/K; + size_t m_active = 0; + for (size_t i = 0; i < numPackets; i++) + { + const vfloat tnear = inputPackets[i]->tnear(); + const vfloat tfar = inputPackets[i]->tfar; + vbool m_valid = (tnear <= tfar) & (tnear >= 0.0f); + m_active |= (size_t)movemask(m_valid) << (K*i); + const Vec3vf& org = inputPackets[i]->org; + const Vec3vf& dir = inputPackets[i]->dir; + vfloat packet_min_dist = max(tnear, 0.0f); + vfloat packet_max_dist = select(m_valid, tfar, neg_inf); + new (&packet[i]) TravRayKStream(org, dir, packet_min_dist, packet_max_dist); + } + + BVH* __restrict__ bvh = (BVH*)This->ptr; + + StackItemMaskT stack[stackSizeSingle]; // stack of nodes + StackItemMaskT* stackPtr = stack + 1; // current stack pointer + stack[0].ptr = bvh->root; + stack[0].mask = m_active; + + size_t terminated = ~m_active; + + /* near/far offsets based on first ray */ + const NearFarPrecalculations nf(Vec3fa(packet[0].rdir.x[0], packet[0].rdir.y[0], packet[0].rdir.z[0]), N); + + while (1) pop: + { + if (unlikely(stackPtr == stack)) break; + STAT3(shadow.trav_stack_pop,1,1,1); + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + size_t cur_mask = stackPtr->mask & (~terminated); + if (unlikely(cur_mask == 0)) continue; + + while (true) + { + /*! stop if we found a leaf node */ + if (unlikely(cur.isLeaf())) break; + const AABBNode* __restrict__ const node = cur.getAABBNode(); + + const vint vmask = traverseIncoherentStream(cur_mask, packet, node, nf, shiftTable); + + size_t mask = movemask(vmask != vint(zero)); + if (unlikely(mask == 0)) goto pop; + + __aligned(64) unsigned int child_mask[N]; + vint::storeu(child_mask, vmask); // this explicit store here causes much better code generation + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + assert(r < N); + cur = node->child(r); + BVHN::prefetch(cur,types); + cur_mask = child_mask[r]; + + /* simple in order sequence */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) continue; + stackPtr->ptr = cur; + stackPtr->mask = cur_mask; + stackPtr++; + + for (; ;) + { + r = bscf(mask); + assert(r < N); + + cur = node->child(r); + BVHN::prefetch(cur,types); + cur_mask = child_mask[r]; + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) break; + stackPtr->ptr = cur; + stackPtr->mask = cur_mask; + stackPtr++; + } + } + + /*! this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves,1,1,1); + size_t num; PrimitiveK* prim = (PrimitiveK*)cur.leaf(num); + + size_t bits = cur_mask; + size_t lazy_node = 0; + + for (; bits != 0;) + { + const size_t rayID = bscf(bits); + + RayK &ray = *inputPackets[rayID / K]; + const size_t k = rayID % K; + if (PrimitiveIntersectorK::occluded(This, ray, k, context, prim, num, lazy_node)) + { + ray.tfar[k] = neg_inf; + terminated |= (size_t)1 << rayID; + } + + /* lazy node */ + if (unlikely(lazy_node)) + { + stackPtr->ptr = lazy_node; + stackPtr->mask = cur_mask; + stackPtr++; + } + } + + if (unlikely(terminated == (size_t)-1)) break; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// ArrayIntersectorKStream Definitions + //////////////////////////////////////////////////////////////////////////////// + + template + struct Triangle4IntersectorStreamMoeller { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Triangle4vIntersectorStreamPluecker { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Triangle4iIntersectorStreamMoeller { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Triangle4iIntersectorStreamPluecker { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Quad4vIntersectorStreamMoeller { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Quad4iIntersectorStreamMoeller { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Quad4vIntersectorStreamPluecker { + template using Type = ArrayIntersectorKStream>; + }; + + template + struct Quad4iIntersectorStreamPluecker { + template using Type = ArrayIntersectorKStream>; + }; + + struct ObjectIntersectorStream { + template using Type = ArrayIntersectorKStream>; + }; + + struct InstanceIntersectorStream { + template using Type = ArrayIntersectorKStream>; + }; + + // ===================================================================================================== + // ===================================================================================================== + // ===================================================================================================== + + template + void BVHNIntersectorStreamPacketFallback::intersect(Accel::Intersectors* __restrict__ This, + RayHitN** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + if (unlikely(context->isCoherent())) + intersectK(This, (RayHitK**)inputRays, numTotalRays, context); + else + intersectK(This, (RayHitK**)inputRays, numTotalRays, context); + } + + template + void BVHNIntersectorStreamPacketFallback::occluded(Accel::Intersectors* __restrict__ This, + RayN** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + if (unlikely(context->isCoherent())) + occludedK(This, (RayK**)inputRays, numTotalRays, context); + else + occludedK(This, (RayK**)inputRays, numTotalRays, context); + } + + template + template + __noinline void BVHNIntersectorStreamPacketFallback::intersectK(Accel::Intersectors* __restrict__ This, + RayHitK** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + /* fallback to packets */ + for (size_t i = 0; i < numTotalRays; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(numTotalRays)); + RayHitK& ray = *(inputRays[i / K]); + valid &= ray.tnear() <= ray.tfar; + This->intersect(valid, ray, context); + } + } + + template + template + __noinline void BVHNIntersectorStreamPacketFallback::occludedK(Accel::Intersectors* __restrict__ This, + RayK** inputRays, + size_t numTotalRays, + IntersectContext* context) + { + /* fallback to packets */ + for (size_t i = 0; i < numTotalRays; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(numTotalRays)); + RayK& ray = *(inputRays[i / K]); + valid &= ray.tnear() <= ray.tfar; + This->occluded(valid, ray, context); + } + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp new file mode 100644 index 0000000000..c3e5f137b8 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp @@ -0,0 +1,36 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream.cpp" + +namespace embree +{ + namespace isa + { + + //////////////////////////////////////////////////////////////////////////////// + /// General BVHIntersectorStreamPacketFallback Intersector + //////////////////////////////////////////////////////////////////////////////// + + DEFINE_INTERSECTORN(BVH4IntersectorStreamPacketFallback,BVHNIntersectorStreamPacketFallback<4>); + + //////////////////////////////////////////////////////////////////////////////// + /// BVH4IntersectorStream Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4iIntersectorStreamMoeller>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4vIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Triangle4vIntersectorStreamPluecker>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Triangle4iIntersectorStreamPluecker>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller>)); + IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoellerNoFilter, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller>)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoellerNoFilter,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4iIntersectorStreamMoeller>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Quad4vIntersectorStreamPluecker>)); + IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Quad4iIntersectorStreamPluecker>)); + + IF_ENABLED_USER(DEFINE_INTERSECTORN(BVH4VirtualIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA ObjectIntersectorStream>)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTORN(BVH4InstanceIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA InstanceIntersectorStream>)); + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp new file mode 100644 index 0000000000..b858eb163f --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp @@ -0,0 +1,657 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream_filters.h" +#include "bvh_intersector_stream.h" + +namespace embree +{ + namespace isa + { + template + __noinline void RayStreamFilter::filterAOS(Scene* scene, void* _rayN, size_t N, size_t stride, IntersectContext* context) + { + RayStreamAOS rayN(_rayN); + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from AOS to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const vint offset = vij * int(stride); + const size_t packetIndex = j / K; + + RayTypeK ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to AOS */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const vint offset = vij * int(stride); + const size_t packetIndex = j / K; + rayN.setHitByOffset(valid, offset, rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const Ray& ray = rayN.getRayByOffset(inputRayID * stride); + + /* skip invalid rays */ + if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = movemask(vfloat4(Vec3fa(ray.dir)) < 0.0f) & 0x7; + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayIDs = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayIDs[j] * int(stride); + RayK& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayIDs[j] * int(stride); + rayN.setHitByOffset(valid, offset, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(N)); + const vint offset = vi * int(stride); + + RayTypeK ray = rayN.getRayByOffset(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset(valid, offset, ray); + } + } + } + + template + __noinline void RayStreamFilter::filterAOP(Scene* scene, void** _rayN, size_t N, IntersectContext* context) + { + RayStreamAOP rayN(_rayN); + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from AOP to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const size_t packetIndex = j / K; + + RayTypeK ray = rayN.getRayByIndex(valid, vij); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to AOP */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const size_t packetIndex = j / K; + + rayN.setHitByIndex(valid, vij, rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const Ray& ray = rayN.getRayByIndex(inputRayID); + + /* skip invalid rays */ + if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = movemask(lt_mask(ray.dir,Vec3fa(0.0f))); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayIDs = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint index = *(vint*)&rayIDs[j]; + RayK& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByIndex(valid, index); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint index = *(vint*)&rayIDs[j]; + rayN.setHitByIndex(valid, index, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(N)); + + RayTypeK ray = rayN.getRayByIndex(valid, vi); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByIndex(valid, vi, ray); + } + } + } + + template + __noinline void RayStreamFilter::filterSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) + { + const size_t rayDataAlignment = (size_t)rayData % (K*sizeof(float)); + const size_t offsetAlignment = (size_t)stride % (K*sizeof(float)); + + /* fast path for packets with the correct width and data alignment */ + if (likely(N == K && + !rayDataAlignment && + !offsetAlignment)) + { + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + size_t packetIndex = 0; + for (size_t i = 0; i < numPackets; i++) + { + const size_t offset = i * stride; + RayTypeK& ray = *(RayTypeK*)(rayData + offset); + rayPtrs[packetIndex++] = &ray; + + /* trace as stream */ + if (unlikely(packetIndex == MAX_INTERNAL_STREAM_SIZE / K)) + { + const size_t size = packetIndex*K; + scene->intersectors.intersectN(rayPtrs, size, context); + packetIndex = 0; + } + } + + /* flush remaining packets */ + if (unlikely(packetIndex > 0)) + { + const size_t size = packetIndex*K; + scene->intersectors.intersectN(rayPtrs, size, context); + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + RayStreamSOA rayN(rayData, K); + + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N*numPackets;) + { + const size_t offset = (inputRayID / K) * stride + (inputRayID % K) * sizeof(float); + + /* skip invalid rays */ + if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays + #if defined(EMBREE_IGNORE_INVALID_RAYS) + __aligned(64) Ray ray = rayN.getRayByOffset(offset); + if (unlikely(!ray.valid())) { inputRayID++; continue; } + #endif + + const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayOffsets = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayOffsets[j]; + RayK& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayOffsets[j]; + rayN.setHitByOffset(valid, offset, rays[j/K]); + } + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < numPackets; i++) + { + const size_t offset = i * stride; + RayTypeK& ray = *(RayTypeK*)(rayData + offset); + const vbool valid = ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + } + } + } + else + { + /* fallback to packets for arbitrary packet size and alignment */ + for (size_t i = 0; i < numPackets; i++) + { + const size_t offsetN = i * stride; + RayStreamSOA rayN(rayData + offsetN, N); + + for (size_t j = 0; j < N; j += K) + { + const size_t offset = j * sizeof(float); + vbool valid = (vint(int(j)) + vint(step)) < vint(int(N)); + RayTypeK ray = rayN.getRayByOffset(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset(valid, offset, ray); + } + } + } + } + + template + __noinline void RayStreamFilter::filterSOP(Scene* scene, const void* _rayN, size_t N, IntersectContext* context) + { + RayStreamSOP& rayN = *(RayStreamSOP*)_rayN; + + /* use fast path for coherent ray mode */ + if (unlikely(context->isCoherent())) + { + __aligned(64) RayTypeK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayTypeK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) + { + const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + + /* convert from SOP to SOA */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const size_t offset = (i+j) * sizeof(float); + const size_t packetIndex = j / K; + + RayTypeK ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + + rays[packetIndex] = ray; + rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN + } + + /* trace stream */ + scene->intersectors.intersectN(rayPtrs, size, context); + + /* convert from SOA to SOP */ + for (size_t j = 0; j < size; j += K) + { + const vint vij = vint(int(i+j)) + vint(step); + const vbool valid = vij < vint(int(N)); + const size_t offset = (i+j) * sizeof(float); + const size_t packetIndex = j / K; + + rayN.setHitByOffset(valid, offset, rays[packetIndex]); + } + } + } + else if (unlikely(!intersect)) + { + /* octant sorting for occlusion rays */ + __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; + __aligned(64) RayK rays[MAX_INTERNAL_STREAM_SIZE / K]; + __aligned(64) RayK* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + + unsigned int raysInOctant[8]; + for (unsigned int i = 0; i < 8; i++) + raysInOctant[i] = 0; + size_t inputRayID = 0; + + for (;;) + { + int curOctant = -1; + + /* sort rays into octants */ + for (; inputRayID < N;) + { + const size_t offset = inputRayID * sizeof(float); + /* skip invalid rays */ + if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) + __aligned(64) Ray ray = rayN.getRayByOffset(offset); + if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + + const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset); + + assert(octantID < 8); + octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset; + inputRayID++; + if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) + { + curOctant = octantID; + break; + } + } + + /* need to flush rays in octant? */ + if (unlikely(curOctant == -1)) + { + for (unsigned int i = 0; i < 8; i++) + if (raysInOctant[i]) { curOctant = i; break; } + } + + /* all rays traced? */ + if (unlikely(curOctant == -1)) + break; + + unsigned int* const rayOffsets = &octants[curOctant][0]; + const unsigned int numOctantRays = raysInOctant[curOctant]; + assert(numOctantRays); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayOffsets[j]; + RayK& ray = rays[j/K]; + rayPtrs[j/K] = &ray; + ray = rayN.getRayByOffset(valid, offset); + ray.tnear() = select(valid, ray.tnear(), zero); + ray.tfar = select(valid, ray.tfar, neg_inf); + } + + scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + + for (unsigned int j = 0; j < numOctantRays; j += K) + { + const vint vi = vint(int(j)) + vint(step); + const vbool valid = vi < vint(int(numOctantRays)); + const vint offset = *(vint*)&rayOffsets[j]; + rayN.setHitByOffset(valid, offset, rays[j/K]); + } + + raysInOctant[curOctant] = 0; + } + } + else + { + /* fallback to packets */ + for (size_t i = 0; i < N; i += K) + { + const vint vi = vint(int(i)) + vint(step); + vbool valid = vi < vint(int(N)); + const size_t offset = i * sizeof(float); + + RayTypeK ray = rayN.getRayByOffset(valid, offset); + valid &= ray.tnear() <= ray.tfar; + + scene->intersectors.intersect(valid, ray, context); + + rayN.setHitByOffset(valid, offset, ray); + } + } + } + + + void RayStreamFilter::intersectAOS(Scene* scene, RTCRayHit* _rayN, size_t N, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOS(scene, _rayN, N, stride, context); + else + filterAOS(scene, _rayN, N, stride, context); + } + + void RayStreamFilter::occludedAOS(Scene* scene, RTCRay* _rayN, size_t N, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOS(scene, _rayN, N, stride, context); + else + filterAOS(scene, _rayN, N, stride, context); + } + + void RayStreamFilter::intersectAOP(Scene* scene, RTCRayHit** _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOP(scene, (void**)_rayN, N, context); + else + filterAOP(scene, (void**)_rayN, N, context); + } + + void RayStreamFilter::occludedAOP(Scene* scene, RTCRay** _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterAOP(scene, (void**)_rayN, N, context); + else + filterAOP(scene, (void**)_rayN, N, context); + } + + void RayStreamFilter::intersectSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOA(scene, rayData, N, numPackets, stride, context); + else + filterSOA(scene, rayData, N, numPackets, stride, context); + } + + void RayStreamFilter::occludedSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOA(scene, rayData, N, numPackets, stride, context); + else + filterSOA(scene, rayData, N, numPackets, stride, context); + } + + void RayStreamFilter::intersectSOP(Scene* scene, const RTCRayHitNp* _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOP(scene, _rayN, N, context); + else + filterSOP(scene, _rayN, N, context); + } + + void RayStreamFilter::occludedSOP(Scene* scene, const RTCRayNp* _rayN, size_t N, IntersectContext* context) { + if (unlikely(context->isCoherent())) + filterSOP(scene, _rayN, N, context); + else + filterSOP(scene, _rayN, N, context); + } + + + RayStreamFilterFuncs rayStreamFilterFuncs() { + return RayStreamFilterFuncs(RayStreamFilter::intersectAOS, RayStreamFilter::intersectAOP, RayStreamFilter::intersectSOA, RayStreamFilter::intersectSOP, + RayStreamFilter::occludedAOS, RayStreamFilter::occludedAOP, RayStreamFilter::occludedSOA, RayStreamFilter::occludedSOP); + } + }; +}; diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h index 80a8ab2a56..2bf7e93587 100644 --- a/thirdparty/embree/kernels/config.h +++ b/thirdparty/embree/kernels/config.h @@ -16,7 +16,7 @@ /* #undef EMBREE_GEOMETRY_INSTANCE */ /* #undef EMBREE_GEOMETRY_GRID */ /* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ +#define EMBREE_RAY_PACKETS /* #undef EMBREE_COMPACT_POLYS */ #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h index 10f315cee7..470e15f03e 100644 --- a/thirdparty/embree/kernels/hash.h +++ b/thirdparty/embree/kernels/hash.h @@ -2,4 +2,4 @@ // Copyright 2009-2020 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#define RTC_HASH "7c53133eb21424f7f0ae1e25bf357e358feaf6ab" +#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd"