// Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "parallel_for.h" #include "../math/range.h" namespace embree { /* serial partitioning */ template __forceinline size_t serial_partitioning(T* array, const size_t begin, const size_t end, V& leftReduction, V& rightReduction, const IsLeft& is_left, const Reduction_T& reduction_t) { T* l = array + begin; T* r = array + end - 1; while(1) { /* *l < pivot */ while (likely(l <= r && is_left(*l) )) { //prefetchw(l+4); // FIXME: enable? reduction_t(leftReduction,*l); ++l; } /* *r >= pivot) */ while (likely(l <= r && !is_left(*r))) { //prefetchw(r-4); FIXME: enable? reduction_t(rightReduction,*r); --r; } if (r class __aligned(64) parallel_partition_task { ALIGNED_CLASS_(64); private: static const size_t MAX_TASKS = 64; T* array; size_t N; const IsLeft& is_left; const Reduction_T& reduction_t; const Reduction_V& reduction_v; const Vi& identity; size_t numTasks; __aligned(64) size_t counter_start[MAX_TASKS+1]; __aligned(64) size_t counter_left[MAX_TASKS+1]; __aligned(64) range leftMisplacedRanges[MAX_TASKS]; __aligned(64) range rightMisplacedRanges[MAX_TASKS]; __aligned(64) V leftReductions[MAX_TASKS]; __aligned(64) V rightReductions[MAX_TASKS]; public: __forceinline parallel_partition_task(T* array, const size_t N, const Vi& identity, const IsLeft& is_left, const Reduction_T& reduction_t, const Reduction_V& reduction_v, const size_t BLOCK_SIZE) : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} __forceinline const range* findStartRange(size_t& index, const range* const r, const size_t numRanges) { size_t i = 0; while(index >= (size_t)r[i].size()) { assert(i < numRanges); index -= (size_t)r[i].size(); i++; } return &r[i]; } __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, const size_t numRightMisplacedRanges, const size_t startID, const size_t endID) { size_t leftLocalIndex = startID; size_t rightLocalIndex = startID; const range* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); const range* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); size_t l_left = l_range->size() - leftLocalIndex; size_t r_left = r_range->size() - rightLocalIndex; T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; size_t size = endID - startID; size_t items = min(size,min(l_left,r_left)); while (size) { if (unlikely(l_left == 0)) { l_range++; l_left = l_range->size(); l = &array[l_range->begin()]; items = min(size,min(l_left,r_left)); } if (unlikely(r_left == 0)) { r_range++; r_left = r_range->size(); r = &array[r_range->begin()]; items = min(size,min(l_left,r_left)); } size -= items; l_left -= items; r_left -= items; while(items) { items--; xchg(*l++,*r++); } } } __forceinline size_t partition(V& leftReduction, V& rightReduction) { /* partition the individual ranges for each task */ parallel_for(numTasks,[&] (const size_t taskID) { const size_t startID = (taskID+0)*N/numTasks; const size_t endID = (taskID+1)*N/numTasks; V local_left(identity); V local_right(identity); const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); counter_start[taskID] = startID; counter_left [taskID] = mid-startID; leftReductions[taskID] = local_left; rightReductions[taskID] = local_right; }); counter_start[numTasks] = N; counter_left[numTasks] = 0; /* finalize the reductions */ for (size_t i=0; i globalLeft (0,mid); const range globalRight(mid,N); /* calculate all left and right ranges that are on the wrong global side */ size_t numMisplacedRangesLeft = 0; size_t numMisplacedRangesRight = 0; size_t numMisplacedItemsLeft = 0; size_t numMisplacedItemsRight = 0; for (size_t i=0; i left_range (counter_start[i], counter_start[i] + counter_left[i]); const range right_range(counter_start[i] + counter_left[i], counter_start[i+1]); const range left_misplaced = globalLeft. intersect(right_range); const range right_misplaced = globalRight.intersect(left_range); if (!left_misplaced.empty()) { numMisplacedItemsLeft += left_misplaced.size(); leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; } if (!right_misplaced.empty()) { numMisplacedItemsRight += right_misplaced.size(); rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; } } assert( numMisplacedItemsLeft == numMisplacedItemsRight ); /* if no items are misplaced we are done */ if (numMisplacedItemsLeft == 0) return mid; /* otherwise we copy the items to the right place in parallel */ parallel_for(numTasks,[&] (const size_t taskID) { const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); }); return mid; } }; template __noinline size_t parallel_partitioning(T* array, const size_t begin, const size_t end, const Vi &identity, V &leftReduction, V &rightReduction, const IsLeft& is_left, const Reduction_T& reduction_t, const Reduction_V& reduction_v, size_t BLOCK_SIZE = 128) { /* fall back to single threaded partitioning for small N */ if (unlikely(end-begin < BLOCK_SIZE)) return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); /* otherwise use parallel code */ else { typedef parallel_partition_task partition_task; std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); return begin+p->partition(leftReduction,rightReduction); } } template __noinline size_t parallel_partitioning(T* array, const size_t begin, const size_t end, const Vi &identity, V &leftReduction, V &rightReduction, const IsLeft& is_left, const Reduction_T& reduction_t, const Reduction_V& reduction_v, size_t BLOCK_SIZE, size_t PARALLEL_THRESHOLD) { /* fall back to single threaded partitioning for small N */ if (unlikely(end-begin < PARALLEL_THRESHOLD)) return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); /* otherwise use parallel code */ else { typedef parallel_partition_task partition_task; std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); return begin+p->partition(leftReduction,rightReduction); } } template inline size_t parallel_partitioning(T* array, const size_t begin, const size_t end, const IsLeft& is_left, size_t BLOCK_SIZE = 128) { size_t leftReduction = 0; size_t rightReduction = 0; return parallel_partitioning( array,begin,end,0,leftReduction,rightReduction,is_left, [] (size_t& t,const T& ref) { }, [] (size_t& t0,size_t& t1) { }, BLOCK_SIZE); } }