// godot/thirdparty/embree/common/algorithms/parallel_filter.h

// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"

namespace embree
{
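  /* sequential in-place filter: moves all elements of [first,last) that satisfy
     the predicate to the front of the range, preserving their relative order,
     and returns the index one past the last element kept */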
  template<typename Ty, typename Index, typename Predicate>
    inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
  {
    Index j = first;
    for (Index i=first; i<last; i++)
      if (predicate(data[i]))
        data[j++] = data[i];

    return j;
  }
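
  /* parallel in-place filter: splits [begin,end) into one block per task, filters
     each block independently, then compacts the surviving elements into the
     contiguous prefix [begin, begin+sused) and returns the end of that prefix;
     small ranges fall back to sequential_filter */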
  template<typename Ty, typename Index, typename Predicate>
    inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
  {
    /* sequential fallback */
    if (end-begin <= minStepSize)
      return sequential_filter(data,begin,end,predicate);

    /* calculate number of tasks to use */
    enum { MAX_TASKS = 64 };
    const Index numThreads = TaskScheduler::threadCount();
    const Index numBlocks  = (end-begin+minStepSize-1)/minStepSize;
    const Index taskCount  = min(numThreads,numBlocks,(Index)MAX_TASKS);

    /* filter blocks */
    Index nused[MAX_TASKS];
    Index nfree[MAX_TASKS];
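    /* each task filters its own block in place; nused[i] counts the elements kept
       in block i, nfree[i] the slots freed behind them */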
    parallel_for(taskCount, [&](const Index taskIndex)
    {
      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
      const Index i2 = sequential_filter(data,i0,i1,predicate);
      nused[taskIndex] = i2-i0;
      nfree[taskIndex] = i1-i2;
    });

    /* calculate offsets */
    Index sused=0;
    Index sfree=0;
    Index pfree[MAX_TASKS];
    for (Index i=0; i<taskCount; i++)
    {
      sused+=nused[i];
      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
    }
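    /* after the scan: sused = total number of kept elements, sfree = total number
       of free slots, pfree[i] = number of free slots in blocks preceding block i */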

    /* return if we did not filter out any element */
    assert(sfree <= end-begin);
    assert(sused <= end-begin);
    if (sused == end-begin)
      return end;

    /* otherwise we have to copy misplaced elements around */
    parallel_for(taskCount, [&](const Index taskIndex)
    {
      /* destination to write elements to */
      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
      if (dst_end <= dst) return;

      /* range of misplaced elements to copy to destination */
      Index r0 = pfree[taskIndex];
      Index r1 = r0+dst_end-dst;
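
      /* r0..r1 index this task's holes within the global front-to-back enumeration
         of free slots; they are filled with kept elements taken from the tail blocks */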

      /* find range in misplaced elements in back to front order */
      Index k0=0;
      for (Index i=taskCount-1; i>0; i--)
      {
        if (k0 > r1) break;
        Index k1 = k0+nused[i];
        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
        for (Index i=max(r0,k0); i<min(r1,k1); i++) {
          Index isrc = src-i+k0-1;
          assert(dst >= begin && dst < end);
          assert(isrc >= begin && isrc < end);
          data[dst++] = data[isrc];
        }
        k0 = k1;
      }
    });
    return begin+sused;
  }
}
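
// Usage sketch (illustration only, not part of the upstream Embree sources; it
// assumes Embree's task scheduler has been initialized, and the names `values`
// and `keep` plus the minStepSize of 1024 are arbitrary choices): compact a
// buffer in place so that only the elements accepted by the predicate remain
// in the prefix, then shrink the buffer to that prefix.
//
//   std::vector<float> values = /* ... */;
//   const size_t keep = embree::parallel_filter(values.data(), size_t(0), values.size(),
//                                               size_t(1024), [](const float& v) { return v >= 0.0f; });
//   values.resize(keep);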