Occlusion culling fixes
Fixes some issues found by UBSAN and other misc things: * Fixed memory leak on exit. * Properly align ray packet buffer to 64 bytes. * Added some compiler flags from Embree's build system. * Fixed ray masks.
This commit is contained in:
parent
10801b90f9
commit
3f6ed10a5d
|
@ -86,6 +86,20 @@ if env["builtin_embree"]:
|
||||||
# Embree needs those, it will automatically use SSE2NEON in ARM
|
# Embree needs those, it will automatically use SSE2NEON in ARM
|
||||||
env_thirdparty.Append(CPPDEFINES=["__SSE2__", "__SSE__"])
|
env_thirdparty.Append(CPPDEFINES=["__SSE2__", "__SSE__"])
|
||||||
|
|
||||||
|
if not env.msvc:
|
||||||
|
env_thirdparty.Append(
|
||||||
|
CPPFLAGS=[
|
||||||
|
"-fno-strict-overflow",
|
||||||
|
"-fno-delete-null-pointer-checks",
|
||||||
|
"-fwrapv",
|
||||||
|
"-fsigned-char",
|
||||||
|
"-fno-strict-aliasing",
|
||||||
|
"-fno-tree-vectorize",
|
||||||
|
"-fvisibility=hidden",
|
||||||
|
"-fvisibility-inlines-hidden",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
env.modules_sources += thirdparty_obj
|
env.modules_sources += thirdparty_obj
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -41,9 +41,14 @@ RaycastOcclusionCull *RaycastOcclusionCull::raycast_singleton = nullptr;
|
||||||
void RaycastOcclusionCull::RaycastHZBuffer::clear() {
|
void RaycastOcclusionCull::RaycastHZBuffer::clear() {
|
||||||
HZBuffer::clear();
|
HZBuffer::clear();
|
||||||
|
|
||||||
camera_rays.clear();
|
if (camera_rays_unaligned_buffer) {
|
||||||
|
memfree(camera_rays_unaligned_buffer);
|
||||||
|
camera_rays_unaligned_buffer = nullptr;
|
||||||
|
camera_rays = nullptr;
|
||||||
|
}
|
||||||
camera_ray_masks.clear();
|
camera_ray_masks.clear();
|
||||||
packs_size = Size2i();
|
camera_rays_tile_count = 0;
|
||||||
|
tile_grid_size = Size2i();
|
||||||
}
|
}
|
||||||
|
|
||||||
void RaycastOcclusionCull::RaycastHZBuffer::resize(const Size2i &p_size) {
|
void RaycastOcclusionCull::RaycastHZBuffer::resize(const Size2i &p_size) {
|
||||||
|
@ -58,10 +63,19 @@ void RaycastOcclusionCull::RaycastHZBuffer::resize(const Size2i &p_size) {
|
||||||
|
|
||||||
HZBuffer::resize(p_size);
|
HZBuffer::resize(p_size);
|
||||||
|
|
||||||
packs_size = Size2i(Math::ceil(p_size.x / (float)TILE_SIZE), Math::ceil(p_size.y / (float)TILE_SIZE));
|
tile_grid_size = Size2i(Math::ceil(p_size.x / (float)TILE_SIZE), Math::ceil(p_size.y / (float)TILE_SIZE));
|
||||||
int ray_packets_count = packs_size.x * packs_size.y;
|
camera_rays_tile_count = tile_grid_size.x * tile_grid_size.y;
|
||||||
camera_rays.resize(ray_packets_count);
|
|
||||||
camera_ray_masks.resize(ray_packets_count * TILE_SIZE * TILE_SIZE);
|
if (camera_rays_unaligned_buffer) {
|
||||||
|
memfree(camera_rays_unaligned_buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
const int alignment = 64; // Embree requires ray packets to be 64-aligned
|
||||||
|
camera_rays_unaligned_buffer = (uint8_t *)memalloc(camera_rays_tile_count * sizeof(CameraRayTile) + alignment);
|
||||||
|
camera_rays = (CameraRayTile *)(camera_rays_unaligned_buffer + alignment - (((uint64_t)camera_rays_unaligned_buffer) % alignment));
|
||||||
|
|
||||||
|
camera_ray_masks.resize(camera_rays_tile_count * TILE_RAYS);
|
||||||
|
memset(camera_ray_masks.ptr(), ~0, camera_rays_tile_count * TILE_RAYS * sizeof(uint32_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform3D &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool) {
|
void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform3D &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool) {
|
||||||
|
@ -96,65 +110,55 @@ void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform3D
|
||||||
}
|
}
|
||||||
|
|
||||||
void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thread, const CameraRayThreadData *p_data) {
|
void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thread, const CameraRayThreadData *p_data) {
|
||||||
uint32_t packs_total = camera_rays.size();
|
uint32_t total_tiles = camera_rays_tile_count;
|
||||||
uint32_t total_threads = p_data->thread_count;
|
uint32_t total_threads = p_data->thread_count;
|
||||||
uint32_t from = p_thread * packs_total / total_threads;
|
uint32_t from = p_thread * total_tiles / total_threads;
|
||||||
uint32_t to = (p_thread + 1 == total_threads) ? packs_total : ((p_thread + 1) * packs_total / total_threads);
|
uint32_t to = (p_thread + 1 == total_threads) ? total_tiles : ((p_thread + 1) * total_tiles / total_threads);
|
||||||
_generate_camera_rays(p_data, from, to);
|
_generate_camera_rays(p_data, from, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) {
|
void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) {
|
||||||
const Size2i &buffer_size = sizes[0];
|
const Size2i &buffer_size = sizes[0];
|
||||||
|
|
||||||
RayPacket *ray_packets = camera_rays.ptr();
|
|
||||||
uint32_t *ray_masks = camera_ray_masks.ptr();
|
|
||||||
|
|
||||||
for (int i = p_from; i < p_to; i++) {
|
for (int i = p_from; i < p_to; i++) {
|
||||||
RayPacket &packet = ray_packets[i];
|
CameraRayTile &tile = camera_rays[i];
|
||||||
int tile_x = (i % packs_size.x) * TILE_SIZE;
|
int tile_x = (i % tile_grid_size.x) * TILE_SIZE;
|
||||||
int tile_y = (i / packs_size.x) * TILE_SIZE;
|
int tile_y = (i / tile_grid_size.x) * TILE_SIZE;
|
||||||
|
|
||||||
for (int j = 0; j < TILE_RAYS; j++) {
|
for (int j = 0; j < TILE_RAYS; j++) {
|
||||||
int x = tile_x + j % TILE_SIZE;
|
int x = tile_x + j % TILE_SIZE;
|
||||||
int y = tile_y + j / TILE_SIZE;
|
int y = tile_y + j / TILE_SIZE;
|
||||||
|
|
||||||
if (x >= buffer_size.x || y >= buffer_size.y) {
|
|
||||||
ray_masks[i * TILE_RAYS + j] = 0U;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
ray_masks[i * TILE_RAYS + j] = ~0U;
|
|
||||||
|
|
||||||
float u = (float(x) + 0.5f) / buffer_size.x;
|
float u = (float(x) + 0.5f) / buffer_size.x;
|
||||||
float v = (float(y) + 0.5f) / buffer_size.y;
|
float v = (float(y) + 0.5f) / buffer_size.y;
|
||||||
Vector3 pixel_pos = p_data->pixel_corner + u * p_data->pixel_u_interp + v * p_data->pixel_v_interp;
|
Vector3 pixel_pos = p_data->pixel_corner + u * p_data->pixel_u_interp + v * p_data->pixel_v_interp;
|
||||||
|
|
||||||
packet.ray.tnear[j] = p_data->z_near;
|
tile.ray.tnear[j] = p_data->z_near;
|
||||||
|
|
||||||
Vector3 dir;
|
Vector3 dir;
|
||||||
if (p_data->camera_orthogonal) {
|
if (p_data->camera_orthogonal) {
|
||||||
dir = -p_data->camera_dir;
|
dir = -p_data->camera_dir;
|
||||||
packet.ray.org_x[j] = pixel_pos.x - dir.x * p_data->z_near;
|
tile.ray.org_x[j] = pixel_pos.x - dir.x * p_data->z_near;
|
||||||
packet.ray.org_y[j] = pixel_pos.y - dir.y * p_data->z_near;
|
tile.ray.org_y[j] = pixel_pos.y - dir.y * p_data->z_near;
|
||||||
packet.ray.org_z[j] = pixel_pos.z - dir.z * p_data->z_near;
|
tile.ray.org_z[j] = pixel_pos.z - dir.z * p_data->z_near;
|
||||||
} else {
|
} else {
|
||||||
dir = (pixel_pos - p_data->camera_pos).normalized();
|
dir = (pixel_pos - p_data->camera_pos).normalized();
|
||||||
packet.ray.org_x[j] = p_data->camera_pos.x;
|
tile.ray.org_x[j] = p_data->camera_pos.x;
|
||||||
packet.ray.org_y[j] = p_data->camera_pos.y;
|
tile.ray.org_y[j] = p_data->camera_pos.y;
|
||||||
packet.ray.org_z[j] = p_data->camera_pos.z;
|
tile.ray.org_z[j] = p_data->camera_pos.z;
|
||||||
packet.ray.tnear[j] /= dir.dot(p_data->camera_dir);
|
tile.ray.tnear[j] /= dir.dot(p_data->camera_dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
packet.ray.dir_x[j] = dir.x;
|
tile.ray.dir_x[j] = dir.x;
|
||||||
packet.ray.dir_y[j] = dir.y;
|
tile.ray.dir_y[j] = dir.y;
|
||||||
packet.ray.dir_z[j] = dir.z;
|
tile.ray.dir_z[j] = dir.z;
|
||||||
|
|
||||||
packet.ray.tfar[j] = p_data->z_far;
|
tile.ray.tfar[j] = p_data->z_far;
|
||||||
packet.ray.time[j] = 0.0f;
|
tile.ray.time[j] = 0.0f;
|
||||||
|
|
||||||
packet.ray.flags[j] = 0;
|
tile.ray.flags[j] = 0;
|
||||||
packet.ray.mask[j] = -1;
|
tile.ray.mask[j] = ~0U;
|
||||||
packet.hit.geomID[j] = RTC_INVALID_GEOMETRY_ID;
|
tile.hit.geomID[j] = RTC_INVALID_GEOMETRY_ID;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -163,8 +167,8 @@ void RaycastOcclusionCull::RaycastHZBuffer::sort_rays(const Vector3 &p_camera_di
|
||||||
ERR_FAIL_COND(is_empty());
|
ERR_FAIL_COND(is_empty());
|
||||||
|
|
||||||
Size2i buffer_size = sizes[0];
|
Size2i buffer_size = sizes[0];
|
||||||
for (int i = 0; i < packs_size.y; i++) {
|
for (int i = 0; i < tile_grid_size.y; i++) {
|
||||||
for (int j = 0; j < packs_size.x; j++) {
|
for (int j = 0; j < tile_grid_size.x; j++) {
|
||||||
for (int tile_i = 0; tile_i < TILE_SIZE; tile_i++) {
|
for (int tile_i = 0; tile_i < TILE_SIZE; tile_i++) {
|
||||||
for (int tile_j = 0; tile_j < TILE_SIZE; tile_j++) {
|
for (int tile_j = 0; tile_j < TILE_SIZE; tile_j++) {
|
||||||
int x = j * TILE_SIZE + tile_j;
|
int x = j * TILE_SIZE + tile_j;
|
||||||
|
@ -173,13 +177,13 @@ void RaycastOcclusionCull::RaycastHZBuffer::sort_rays(const Vector3 &p_camera_di
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
int k = tile_i * TILE_SIZE + tile_j;
|
int k = tile_i * TILE_SIZE + tile_j;
|
||||||
int packet_index = i * packs_size.x + j;
|
int tile_index = i * tile_grid_size.x + j;
|
||||||
float d = camera_rays[packet_index].ray.tfar[k];
|
float d = camera_rays[tile_index].ray.tfar[k];
|
||||||
|
|
||||||
if (!p_orthogonal) {
|
if (!p_orthogonal) {
|
||||||
const float &dir_x = camera_rays[packet_index].ray.dir_x[k];
|
const float &dir_x = camera_rays[tile_index].ray.dir_x[k];
|
||||||
const float &dir_y = camera_rays[packet_index].ray.dir_y[k];
|
const float &dir_y = camera_rays[tile_index].ray.dir_y[k];
|
||||||
const float &dir_z = camera_rays[packet_index].ray.dir_z[k];
|
const float &dir_z = camera_rays[tile_index].ray.dir_z[k];
|
||||||
float cos_theta = p_camera_dir.x * dir_x + p_camera_dir.y * dir_y + p_camera_dir.z * dir_z;
|
float cos_theta = p_camera_dir.x * dir_x + p_camera_dir.y * dir_y + p_camera_dir.z * dir_z;
|
||||||
d *= cos_theta;
|
d *= cos_theta;
|
||||||
}
|
}
|
||||||
|
@ -191,6 +195,12 @@ void RaycastOcclusionCull::RaycastHZBuffer::sort_rays(const Vector3 &p_camera_di
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
RaycastOcclusionCull::RaycastHZBuffer::~RaycastHZBuffer() {
|
||||||
|
if (camera_rays_unaligned_buffer) {
|
||||||
|
memfree(camera_rays_unaligned_buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
|
|
||||||
bool RaycastOcclusionCull::is_occluder(RID p_rid) {
|
bool RaycastOcclusionCull::is_occluder(RID p_rid) {
|
||||||
|
@ -474,7 +484,7 @@ void RaycastOcclusionCull::Scenario::_raycast(uint32_t p_idx, const RaycastThrea
|
||||||
rtcIntersect16((const int *)&p_raycast_data->masks[p_idx * TILE_RAYS], ebr_scene[current_scene_idx], &ctx, &p_raycast_data->rays[p_idx]);
|
rtcIntersect16((const int *)&p_raycast_data->masks[p_idx * TILE_RAYS], ebr_scene[current_scene_idx], &ctx, &p_raycast_data->rays[p_idx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void RaycastOcclusionCull::Scenario::raycast(LocalVector<RayPacket> &r_rays, const LocalVector<uint32_t> p_valid_masks, ThreadWorkPool &p_thread_pool) const {
|
void RaycastOcclusionCull::Scenario::raycast(CameraRayTile *r_rays, const uint32_t *p_valid_masks, uint32_t p_tile_count, ThreadWorkPool &p_thread_pool) const {
|
||||||
ERR_FAIL_COND(singleton == nullptr);
|
ERR_FAIL_COND(singleton == nullptr);
|
||||||
if (raycast_singleton->ebr_device == nullptr) {
|
if (raycast_singleton->ebr_device == nullptr) {
|
||||||
return; // Embree is initialized on demand when there is some scenario with occluders in it.
|
return; // Embree is initialized on demand when there is some scenario with occluders in it.
|
||||||
|
@ -485,10 +495,10 @@ void RaycastOcclusionCull::Scenario::raycast(LocalVector<RayPacket> &r_rays, con
|
||||||
}
|
}
|
||||||
|
|
||||||
RaycastThreadData td;
|
RaycastThreadData td;
|
||||||
td.rays = r_rays.ptr();
|
td.rays = r_rays;
|
||||||
td.masks = p_valid_masks.ptr();
|
td.masks = p_valid_masks;
|
||||||
|
|
||||||
p_thread_pool.do_work(r_rays.size(), this, &Scenario::_raycast, &td);
|
p_thread_pool.do_work(p_tile_count, this, &Scenario::_raycast, &td);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
|
@ -536,7 +546,7 @@ void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform3D &p_cam_
|
||||||
|
|
||||||
buffer.update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal, p_thread_pool);
|
buffer.update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal, p_thread_pool);
|
||||||
|
|
||||||
scenario.raycast(buffer.camera_rays, buffer.camera_ray_masks, p_thread_pool);
|
scenario.raycast(buffer.camera_rays, buffer.camera_ray_masks.ptr(), buffer.camera_rays_tile_count, p_thread_pool);
|
||||||
buffer.sort_rays(-p_cam_transform.basis.get_axis(2), p_cam_orthogonal);
|
buffer.sort_rays(-p_cam_transform.basis.get_axis(2), p_cam_orthogonal);
|
||||||
buffer.update_mips();
|
buffer.update_mips();
|
||||||
}
|
}
|
||||||
|
@ -592,13 +602,15 @@ RaycastOcclusionCull::~RaycastOcclusionCull() {
|
||||||
scenario.commit_thread->wait_to_finish();
|
scenario.commit_thread->wait_to_finish();
|
||||||
memdelete(scenario.commit_thread);
|
memdelete(scenario.commit_thread);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
if (scenario.ebr_scene[i]) {
|
||||||
|
rtcReleaseScene(scenario.ebr_scene[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ebr_device != nullptr) {
|
if (ebr_device != nullptr) {
|
||||||
#ifdef __SSE2__
|
|
||||||
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
|
|
||||||
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
|
|
||||||
#endif
|
|
||||||
rtcReleaseDevice(ebr_device);
|
rtcReleaseDevice(ebr_device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -43,12 +43,12 @@
|
||||||
#include <embree3/rtcore.h>
|
#include <embree3/rtcore.h>
|
||||||
|
|
||||||
class RaycastOcclusionCull : public RendererSceneOcclusionCull {
|
class RaycastOcclusionCull : public RendererSceneOcclusionCull {
|
||||||
typedef RTCRayHit16 RayPacket;
|
typedef RTCRayHit16 CameraRayTile;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
class RaycastHZBuffer : public HZBuffer {
|
class RaycastHZBuffer : public HZBuffer {
|
||||||
private:
|
private:
|
||||||
Size2i packs_size;
|
Size2i tile_grid_size;
|
||||||
|
|
||||||
struct CameraRayThreadData {
|
struct CameraRayThreadData {
|
||||||
int thread_count;
|
int thread_count;
|
||||||
|
@ -67,7 +67,9 @@ public:
|
||||||
void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to);
|
void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
LocalVector<RayPacket> camera_rays;
|
unsigned int camera_rays_tile_count = 0;
|
||||||
|
uint8_t *camera_rays_unaligned_buffer = nullptr;
|
||||||
|
CameraRayTile *camera_rays = nullptr;
|
||||||
LocalVector<uint32_t> camera_ray_masks;
|
LocalVector<uint32_t> camera_ray_masks;
|
||||||
RID scenario_rid;
|
RID scenario_rid;
|
||||||
|
|
||||||
|
@ -75,6 +77,8 @@ public:
|
||||||
virtual void resize(const Size2i &p_size) override;
|
virtual void resize(const Size2i &p_size) override;
|
||||||
void sort_rays(const Vector3 &p_camera_dir, bool p_orthogonal);
|
void sort_rays(const Vector3 &p_camera_dir, bool p_orthogonal);
|
||||||
void update_camera_rays(const Transform3D &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool);
|
void update_camera_rays(const Transform3D &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool);
|
||||||
|
|
||||||
|
~RaycastHZBuffer();
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -111,7 +115,7 @@ private:
|
||||||
|
|
||||||
struct Scenario {
|
struct Scenario {
|
||||||
struct RaycastThreadData {
|
struct RaycastThreadData {
|
||||||
RayPacket *rays;
|
CameraRayTile *rays;
|
||||||
const uint32_t *masks;
|
const uint32_t *masks;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -144,7 +148,7 @@ private:
|
||||||
bool update(ThreadWorkPool &p_thread_pool);
|
bool update(ThreadWorkPool &p_thread_pool);
|
||||||
|
|
||||||
void _raycast(uint32_t p_thread, const RaycastThreadData *p_raycast_data) const;
|
void _raycast(uint32_t p_thread, const RaycastThreadData *p_raycast_data) const;
|
||||||
void raycast(LocalVector<RayPacket> &r_rays, const LocalVector<uint32_t> p_valid_masks, ThreadWorkPool &p_thread_pool) const;
|
void raycast(CameraRayTile *r_rays, const uint32_t *p_valid_masks, uint32_t p_tile_count, ThreadWorkPool &p_thread_pool) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
static RaycastOcclusionCull *raycast_singleton;
|
static RaycastOcclusionCull *raycast_singleton;
|
||||||
|
|
Loading…
Reference in a new issue