Merge pull request #48050 from JFonS/occlusion_culling

Authored by Rémi Verschelde on 2021-04-27 19:07:12 +02:00, committed via GitHub
commit 95cfce661b
385 changed files with 95994 additions and 47 deletions

View file

@ -103,6 +103,9 @@ public:
static _ALWAYS_INLINE_ double log(double p_x) { return ::log(p_x); }
static _ALWAYS_INLINE_ float log(float p_x) { return ::logf(p_x); }
static _ALWAYS_INLINE_ double log2(double p_x) { return ::log2(p_x); }
static _ALWAYS_INLINE_ float log2(float p_x) { return ::log2f(p_x); }
static _ALWAYS_INLINE_ double exp(double p_x) { return ::exp(p_x); }
static _ALWAYS_INLINE_ float exp(float p_x) { return ::expf(p_x); }

View file

@ -48,6 +48,8 @@
</member>
<member name="gi_mode" type="int" setter="set_gi_mode" getter="get_gi_mode" enum="GeometryInstance3D.GIMode" default="0">
</member>
<member name="ignore_occlusion_culling" type="bool" setter="set_ignore_occlusion_culling" getter="is_ignoring_occlusion_culling" default="false">
</member>
<member name="lod_bias" type="float" setter="set_lod_bias" getter="get_lod_bias" default="1.0">
</member>
<member name="lod_max_distance" type="float" setter="set_lod_max_distance" getter="get_lod_max_distance" default="0.0">

View file

@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8" ?>
<class name="Occluder3D" inherits="Resource" version="4.0">
<brief_description>
</brief_description>
<description>
</description>
<tutorials>
</tutorials>
<methods>
</methods>
<members>
<member name="indices" type="PackedInt32Array" setter="set_indices" getter="get_indices" default="PackedInt32Array( )">
</member>
<member name="vertices" type="PackedVector3Array" setter="set_vertices" getter="get_vertices" default="PackedVector3Array( )">
</member>
</members>
<constants>
</constants>
</class>
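
The class reference above ships with empty descriptions; in short, Occluder3D is a plain resource that stores occluder geometry as a vertex array plus a triangle index array. A minimal C++ sketch of filling one through the setters listed above (the quad layout and the helper name are illustrative assumptions, not part of the patch):

#include "scene/3d/occluder_instance_3d.h"

// Illustrative helper: build a single-quad occluder resource.
Ref<Occluder3D> make_quad_occluder(float p_half_size) {
    PackedVector3Array vertices;
    vertices.push_back(Vector3(-p_half_size, -p_half_size, 0));
    vertices.push_back(Vector3(p_half_size, -p_half_size, 0));
    vertices.push_back(Vector3(p_half_size, p_half_size, 0));
    vertices.push_back(Vector3(-p_half_size, p_half_size, 0));

    PackedInt32Array indices;
    const int quad_indices[6] = { 0, 1, 2, 0, 2, 3 }; // Two triangles covering the quad.
    for (int i = 0; i < 6; i++) {
        indices.push_back(quad_indices[i]);
    }

    Ref<Occluder3D> occluder = memnew(Occluder3D);
    occluder->set_vertices(vertices);
    occluder->set_indices(indices);
    return occluder;
}

A resource built this way would typically be assigned to an OccluderInstance3D, or produced automatically by the bake flow (saved as a .occ file) added further down.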

View file

@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8" ?>
<class name="OccluderInstance3D" inherits="Node3D" version="4.0">
<brief_description>
</brief_description>
<description>
</description>
<tutorials>
</tutorials>
<methods>
<method name="get_bake_mask_bit" qualifiers="const">
<return type="bool">
</return>
<argument index="0" name="layer" type="int">
</argument>
<description>
</description>
</method>
<method name="set_bake_mask_bit">
<return type="void">
</return>
<argument index="0" name="layer" type="int">
</argument>
<argument index="1" name="enabled" type="bool">
</argument>
<description>
</description>
</method>
</methods>
<members>
<member name="bake_mask" type="int" setter="set_bake_mask" getter="get_bake_mask" default="4294967295">
</member>
<member name="occluder" type="Occluder3D" setter="set_occluder" getter="get_occluder">
</member>
</members>
<constants>
</constants>
</class>
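
The bake-mask methods above are likewise undocumented in this patch. They follow the usual Godot one-bit-per-layer convention, with bake_mask defaulting to 4294967295 (all 32 bits set). A minimal standalone sketch of that convention, assuming 0-based layer indices; the real methods act on the node's bake_mask property and may clamp or validate the layer differently:

#include <cstdint>

// Illustrative sketch, not the engine implementation.
bool get_bake_mask_bit(uint32_t p_mask, int p_layer) {
    return (p_mask & (1u << p_layer)) != 0;
}

uint32_t set_bake_mask_bit(uint32_t p_mask, int p_layer, bool p_enabled) {
    if (p_enabled) {
        return p_mask | (1u << p_layer);
    }
    return p_mask & ~(1u << p_layer);
}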

View file

@ -1469,6 +1469,12 @@
</member>
<member name="rendering/mesh_lod/lod_change/threshold_pixels" type="float" setter="" getter="" default="1.0">
</member>
<member name="rendering/occlusion_culling/bvh_build_quality" type="int" setter="" getter="" default="2">
</member>
<member name="rendering/occlusion_culling/occlusion_rays_per_thread" type="int" setter="" getter="" default="512">
</member>
<member name="rendering/occlusion_culling/use_occlusion_culling" type="bool" setter="" getter="" default="false">
</member>
<member name="rendering/reflections/reflection_atlas/reflection_count" type="int" setter="" getter="" default="64">
Number of cubemaps to store in the reflection atlas. The number of [ReflectionProbe]s in a scene will be limited by this amount. A higher number requires more VRAM.
</member>

View file

@ -1999,6 +1999,24 @@
Sets the number of instances visible at a given time. If -1, all instances that have been allocated are drawn. Equivalent to [member MultiMesh.visible_instance_count].
</description>
</method>
<method name="occluder_create">
<return type="RID">
</return>
<description>
</description>
</method>
<method name="occluder_set_mesh">
<return type="void">
</return>
<argument index="0" name="arg0" type="RID">
</argument>
<argument index="1" name="arg1" type="PackedVector3Array">
</argument>
<argument index="2" name="arg2" type="PackedInt32Array">
</argument>
<description>
</description>
</method>
<method name="omni_light_create">
<return type="RID">
</return>
@ -2412,6 +2430,16 @@
The scenario is the 3D world that all the visual instances exist in.
</description>
</method>
<method name="scenario_set_camera_effects">
<return type="void">
</return>
<argument index="0" name="scenario" type="RID">
</argument>
<argument index="1" name="effects" type="RID">
</argument>
<description>
</description>
</method>
<method name="scenario_set_debug">
<return type="void">
</return>
@ -2897,6 +2925,22 @@
Sets the anti-aliasing mode. See [enum ViewportMSAA] for options.
</description>
</method>
<method name="viewport_set_occlusion_culling_build_quality">
<return type="void">
</return>
<argument index="0" name="quality" type="int" enum="RenderingServer.ViewportOcclusionCullingBuildQuality">
</argument>
<description>
</description>
</method>
<method name="viewport_set_occlusion_rays_per_thread">
<return type="void">
</return>
<argument index="0" name="rays_per_thread" type="int">
</argument>
<description>
</description>
</method>
<method name="viewport_set_parent_viewport">
<return type="void">
</return>
@ -3002,6 +3046,16 @@
<description>
</description>
</method>
<method name="viewport_set_use_occlusion_culling">
<return type="void">
</return>
<argument index="0" name="viewport" type="RID">
</argument>
<argument index="1" name="enable" type="bool">
</argument>
<description>
</description>
</method>
<method name="viewport_set_use_xr">
<return type="void">
</return>
@ -3454,6 +3508,8 @@
</constant>
<constant name="VIEWPORT_DEBUG_DRAW_GI_BUFFER" value="17" enum="ViewportDebugDraw">
</constant>
<constant name="VIEWPORT_DEBUG_DRAW_OCCLUDERS" value="23" enum="ViewportDebugDraw">
</constant>
<constant name="SKY_MODE_QUALITY" value="1" enum="SkyMode">
Uses high quality importance sampling to process the radiance map. In general, this results in much higher quality than [constant Sky.PROCESS_MODE_REALTIME] but takes much longer to generate. This should not be used if you plan on changing the sky at runtime. If you are finding that the reflection is not blurry enough and is showing sparkles or fireflies, try increasing [member ProjectSettings.rendering/reflections/sky_reflections/ggx_samples].
</constant>
@ -3606,6 +3662,12 @@
<constant name="SCENARIO_DEBUG_SHADELESS" value="3" enum="ScenarioDebugMode">
Draw all objects without shading. Equivalent to setting all objects shaders to [code]unshaded[/code].
</constant>
<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW" value="0" enum="ViewportOcclusionCullingBuildQuality">
</constant>
<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM" value="1" enum="ViewportOcclusionCullingBuildQuality">
</constant>
<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH" value="2" enum="ViewportOcclusionCullingBuildQuality">
</constant>
<constant name="INSTANCE_NONE" value="0" enum="InstanceType">
The instance does not have a type.
</constant>
@ -3638,7 +3700,9 @@
<constant name="INSTANCE_LIGHTMAP" value="10" enum="InstanceType">
The instance is a lightmap.
</constant>
<constant name="INSTANCE_MAX" value="11" enum="InstanceType">
<constant name="INSTANCE_OCCLUDER" value="11" enum="InstanceType">
</constant>
<constant name="INSTANCE_MAX" value="12" enum="InstanceType">
Represents the size of the [enum InstanceType] enum.
</constant>
<constant name="INSTANCE_GEOMETRY_MASK" value="30" enum="InstanceType">
@ -3653,7 +3717,9 @@
<constant name="INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE" value="2" enum="InstanceFlags">
When set, manually requests to draw geometry on next frame.
</constant>
<constant name="INSTANCE_FLAG_MAX" value="3" enum="InstanceFlags">
<constant name="INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING" value="3" enum="InstanceFlags">
</constant>
<constant name="INSTANCE_FLAG_MAX" value="4" enum="InstanceFlags">
Represents the size of the [enum InstanceFlags] enum.
</constant>
<constant name="SHADOW_CASTING_SETTING_OFF" value="0" enum="ShadowCastingSetting">
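
The new RenderingServer entry points above have no descriptions yet. A minimal sketch of driving them directly, following only the signatures documented in this file; the helper name, the 512-ray budget, and the idea of wiring everything up in one call are illustrative assumptions:

#include "servers/rendering_server.h"

// Illustrative only: exercise the new occlusion-culling server API.
void setup_occlusion_culling(RID p_viewport, RID p_instance, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) {
    RenderingServer *rs = RenderingServer::get_singleton();

    // Create an occluder and upload its geometry (occluder_create / occluder_set_mesh).
    RID occluder = rs->occluder_create();
    rs->occluder_set_mesh(occluder, p_vertices, p_indices);
    // How the occluder RID is attached to a scenario instance is left out of this sketch.

    // Enable occlusion culling on the viewport and tune the global ray budget / BVH build quality.
    rs->viewport_set_use_occlusion_culling(p_viewport, true);
    rs->viewport_set_occlusion_rays_per_thread(512);
    rs->viewport_set_occlusion_culling_build_quality(RenderingServer::VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM);

    // Exempt a specific instance from occlusion culling, as the editor gizmo code does below.
    rs->instance_geometry_set_flag(p_instance, RenderingServer::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
}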

View file

@ -267,6 +267,8 @@
</member>
<member name="use_debanding" type="bool" setter="set_use_debanding" getter="is_using_debanding" default="false">
</member>
<member name="use_occlusion_culling" type="bool" setter="set_use_occlusion_culling" getter="is_using_occlusion_culling" default="false">
</member>
<member name="world_2d" type="World2D" setter="set_world_2d" getter="get_world_2d">
The custom [World2D] which can be used as 2D environment source.
</member>
@ -419,6 +421,8 @@
</constant>
<constant name="DEBUG_DRAW_CLUSTER_REFLECTION_PROBES" value="22" enum="DebugDraw">
</constant>
<constant name="DEBUG_DRAW_OCCLUDERS" value="23" enum="DebugDraw">
</constant>
<constant name="DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_NEAREST" value="0" enum="DefaultCanvasItemTextureFilter">
The texture filter reads from the nearest pixel only. The simplest and fastest method of filtering, but the texture will look pixelized.
</constant>

View file

@ -548,6 +548,12 @@ public:
void lightmap_set_probe_capture_update_speed(float p_speed) override {}
float lightmap_get_probe_capture_update_speed() const override { return 0; }
/* OCCLUDER */
RID occluder_allocate() override { return RID(); }
void occluder_initialize(RID p_rid) override {}
void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) {}
/* PARTICLES */
RID particles_allocate() override { return RID(); }

View file

@ -143,6 +143,7 @@
#include "editor/plugins/multimesh_editor_plugin.h"
#include "editor/plugins/navigation_polygon_editor_plugin.h"
#include "editor/plugins/node_3d_editor_plugin.h"
#include "editor/plugins/occluder_instance_3d_editor_plugin.h"
#include "editor/plugins/ot_features_plugin.h"
#include "editor/plugins/packed_scene_translation_parser_plugin.h"
#include "editor/plugins/path_2d_editor_plugin.h"
@ -6800,6 +6801,7 @@ EditorNode::EditorNode() {
add_editor_plugin(memnew(TextureRegionEditorPlugin(this)));
add_editor_plugin(memnew(GIProbeEditorPlugin(this)));
add_editor_plugin(memnew(BakedLightmapEditorPlugin(this)));
add_editor_plugin(memnew(OccluderInstance3DEditorPlugin(this)));
add_editor_plugin(memnew(Path2DEditorPlugin(this)));
add_editor_plugin(memnew(Path3DEditorPlugin(this)));
add_editor_plugin(memnew(Line2DEditorPlugin(this)));

View file

@ -47,6 +47,7 @@
#include "scene/3d/listener_3d.h"
#include "scene/3d/mesh_instance_3d.h"
#include "scene/3d/navigation_region_3d.h"
#include "scene/3d/occluder_instance_3d.h"
#include "scene/3d/physics_joint_3d.h"
#include "scene/3d/position_3d.h"
#include "scene/3d/ray_cast_3d.h"
@ -176,6 +177,7 @@ void EditorNode3DGizmo::Instance::create_instance(Node3D *p_base, bool p_hidden)
RS::get_singleton()->instance_geometry_set_cast_shadows_setting(instance, RS::SHADOW_CASTING_SETTING_OFF);
int layer = p_hidden ? 0 : 1 << Node3DEditorViewport::GIZMO_EDIT_LAYER;
RS::get_singleton()->instance_set_layer_mask(instance, layer); //gizmos are 26
RS::get_singleton()->instance_geometry_set_flag(instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
}
void EditorNode3DGizmo::add_mesh(const Ref<ArrayMesh> &p_mesh, bool p_billboard, const Ref<SkinReference> &p_skin_reference, const Ref<Material> &p_material) {
@ -1464,6 +1466,44 @@ void MeshInstance3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
}
/////
OccluderInstance3DGizmoPlugin::OccluderInstance3DGizmoPlugin() {
create_material("line_material", EDITOR_DEF("editors/3d_gizmos/gizmo_colors/occluder", Color(0.8, 0.5, 1)));
}
bool OccluderInstance3DGizmoPlugin::has_gizmo(Node3D *p_spatial) {
return Object::cast_to<OccluderInstance3D>(p_spatial) != nullptr;
}
String OccluderInstance3DGizmoPlugin::get_gizmo_name() const {
return "OccluderInstance3D";
}
int OccluderInstance3DGizmoPlugin::get_priority() const {
return -1;
}
void OccluderInstance3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
OccluderInstance3D *occluder_instance = Object::cast_to<OccluderInstance3D>(p_gizmo->get_spatial_node());
p_gizmo->clear();
Ref<Occluder3D> o = occluder_instance->get_occluder();
if (!o.is_valid()) {
return;
}
Vector<Vector3> lines = o->get_debug_lines();
if (!lines.is_empty()) {
Ref<Material> material = get_material("line_material", p_gizmo);
p_gizmo->add_lines(lines, material);
p_gizmo->add_collision_segments(lines);
}
}
/////
Sprite3DGizmoPlugin::Sprite3DGizmoPlugin() {
}

View file

@ -100,6 +100,18 @@ public:
MeshInstance3DGizmoPlugin();
};
class OccluderInstance3DGizmoPlugin : public EditorNode3DGizmoPlugin {
GDCLASS(OccluderInstance3DGizmoPlugin, EditorNode3DGizmoPlugin);
public:
bool has_gizmo(Node3D *p_spatial) override;
String get_gizmo_name() const override;
int get_priority() const override;
void redraw(EditorNode3DGizmo *p_gizmo) override;
OccluderInstance3DGizmoPlugin();
};
class Sprite3DGizmoPlugin : public EditorNode3DGizmoPlugin {
GDCLASS(Sprite3DGizmoPlugin, EditorNode3DGizmoPlugin);

View file

@ -2368,6 +2368,9 @@ void Node3DEditorViewport::_project_settings_changed() {
viewport->set_screen_space_aa(Viewport::ScreenSpaceAA(ssaa_mode));
const bool use_debanding = GLOBAL_GET("rendering/anti_aliasing/quality/use_debanding");
viewport->set_use_debanding(use_debanding);
const bool use_occlusion_culling = GLOBAL_GET("rendering/occlusion_culling/use_occlusion_culling");
viewport->set_use_occlusion_culling(use_occlusion_culling);
}
void Node3DEditorViewport::_notification(int p_what) {
@ -3071,7 +3074,8 @@ void Node3DEditorViewport::_menu_option(int p_option) {
case VIEW_DISPLAY_DEBUG_CLUSTER_OMNI_LIGHTS:
case VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS:
case VIEW_DISPLAY_DEBUG_CLUSTER_DECALS:
case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES: {
case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES:
case VIEW_DISPLAY_DEBUG_OCCLUDERS: {
static const int display_options[] = {
VIEW_DISPLAY_NORMAL,
VIEW_DISPLAY_WIREFRAME,
@ -3097,6 +3101,7 @@ void Node3DEditorViewport::_menu_option(int p_option) {
VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS,
VIEW_DISPLAY_DEBUG_CLUSTER_DECALS,
VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES,
VIEW_DISPLAY_DEBUG_OCCLUDERS,
VIEW_MAX
};
static const Viewport::DebugDraw debug_draw_modes[] = {
@ -3124,6 +3129,7 @@ void Node3DEditorViewport::_menu_option(int p_option) {
Viewport::DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
Viewport::DEBUG_DRAW_CLUSTER_DECALS,
Viewport::DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
Viewport::DEBUG_DRAW_OCCLUDERS,
};
int idx = 0;
@ -3173,6 +3179,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
RS::get_singleton()->instance_set_visible(move_gizmo_instance[i], false);
RS::get_singleton()->instance_geometry_set_cast_shadows_setting(move_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(move_gizmo_instance[i], layer);
RS::get_singleton()->instance_geometry_set_flag(move_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
move_plane_gizmo_instance[i] = RS::get_singleton()->instance_create();
RS::get_singleton()->instance_set_base(move_plane_gizmo_instance[i], spatial_editor->get_move_plane_gizmo(i)->get_rid());
@ -3180,6 +3187,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
RS::get_singleton()->instance_set_visible(move_plane_gizmo_instance[i], false);
RS::get_singleton()->instance_geometry_set_cast_shadows_setting(move_plane_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(move_plane_gizmo_instance[i], layer);
RS::get_singleton()->instance_geometry_set_flag(move_plane_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
rotate_gizmo_instance[i] = RS::get_singleton()->instance_create();
RS::get_singleton()->instance_set_base(rotate_gizmo_instance[i], spatial_editor->get_rotate_gizmo(i)->get_rid());
@ -3187,6 +3195,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
RS::get_singleton()->instance_set_visible(rotate_gizmo_instance[i], false);
RS::get_singleton()->instance_geometry_set_cast_shadows_setting(rotate_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(rotate_gizmo_instance[i], layer);
RS::get_singleton()->instance_geometry_set_flag(rotate_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
scale_gizmo_instance[i] = RS::get_singleton()->instance_create();
RS::get_singleton()->instance_set_base(scale_gizmo_instance[i], spatial_editor->get_scale_gizmo(i)->get_rid());
@ -3194,6 +3203,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
RS::get_singleton()->instance_set_visible(scale_gizmo_instance[i], false);
RS::get_singleton()->instance_geometry_set_cast_shadows_setting(scale_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(scale_gizmo_instance[i], layer);
RS::get_singleton()->instance_geometry_set_flag(scale_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
scale_plane_gizmo_instance[i] = RS::get_singleton()->instance_create();
RS::get_singleton()->instance_set_base(scale_plane_gizmo_instance[i], spatial_editor->get_scale_plane_gizmo(i)->get_rid());
@ -3201,6 +3211,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
RS::get_singleton()->instance_set_visible(scale_plane_gizmo_instance[i], false);
RS::get_singleton()->instance_geometry_set_cast_shadows_setting(scale_plane_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(scale_plane_gizmo_instance[i], layer);
RS::get_singleton()->instance_geometry_set_flag(scale_plane_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
}
// Rotation white outline
@ -3210,6 +3221,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
RS::get_singleton()->instance_set_visible(rotate_gizmo_instance[3], false);
RS::get_singleton()->instance_geometry_set_cast_shadows_setting(rotate_gizmo_instance[3], RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(rotate_gizmo_instance[3], layer);
RS::get_singleton()->instance_geometry_set_flag(rotate_gizmo_instance[3], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
}
void Node3DEditorViewport::_finish_gizmo_instances() {
@ -4043,6 +4055,7 @@ Node3DEditorViewport::Node3DEditorViewport(Node3DEditor *p_spatial_editor, Edito
display_submenu->add_radio_check_item(TTR("Spot Light Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS);
display_submenu->add_radio_check_item(TTR("Decal Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_DECALS);
display_submenu->add_radio_check_item(TTR("Reflection Probe Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES);
display_submenu->add_radio_check_item(TTR("Occlusion Culling Buffer"), VIEW_DISPLAY_DEBUG_OCCLUDERS);
display_submenu->set_name("display_advanced");
view_menu->get_popup()->add_submenu_item(TTR("Display Advanced..."), "display_advanced", VIEW_DISPLAY_ADVANCED);
@ -4625,6 +4638,7 @@ Object *Node3DEditor::_get_editor_data(Object *p_what) {
si->sbox_instance,
RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(si->sbox_instance, 1 << Node3DEditorViewport::MISC_TOOL_LAYER);
RS::get_singleton()->instance_geometry_set_flag(si->sbox_instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
si->sbox_instance_xray = RenderingServer::get_singleton()->instance_create2(
selection_box_xray->get_rid(),
sp->get_world_3d()->get_scenario());
@ -4632,6 +4646,7 @@ Object *Node3DEditor::_get_editor_data(Object *p_what) {
si->sbox_instance_xray,
RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(si->sbox_instance_xray, 1 << Node3DEditorViewport::MISC_TOOL_LAYER);
RS::get_singleton()->instance_geometry_set_flag(si->sbox_instance_xray, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
return si;
}
@ -5403,6 +5418,7 @@ void Node3DEditor::_init_indicators() {
origin_instance = RenderingServer::get_singleton()->instance_create2(origin, get_tree()->get_root()->get_world_3d()->get_scenario());
RS::get_singleton()->instance_set_layer_mask(origin_instance, 1 << Node3DEditorViewport::GIZMO_GRID_LAYER);
RS::get_singleton()->instance_geometry_set_flag(origin_instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
RenderingServer::get_singleton()->instance_geometry_set_cast_shadows_setting(origin_instance, RS::SHADOW_CASTING_SETTING_OFF);
}
@ -5964,6 +5980,7 @@ void Node3DEditor::_init_grid() {
RenderingServer::get_singleton()->instance_set_visible(grid_instance[c], grid_visible[a]);
RenderingServer::get_singleton()->instance_geometry_set_cast_shadows_setting(grid_instance[c], RS::SHADOW_CASTING_SETTING_OFF);
RS::get_singleton()->instance_set_layer_mask(grid_instance[c], 1 << Node3DEditorViewport::GIZMO_GRID_LAYER);
RS::get_singleton()->instance_geometry_set_flag(grid_instance[c], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
}
}
@ -6465,6 +6482,7 @@ void Node3DEditor::_register_all_gizmos() {
add_gizmo_plugin(Ref<Light3DGizmoPlugin>(memnew(Light3DGizmoPlugin)));
add_gizmo_plugin(Ref<AudioStreamPlayer3DGizmoPlugin>(memnew(AudioStreamPlayer3DGizmoPlugin)));
add_gizmo_plugin(Ref<MeshInstance3DGizmoPlugin>(memnew(MeshInstance3DGizmoPlugin)));
add_gizmo_plugin(Ref<OccluderInstance3DGizmoPlugin>(memnew(OccluderInstance3DGizmoPlugin)));
add_gizmo_plugin(Ref<SoftBody3DGizmoPlugin>(memnew(SoftBody3DGizmoPlugin)));
add_gizmo_plugin(Ref<Sprite3DGizmoPlugin>(memnew(Sprite3DGizmoPlugin)));
add_gizmo_plugin(Ref<Skeleton3DGizmoPlugin>(memnew(Skeleton3DGizmoPlugin)));
@ -7340,6 +7358,7 @@ void EditorNode3DGizmoPlugin::create_material(const String &p_name, const Color
material->set_shading_mode(StandardMaterial3D::SHADING_MODE_UNSHADED);
material->set_transparency(StandardMaterial3D::TRANSPARENCY_ALPHA);
material->set_render_priority(StandardMaterial3D::RENDER_PRIORITY_MIN + 1);
material->set_cull_mode(StandardMaterial3D::CULL_DISABLED);
if (p_use_vertex_color) {
material->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);

View file

@ -221,6 +221,7 @@ class Node3DEditorViewport : public Control {
VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS,
VIEW_DISPLAY_DEBUG_CLUSTER_DECALS,
VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES,
VIEW_DISPLAY_DEBUG_OCCLUDERS,
VIEW_LOCK_ROTATION,
VIEW_CINEMATIC_PREVIEW,

View file

@ -0,0 +1,117 @@
/*************************************************************************/
/* occluder_instance_3d_editor_plugin.cpp */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#include "occluder_instance_3d_editor_plugin.h"
void OccluderInstance3DEditorPlugin::_bake_select_file(const String &p_file) {
if (occluder_instance) {
OccluderInstance3D::BakeError err;
if (get_tree()->get_edited_scene_root() && get_tree()->get_edited_scene_root() == occluder_instance) {
err = occluder_instance->bake(occluder_instance, p_file);
} else {
err = occluder_instance->bake(occluder_instance->get_parent(), p_file);
}
switch (err) {
case OccluderInstance3D::BAKE_ERROR_NO_SAVE_PATH: {
String scene_path = occluder_instance->get_filename();
if (scene_path == String()) {
scene_path = occluder_instance->get_owner()->get_filename();
}
if (scene_path == String()) {
EditorNode::get_singleton()->show_warning(TTR("Can't determine a save path for the occluder.\nSave your scene and try again."));
break;
}
scene_path = scene_path.get_basename() + ".occ";
file_dialog->set_current_path(scene_path);
file_dialog->popup_file_dialog();
} break;
case OccluderInstance3D::BAKE_ERROR_NO_MESHES: {
EditorNode::get_singleton()->show_warning(TTR("No meshes to bake."));
break;
}
default: {
}
}
}
}
void OccluderInstance3DEditorPlugin::_bake() {
_bake_select_file("");
}
void OccluderInstance3DEditorPlugin::edit(Object *p_object) {
OccluderInstance3D *s = Object::cast_to<OccluderInstance3D>(p_object);
if (!s) {
return;
}
occluder_instance = s;
}
bool OccluderInstance3DEditorPlugin::handles(Object *p_object) const {
return p_object->is_class("OccluderInstance3D");
}
void OccluderInstance3DEditorPlugin::make_visible(bool p_visible) {
if (p_visible) {
bake->show();
} else {
bake->hide();
}
}
void OccluderInstance3DEditorPlugin::_bind_methods() {
ClassDB::bind_method("_bake", &OccluderInstance3DEditorPlugin::_bake);
}
OccluderInstance3DEditorPlugin::OccluderInstance3DEditorPlugin(EditorNode *p_node) {
editor = p_node;
bake = memnew(Button);
bake->set_flat(true);
bake->set_icon(editor->get_gui_base()->get_theme_icon("Bake", "EditorIcons"));
bake->set_text(TTR("Bake Occluders"));
bake->hide();
bake->connect("pressed", Callable(this, "_bake"));
add_control_to_container(CONTAINER_SPATIAL_EDITOR_MENU, bake);
occluder_instance = nullptr;
file_dialog = memnew(EditorFileDialog);
file_dialog->set_file_mode(EditorFileDialog::FILE_MODE_SAVE_FILE);
file_dialog->add_filter("*.occ ; Occluder3D");
file_dialog->set_title(TTR("Select occluder bake file:"));
file_dialog->connect("file_selected", callable_mp(this, &OccluderInstance3DEditorPlugin::_bake_select_file));
bake->add_child(file_dialog);
}
OccluderInstance3DEditorPlugin::~OccluderInstance3DEditorPlugin() {
}

View file

@ -0,0 +1,66 @@
/*************************************************************************/
/* occluder_instance_3d_editor_plugin.h */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#ifndef OCCLUDER_INSTANCE_3D_EDITOR_PLUGIN_H
#define OCCLUDER_INSTANCE_3D_EDITOR_PLUGIN_H
#include "editor/editor_node.h"
#include "editor/editor_plugin.h"
#include "scene/3d/occluder_instance_3d.h"
#include "scene/resources/material.h"
class OccluderInstance3DEditorPlugin : public EditorPlugin {
GDCLASS(OccluderInstance3DEditorPlugin, EditorPlugin);
OccluderInstance3D *occluder_instance;
Button *bake;
EditorNode *editor;
EditorFileDialog *file_dialog;
void _bake_select_file(const String &p_file);
void _bake();
protected:
static void _bind_methods();
public:
virtual String get_name() const override { return "OccluderInstance3D"; }
bool has_main_screen() const override { return false; }
virtual void edit(Object *p_object) override;
virtual bool handles(Object *p_object) const override;
virtual void make_visible(bool p_visible) override;
OccluderInstance3DEditorPlugin(EditorNode *p_node);
~OccluderInstance3DEditorPlugin();
};
#endif

modules/raycast/SCsub (new file, 86 lines added)
View file

@ -0,0 +1,86 @@
#!/usr/bin/env python
Import("env")
Import("env_modules")
embree_src = [
"common/sys/sysinfo.cpp",
"common/sys/alloc.cpp",
"common/sys/filename.cpp",
"common/sys/library.cpp",
"common/sys/thread.cpp",
"common/sys/string.cpp",
"common/sys/regression.cpp",
"common/sys/mutex.cpp",
"common/sys/condition.cpp",
"common/sys/barrier.cpp",
"common/math/constants.cpp",
"common/simd/sse.cpp",
"common/lexers/stringstream.cpp",
"common/lexers/tokenstream.cpp",
"common/tasking/taskschedulerinternal.cpp",
"common/algorithms/parallel_for.cpp",
"common/algorithms/parallel_reduce.cpp",
"common/algorithms/parallel_prefix_sum.cpp",
"common/algorithms/parallel_for_for.cpp",
"common/algorithms/parallel_for_for_prefix_sum.cpp",
"common/algorithms/parallel_partition.cpp",
"common/algorithms/parallel_sort.cpp",
"common/algorithms/parallel_set.cpp",
"common/algorithms/parallel_map.cpp",
"common/algorithms/parallel_filter.cpp",
"kernels/common/device.cpp",
"kernels/common/stat.cpp",
"kernels/common/acceln.cpp",
"kernels/common/accelset.cpp",
"kernels/common/state.cpp",
"kernels/common/rtcore.cpp",
"kernels/common/rtcore_builder.cpp",
"kernels/common/scene.cpp",
"kernels/common/alloc.cpp",
"kernels/common/geometry.cpp",
"kernels/common/scene_triangle_mesh.cpp",
"kernels/geometry/primitive4.cpp",
"kernels/builders/primrefgen.cpp",
"kernels/bvh/bvh.cpp",
"kernels/bvh/bvh_statistics.cpp",
"kernels/bvh/bvh4_factory.cpp",
"kernels/bvh/bvh8_factory.cpp",
"kernels/bvh/bvh_collider.cpp",
"kernels/bvh/bvh_rotate.cpp",
"kernels/bvh/bvh_refit.cpp",
"kernels/bvh/bvh_builder.cpp",
"kernels/bvh/bvh_builder_morton.cpp",
"kernels/bvh/bvh_builder_sah.cpp",
"kernels/bvh/bvh_builder_sah_spatial.cpp",
"kernels/bvh/bvh_builder_sah_mb.cpp",
"kernels/bvh/bvh_builder_twolevel.cpp",
"kernels/bvh/bvh_intersector1_bvh4.cpp",
]
embree_dir = "#thirdparty/embree-aarch64/"
env_embree = env_modules.Clone()
embree_sources = [embree_dir + file for file in embree_src]
env_embree.Prepend(CPPPATH=[embree_dir, embree_dir + "include"])
env_embree.Append(CPPFLAGS=["-DEMBREE_TARGET_SSE2", "-DEMBREE_LOWEST_ISA", "-DTASKING_INTERNAL", "-DNDEBUG"])
if not env_embree.msvc:
env_embree.Append(CPPFLAGS=["-msse2", "-mxsave"])
if env["platform"] == "windows":
env_embree.Append(CPPFLAGS=["-mstackrealign"])
if env["platform"] == "windows":
if env.msvc:
env.Append(LINKFLAGS=["psapi.lib"])
env_embree.Append(CPPFLAGS=["-D__SSE2__", "-D__SSE__"])
else:
env.Append(LIBS=["psapi"])
env_embree.disable_warnings()
env_embree.add_source_files(env.modules_sources, embree_sources)
env_raycast = env_modules.Clone()
env_raycast.Prepend(CPPPATH=[embree_dir, embree_dir + "include", embree_dir + "common"])
env_raycast.add_source_files(env.modules_sources, "*.cpp")

modules/raycast/config.py (new file, 12 lines added)
View file

@ -0,0 +1,12 @@
def can_build(env, platform):
if platform == "android":
return env["android_arch"] in ["arm64v8", "x86", "x86_64"]
if platform == "javascript":
return False # No SIMD support yet
return True
def configure(env):
pass

View file

@ -0,0 +1,260 @@
import glob, os, shutil, subprocess, re
include_dirs = [
"common/tasking",
"kernels/bvh",
"kernels/builders",
"common/sys",
"kernels",
"kernels/common",
"common/math",
"common/algorithms",
"common/lexers",
"common/simd",
"include/embree3",
"kernels/subdiv",
"kernels/geometry",
]
cpp_files = [
"common/sys/sysinfo.cpp",
"common/sys/alloc.cpp",
"common/sys/filename.cpp",
"common/sys/library.cpp",
"common/sys/thread.cpp",
"common/sys/string.cpp",
"common/sys/regression.cpp",
"common/sys/mutex.cpp",
"common/sys/condition.cpp",
"common/sys/barrier.cpp",
"common/math/constants.cpp",
"common/simd/sse.cpp",
"common/lexers/stringstream.cpp",
"common/lexers/tokenstream.cpp",
"common/tasking/taskschedulerinternal.cpp",
"common/algorithms/parallel_for.cpp",
"common/algorithms/parallel_reduce.cpp",
"common/algorithms/parallel_prefix_sum.cpp",
"common/algorithms/parallel_for_for.cpp",
"common/algorithms/parallel_for_for_prefix_sum.cpp",
"common/algorithms/parallel_partition.cpp",
"common/algorithms/parallel_sort.cpp",
"common/algorithms/parallel_set.cpp",
"common/algorithms/parallel_map.cpp",
"common/algorithms/parallel_filter.cpp",
"kernels/common/device.cpp",
"kernels/common/stat.cpp",
"kernels/common/acceln.cpp",
"kernels/common/accelset.cpp",
"kernels/common/state.cpp",
"kernels/common/rtcore.cpp",
"kernels/common/rtcore_builder.cpp",
"kernels/common/scene.cpp",
"kernels/common/alloc.cpp",
"kernels/common/geometry.cpp",
"kernels/common/scene_triangle_mesh.cpp",
"kernels/geometry/primitive4.cpp",
"kernels/builders/primrefgen.cpp",
"kernels/bvh/bvh.cpp",
"kernels/bvh/bvh_statistics.cpp",
"kernels/bvh/bvh4_factory.cpp",
"kernels/bvh/bvh8_factory.cpp",
"kernels/bvh/bvh_collider.cpp",
"kernels/bvh/bvh_rotate.cpp",
"kernels/bvh/bvh_refit.cpp",
"kernels/bvh/bvh_builder.cpp",
"kernels/bvh/bvh_builder_morton.cpp",
"kernels/bvh/bvh_builder_sah.cpp",
"kernels/bvh/bvh_builder_sah_spatial.cpp",
"kernels/bvh/bvh_builder_sah_mb.cpp",
"kernels/bvh/bvh_builder_twolevel.cpp",
"kernels/bvh/bvh_intersector1.cpp",
"kernels/bvh/bvh_intersector1_bvh4.cpp",
]
os.chdir("../../thirdparty")
dir_name = "embree-aarch64"
if os.path.exists(dir_name):
shutil.rmtree(dir_name)
subprocess.run(["git", "clone", "https://github.com/lighttransport/embree-aarch64.git", "embree-tmp"])
os.chdir("embree-tmp")
commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip()
all_files = set(cpp_files)
dest_dir = os.path.join("..", dir_name)
for include_dir in include_dirs:
headers = glob.iglob(os.path.join(include_dir, "*.h"))
all_files.update(headers)
for f in all_files:
d = os.path.join(dest_dir, os.path.dirname(f))
if not os.path.exists(d):
os.makedirs(d)
shutil.copy2(f, d)
with open(os.path.join(dest_dir, "kernels/hash.h"), "w") as hash_file:
hash_file.write(
f"""
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#define RTC_HASH "{commit_hash}"
"""
)
with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file:
config_file.write(
"""
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
/* #undef EMBREE_RAY_MASK */
/* #undef EMBREE_STAT_COUNTERS */
/* #undef EMBREE_BACKFACE_CULLING */
/* #undef EMBREE_BACKFACE_CULLING_CURVES */
#define EMBREE_FILTER_FUNCTION
/* #undef EMBREE_IGNORE_INVALID_RAYS */
#define EMBREE_GEOMETRY_TRIANGLE
/* #undef EMBREE_GEOMETRY_QUAD */
/* #undef EMBREE_GEOMETRY_CURVE */
/* #undef EMBREE_GEOMETRY_SUBDIVISION */
/* #undef EMBREE_GEOMETRY_USER */
/* #undef EMBREE_GEOMETRY_INSTANCE */
/* #undef EMBREE_GEOMETRY_GRID */
/* #undef EMBREE_GEOMETRY_POINT */
/* #undef EMBREE_RAY_PACKETS */
/* #undef EMBREE_COMPACT_POLYS */
#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
#if defined(EMBREE_GEOMETRY_TRIANGLE)
#define IF_ENABLED_TRIS(x) x
#else
#define IF_ENABLED_TRIS(x)
#endif
#if defined(EMBREE_GEOMETRY_QUAD)
#define IF_ENABLED_QUADS(x) x
#else
#define IF_ENABLED_QUADS(x)
#endif
#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
#define IF_ENABLED_CURVES_OR_POINTS(x) x
#else
#define IF_ENABLED_CURVES_OR_POINTS(x)
#endif
#if defined(EMBREE_GEOMETRY_CURVE)
#define IF_ENABLED_CURVES(x) x
#else
#define IF_ENABLED_CURVES(x)
#endif
#if defined(EMBREE_GEOMETRY_POINT)
#define IF_ENABLED_POINTS(x) x
#else
#define IF_ENABLED_POINTS(x)
#endif
#if defined(EMBREE_GEOMETRY_SUBDIVISION)
#define IF_ENABLED_SUBDIV(x) x
#else
#define IF_ENABLED_SUBDIV(x)
#endif
#if defined(EMBREE_GEOMETRY_USER)
#define IF_ENABLED_USER(x) x
#else
#define IF_ENABLED_USER(x)
#endif
#if defined(EMBREE_GEOMETRY_INSTANCE)
#define IF_ENABLED_INSTANCE(x) x
#else
#define IF_ENABLED_INSTANCE(x)
#endif
#if defined(EMBREE_GEOMETRY_GRID)
#define IF_ENABLED_GRIDS(x) x
#else
#define IF_ENABLED_GRIDS(x)
#endif
"""
)
with open("CMakeLists.txt", "r") as cmake_file:
cmake_content = cmake_file.read()
major_version = int(re.compile(r"EMBREE_VERSION_MAJOR\s(\d+)").findall(cmake_content)[0])
minor_version = int(re.compile(r"EMBREE_VERSION_MINOR\s(\d+)").findall(cmake_content)[0])
patch_version = int(re.compile(r"EMBREE_VERSION_PATCH\s(\d+)").findall(cmake_content)[0])
with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as config_file:
config_file.write(
f"""
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#define RTC_VERSION_MAJOR {major_version}
#define RTC_VERSION_MINOR {minor_version}
#define RTC_VERSION_PATCH {patch_version}
#define RTC_VERSION {major_version}{minor_version:02d}{patch_version:02d}
#define RTC_VERSION_STRING "{major_version}.{minor_version}.{patch_version}"
#define RTC_MAX_INSTANCE_LEVEL_COUNT 1
#define EMBREE_MIN_WIDTH 0
#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
#define EMBREE_STATIC_LIB
/* #undef EMBREE_API_NAMESPACE */
#if defined(EMBREE_API_NAMESPACE)
# define RTC_NAMESPACE
# define RTC_NAMESPACE_BEGIN namespace {{
# define RTC_NAMESPACE_END }}
# define RTC_NAMESPACE_USE using namespace ;
# define RTC_API_EXTERN_C
# undef EMBREE_API_NAMESPACE
#else
# define RTC_NAMESPACE_BEGIN
# define RTC_NAMESPACE_END
# define RTC_NAMESPACE_USE
# if defined(__cplusplus)
# define RTC_API_EXTERN_C extern "C"
# else
# define RTC_API_EXTERN_C
# endif
#endif
#if defined(ISPC)
# define RTC_API_IMPORT extern "C" unmasked
# define RTC_API_EXPORT extern "C" unmasked
#elif defined(EMBREE_STATIC_LIB)
# define RTC_API_IMPORT RTC_API_EXTERN_C
# define RTC_API_EXPORT RTC_API_EXTERN_C
#elif defined(_WIN32)
# define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport)
# define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport)
#else
# define RTC_API_IMPORT RTC_API_EXTERN_C
# define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default")))
#endif
#if defined(RTC_EXPORT_API)
# define RTC_API RTC_API_EXPORT
#else
# define RTC_API RTC_API_IMPORT
#endif
"""
)
os.chdir("..")
shutil.rmtree("embree-tmp")

View file

@ -0,0 +1,202 @@
/*************************************************************************/
/* lightmap_raycaster.cpp */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#ifdef TOOLS_ENABLED
#include "lightmap_raycaster.h"
// From Embree.
#include <math/vec2.h>
#include <math/vec3.h>
#include <pmmintrin.h>
using namespace embree;
LightmapRaycaster *LightmapRaycasterEmbree::create_embree_raycaster() {
return memnew(LightmapRaycasterEmbree);
}
void LightmapRaycasterEmbree::make_default_raycaster() {
create_function = create_embree_raycaster;
}
void LightmapRaycasterEmbree::filter_function(const struct RTCFilterFunctionNArguments *p_args) {
RTCHit *hit = (RTCHit *)p_args->hit;
unsigned int geomID = hit->geomID;
float u = hit->u;
float v = hit->v;
LightmapRaycasterEmbree *scene = (LightmapRaycasterEmbree *)p_args->geometryUserPtr;
RTCGeometry geom = rtcGetGeometry(scene->embree_scene, geomID);
rtcInterpolate0(geom, hit->primID, hit->u, hit->v, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, &hit->u, 2);
if (scene->alpha_textures.has(geomID)) {
const AlphaTextureData &alpha_texture = scene->alpha_textures[geomID];
if (alpha_texture.sample(hit->u, hit->v) < 128) {
p_args->valid[0] = 0;
return;
}
}
rtcInterpolate0(geom, hit->primID, u, v, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, &hit->Ng_x, 3);
}
bool LightmapRaycasterEmbree::intersect(Ray &r_ray) {
RTCIntersectContext context;
rtcInitIntersectContext(&context);
rtcIntersect1(embree_scene, &context, (RTCRayHit *)&r_ray);
return r_ray.geomID != RTC_INVALID_GEOMETRY_ID;
}
void LightmapRaycasterEmbree::intersect(Vector<Ray> &r_rays) {
Ray *rays = r_rays.ptrw();
for (int i = 0; i < r_rays.size(); ++i) {
intersect(rays[i]);
}
}
void LightmapRaycasterEmbree::set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) {
if (p_alpha_texture.is_valid() && p_alpha_texture->get_size() != Vector2i()) {
AlphaTextureData tex;
tex.size = p_alpha_texture->get_size();
tex.data = p_alpha_texture->get_data();
alpha_textures.insert(p_id, tex);
}
}
float blerp(float c00, float c10, float c01, float c11, float tx, float ty) {
return Math::lerp(Math::lerp(c00, c10, tx), Math::lerp(c01, c11, tx), ty);
}
uint8_t LightmapRaycasterEmbree::AlphaTextureData::sample(float u, float v) const {
float x = u * size.x;
float y = v * size.y;
int xi = (int)x;
int yi = (int)y;
uint8_t texels[4];
for (int i = 0; i < 4; ++i) {
int sample_x = CLAMP(xi + i % 2, 0, size.x - 1);
int sample_y = CLAMP(yi + i / 2, 0, size.y - 1);
texels[i] = data[sample_y * size.x + sample_x];
}
return Math::round(blerp(texels[0], texels[1], texels[2], texels[3], x - xi, y - yi));
}
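// Worked example (illustrative, not from the patch): with texels { 0, 255, 0, 255 } and
// fractional offsets x - xi = 0.25, y - yi = 0.5, both row lerps give 63.75, so
// sample() returns Math::round(63.75) = 64.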
void LightmapRaycasterEmbree::add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) {
RTCGeometry embree_mesh = rtcNewGeometry(embree_device, RTC_GEOMETRY_TYPE_TRIANGLE);
rtcSetGeometryVertexAttributeCount(embree_mesh, 2);
int vertex_count = p_vertices.size();
ERR_FAIL_COND(vertex_count % 3 != 0);
ERR_FAIL_COND(vertex_count != p_uv2s.size());
Vec3fa *embree_vertices = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count);
Vec2fa *embree_light_uvs = (Vec2fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, RTC_FORMAT_FLOAT2, sizeof(Vec2fa), vertex_count);
uint32_t *embree_triangles = (uint32_t *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, sizeof(uint32_t) * 3, vertex_count / 3);
Vec3fa *embree_normals = nullptr;
if (!p_normals.is_empty()) {
embree_normals = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count);
}
for (int i = 0; i < vertex_count; i++) {
embree_vertices[i] = Vec3fa(p_vertices[i].x, p_vertices[i].y, p_vertices[i].z);
embree_light_uvs[i] = Vec2fa(p_uv2s[i].x, p_uv2s[i].y);
if (embree_normals != nullptr) {
embree_normals[i] = Vec3fa(p_normals[i].x, p_normals[i].y, p_normals[i].z);
}
embree_triangles[i] = i;
}
rtcCommitGeometry(embree_mesh);
rtcSetGeometryIntersectFilterFunction(embree_mesh, filter_function);
rtcSetGeometryUserData(embree_mesh, this);
rtcAttachGeometryByID(embree_scene, embree_mesh, p_id);
rtcReleaseGeometry(embree_mesh);
}
void LightmapRaycasterEmbree::commit() {
rtcCommitScene(embree_scene);
}
void LightmapRaycasterEmbree::set_mesh_filter(const Set<int> &p_mesh_ids) {
for (Set<int>::Element *E = p_mesh_ids.front(); E; E = E->next()) {
rtcDisableGeometry(rtcGetGeometry(embree_scene, E->get()));
}
rtcCommitScene(embree_scene);
filter_meshes = p_mesh_ids;
}
void LightmapRaycasterEmbree::clear_mesh_filter() {
for (Set<int>::Element *E = filter_meshes.front(); E; E = E->next()) {
rtcEnableGeometry(rtcGetGeometry(embree_scene, E->get()));
}
rtcCommitScene(embree_scene);
filter_meshes.clear();
}
void embree_error_handler(void *p_user_data, RTCError p_code, const char *p_str) {
print_error("Embree error: " + String(p_str));
}
LightmapRaycasterEmbree::LightmapRaycasterEmbree() {
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
embree_device = rtcNewDevice(nullptr);
rtcSetDeviceErrorFunction(embree_device, &embree_error_handler, nullptr);
embree_scene = rtcNewScene(embree_device);
}
LightmapRaycasterEmbree::~LightmapRaycasterEmbree() {
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
if (embree_scene != nullptr) {
rtcReleaseScene(embree_scene);
}
if (embree_device != nullptr) {
rtcReleaseDevice(embree_device);
}
}
#endif

View file

@ -0,0 +1,77 @@
/*************************************************************************/
/* lightmap_raycaster.h */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#ifdef TOOLS_ENABLED
#include "core/object/object.h"
#include "scene/3d/lightmapper.h"
#include "scene/resources/mesh.h"
#include <embree3/rtcore.h>
class LightmapRaycasterEmbree : public LightmapRaycaster {
GDCLASS(LightmapRaycasterEmbree, LightmapRaycaster);
private:
struct AlphaTextureData {
Vector<uint8_t> data;
Vector2i size;
uint8_t sample(float u, float v) const;
};
RTCDevice embree_device;
RTCScene embree_scene;
static void filter_function(const struct RTCFilterFunctionNArguments *p_args);
Map<unsigned int, AlphaTextureData> alpha_textures;
Set<int> filter_meshes;
public:
virtual bool intersect(Ray &p_ray) override;
virtual void intersect(Vector<Ray> &r_rays) override;
virtual void add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) override;
virtual void set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) override;
virtual void commit() override;
virtual void set_mesh_filter(const Set<int> &p_mesh_ids) override;
virtual void clear_mesh_filter() override;
static LightmapRaycaster *create_embree_raycaster();
static void make_default_raycaster();
LightmapRaycasterEmbree();
~LightmapRaycasterEmbree();
};
#endif

View file

@ -0,0 +1,583 @@
/*************************************************************************/
/* raycast_occlusion_cull.cpp */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#include "raycast_occlusion_cull.h"
#include "core/config/project_settings.h"
#include "core/templates/local_vector.h"
#ifdef __SSE2__
#include <pmmintrin.h>
#endif
RaycastOcclusionCull *RaycastOcclusionCull::raycast_singleton = nullptr;
void RaycastOcclusionCull::RaycastHZBuffer::clear() {
HZBuffer::clear();
camera_rays.clear();
camera_ray_masks.clear();
packs_size = Size2i();
}
void RaycastOcclusionCull::RaycastHZBuffer::resize(const Size2i &p_size) {
if (p_size == Size2i()) {
clear();
return;
}
if (!sizes.is_empty() && p_size == sizes[0]) {
return; // Size didn't change
}
HZBuffer::resize(p_size);
packs_size = Size2i(Math::ceil(p_size.x / (float)TILE_SIZE), Math::ceil(p_size.y / (float)TILE_SIZE));
int ray_packets_count = packs_size.x * packs_size.y;
camera_rays.resize(ray_packets_count);
camera_ray_masks.resize(ray_packets_count * TILE_SIZE * TILE_SIZE);
}
void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool) {
CameraRayThreadData td;
td.camera_matrix = p_cam_projection;
td.camera_transform = p_cam_transform;
td.camera_orthogonal = p_cam_orthogonal;
td.thread_count = p_thread_work_pool.get_thread_count();
p_thread_work_pool.do_work(td.thread_count, this, &RaycastHZBuffer::_camera_rays_threaded, &td);
}
void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thread, RaycastOcclusionCull::RaycastHZBuffer::CameraRayThreadData *p_data) {
uint32_t packs_total = camera_rays.size();
uint32_t total_threads = p_data->thread_count;
uint32_t from = p_thread * packs_total / total_threads;
uint32_t to = (p_thread + 1 == total_threads) ? packs_total : ((p_thread + 1) * packs_total / total_threads);
_generate_camera_rays(p_data->camera_transform, p_data->camera_matrix, p_data->camera_orthogonal, from, to);
}
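// Worked example of the range split above (illustrative, not from the patch): with
// packs_total = 10 and total_threads = 4, the threads get [0, 2), [2, 5), [5, 7) and
// [7, 10), i.e. every ray packet is visited exactly once; the last thread's upper
// bound is pinned to packs_total explicitly.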
void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, int p_from, int p_to) {
Size2i buffer_size = sizes[0];
CameraMatrix inv_camera_matrix = p_cam_projection.inverse();
float z_far = p_cam_projection.get_z_far() * 1.05f;
debug_tex_range = z_far;
RayPacket *ray_packets = camera_rays.ptr();
uint32_t *ray_masks = camera_ray_masks.ptr();
for (int i = p_from; i < p_to; i++) {
RayPacket &packet = ray_packets[i];
int tile_x = (i % packs_size.x) * TILE_SIZE;
int tile_y = (i / packs_size.x) * TILE_SIZE;
for (int j = 0; j < TILE_RAYS; j++) {
float x = tile_x + j % TILE_SIZE;
float y = tile_y + j / TILE_SIZE;
ray_masks[i * TILE_RAYS + j] = ~0U;
if (x >= buffer_size.x || y >= buffer_size.y) {
ray_masks[i * TILE_RAYS + j] = 0U;
} else {
float u = x / (buffer_size.x - 1);
float v = y / (buffer_size.y - 1);
u = u * 2.0f - 1.0f;
v = v * 2.0f - 1.0f;
Plane pixel_proj = Plane(u, v, -1.0, 1.0);
Plane pixel_view = inv_camera_matrix.xform4(pixel_proj);
Vector3 pixel_world = p_cam_transform.xform(pixel_view.normal);
Vector3 dir;
if (p_cam_orthogonal) {
dir = -p_cam_transform.basis.get_axis(2);
} else {
dir = (pixel_world - p_cam_transform.origin).normalized();
}
packet.ray.org_x[j] = pixel_world.x;
packet.ray.org_y[j] = pixel_world.y;
packet.ray.org_z[j] = pixel_world.z;
packet.ray.dir_x[j] = dir.x;
packet.ray.dir_y[j] = dir.y;
packet.ray.dir_z[j] = dir.z;
packet.ray.tnear[j] = 0.0f;
packet.ray.time[j] = 0.0f;
packet.ray.flags[j] = 0;
packet.ray.mask[j] = -1;
packet.hit.geomID[j] = RTC_INVALID_GEOMETRY_ID;
}
packet.ray.tfar[j] = z_far;
}
}
}
void RaycastOcclusionCull::RaycastHZBuffer::sort_rays() {
if (is_empty()) {
return;
}
Size2i buffer_size = sizes[0];
for (int i = 0; i < packs_size.y; i++) {
for (int j = 0; j < packs_size.x; j++) {
for (int tile_i = 0; tile_i < TILE_SIZE; tile_i++) {
for (int tile_j = 0; tile_j < TILE_SIZE; tile_j++) {
int x = j * TILE_SIZE + tile_j;
int y = i * TILE_SIZE + tile_i;
if (x >= buffer_size.x || y >= buffer_size.y) {
continue;
}
int k = tile_i * TILE_SIZE + tile_j;
int packet_index = i * packs_size.x + j;
mips[0][y * buffer_size.x + x] = camera_rays[packet_index].ray.tfar[k];
}
}
}
}
}
////////////////////////////////////////////////////////
bool RaycastOcclusionCull::is_occluder(RID p_rid) {
return occluder_owner.owns(p_rid);
}
RID RaycastOcclusionCull::occluder_allocate() {
return occluder_owner.allocate_rid();
}
void RaycastOcclusionCull::occluder_initialize(RID p_occluder) {
Occluder *occluder = memnew(Occluder);
occluder_owner.initialize_rid(p_occluder, occluder);
}
void RaycastOcclusionCull::occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) {
Occluder *occluder = occluder_owner.getornull(p_occluder);
ERR_FAIL_COND(!occluder);
occluder->vertices = p_vertices;
occluder->indices = p_indices;
for (Set<InstanceID>::Element *E = occluder->users.front(); E; E = E->next()) {
RID scenario_rid = E->get().scenario;
RID instance_rid = E->get().instance;
ERR_CONTINUE(!scenarios.has(scenario_rid));
Scenario &scenario = scenarios[scenario_rid];
ERR_CONTINUE(!scenario.instances.has(instance_rid));
if (!scenario.dirty_instances.has(instance_rid)) {
scenario.dirty_instances.insert(instance_rid);
scenario.dirty_instances_array.push_back(instance_rid);
}
}
}
void RaycastOcclusionCull::free_occluder(RID p_occluder) {
Occluder *occluder = occluder_owner.getornull(p_occluder);
ERR_FAIL_COND(!occluder);
memdelete(occluder);
occluder_owner.free(p_occluder);
}
////////////////////////////////////////////////////////
void RaycastOcclusionCull::add_scenario(RID p_scenario) {
if (scenarios.has(p_scenario)) {
scenarios[p_scenario].removed = false;
} else {
scenarios[p_scenario] = Scenario();
}
}
void RaycastOcclusionCull::remove_scenario(RID p_scenario) {
ERR_FAIL_COND(!scenarios.has(p_scenario));
Scenario &scenario = scenarios[p_scenario];
scenario.removed = true;
}
void RaycastOcclusionCull::scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) {
ERR_FAIL_COND(!scenarios.has(p_scenario));
Scenario &scenario = scenarios[p_scenario];
if (!scenario.instances.has(p_instance)) {
scenario.instances[p_instance] = OccluderInstance();
}
OccluderInstance &instance = scenario.instances[p_instance];
if (instance.removed) {
instance.removed = false;
scenario.removed_instances.erase(p_instance);
}
bool changed = false;
if (instance.occluder != p_occluder) {
Occluder *old_occluder = occluder_owner.getornull(instance.occluder);
if (old_occluder) {
old_occluder->users.erase(InstanceID(p_scenario, p_instance));
}
instance.occluder = p_occluder;
if (p_occluder.is_valid()) {
Occluder *occluder = occluder_owner.getornull(p_occluder);
ERR_FAIL_COND(!occluder);
occluder->users.insert(InstanceID(p_scenario, p_instance));
}
changed = true;
}
if (instance.xform != p_xform) {
scenario.instances[p_instance].xform = p_xform;
changed = true;
}
if (instance.enabled != p_enabled) {
instance.enabled = p_enabled;
scenario.dirty = true; // The scenario needs a scene rebuild, but the instance itself doesn't need an update.
}
if (changed && !scenario.dirty_instances.has(p_instance)) {
scenario.dirty_instances.insert(p_instance);
scenario.dirty_instances_array.push_back(p_instance);
scenario.dirty = true;
}
}
void RaycastOcclusionCull::scenario_remove_instance(RID p_scenario, RID p_instance) {
ERR_FAIL_COND(!scenarios.has(p_scenario));
Scenario &scenario = scenarios[p_scenario];
if (scenario.instances.has(p_instance)) {
OccluderInstance &instance = scenario.instances[p_instance];
if (!instance.removed) {
Occluder *occluder = occluder_owner.getornull(instance.occluder);
if (occluder) {
occluder->users.erase(InstanceID(p_scenario, p_instance));
}
scenario.removed_instances.push_back(p_instance);
instance.removed = true;
}
}
}
void RaycastOcclusionCull::Scenario::_update_dirty_instance_thread(int p_idx, RID *p_instances) {
_update_dirty_instance(p_idx, p_instances, nullptr);
}
void RaycastOcclusionCull::Scenario::_update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool) {
OccluderInstance *occ_inst = instances.getptr(p_instances[p_idx]);
if (!occ_inst) {
return;
}
Occluder *occ = raycast_singleton->occluder_owner.getornull(occ_inst->occluder);
if (!occ) {
return;
}
int vertices_size = occ->vertices.size();
// Embree requires the last element to be readable by a 16-byte SSE load instruction, so we add padding to be safe.
occ_inst->xformed_vertices.resize(vertices_size + 1);
const Vector3 *read_ptr = occ->vertices.ptr();
Vector3 *write_ptr = occ_inst->xformed_vertices.ptr();
if (p_thread_pool && vertices_size > 1024) {
TransformThreadData td;
td.xform = occ_inst->xform;
td.read = read_ptr;
td.write = write_ptr;
td.vertex_count = vertices_size;
td.thread_count = p_thread_pool->get_thread_count();
p_thread_pool->do_work(td.thread_count, this, &Scenario::_transform_vertices_thread, &td);
} else {
_transform_vertices_range(read_ptr, write_ptr, occ_inst->xform, 0, vertices_size);
}
occ_inst->indices.resize(occ->indices.size());
copymem(occ_inst->indices.ptr(), occ->indices.ptr(), occ->indices.size() * sizeof(int32_t));
}
void RaycastOcclusionCull::Scenario::_transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data) {
uint32_t vertex_total = p_data->vertex_count;
uint32_t total_threads = p_data->thread_count;
uint32_t from = p_thread * vertex_total / total_threads;
uint32_t to = (p_thread + 1 == total_threads) ? vertex_total : ((p_thread + 1) * vertex_total / total_threads);
_transform_vertices_range(p_data->read, p_data->write, p_data->xform, from, to);
}
void RaycastOcclusionCull::Scenario::_transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform &p_xform, int p_from, int p_to) {
for (int i = p_from; i < p_to; i++) {
p_write[i] = p_xform.xform(p_read[i]);
}
}
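// Runs on the commit thread: commits the back-buffer Embree scene while the front
// scene keeps serving occlusion queries.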
void RaycastOcclusionCull::Scenario::_commit_scene(void *p_ud) {
Scenario *scenario = (Scenario *)p_ud;
int commit_idx = 1 - (scenario->current_scene_idx);
rtcCommitScene(scenario->ebr_scene[commit_idx]);
scenario->commit_done = true;
}
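// Per-frame scenario maintenance: flip to the freshly committed scene if the commit thread
// finished, drop removed instances, re-transform dirty occluder vertices (threaded per
// instance or per vertex range), then rebuild the inactive Embree scene and start an
// asynchronous commit.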
bool RaycastOcclusionCull::Scenario::update(ThreadWorkPool &p_thread_pool) {
ERR_FAIL_COND_V(singleton == nullptr, false);
if (commit_thread == nullptr) {
commit_thread = memnew(Thread);
}
if (commit_thread->is_started()) {
if (commit_done) {
commit_thread->wait_to_finish();
current_scene_idx = 1 - current_scene_idx;
} else {
return false;
}
}
if (removed) {
if (ebr_scene[0]) {
rtcReleaseScene(ebr_scene[0]);
}
if (ebr_scene[1]) {
rtcReleaseScene(ebr_scene[1]);
}
return true;
}
if (!dirty && removed_instances.is_empty() && dirty_instances_array.is_empty()) {
return false;
}
for (unsigned int i = 0; i < removed_instances.size(); i++) {
instances.erase(removed_instances[i]);
}
if (dirty_instances_array.size() / p_thread_pool.get_thread_count() > 128) {
// Lots of instances, use per-instance threading
p_thread_pool.do_work(dirty_instances_array.size(), this, &Scenario::_update_dirty_instance_thread, dirty_instances_array.ptr());
} else {
// Few instances, use threading on the vertex transforms
for (unsigned int i = 0; i < dirty_instances_array.size(); i++) {
_update_dirty_instance(i, dirty_instances_array.ptr(), &p_thread_pool);
}
}
dirty_instances.clear();
dirty_instances_array.clear();
removed_instances.clear();
if (raycast_singleton->ebr_device == nullptr) {
raycast_singleton->_init_embree();
}
int next_scene_idx = 1 - current_scene_idx;
RTCScene &next_scene = ebr_scene[next_scene_idx];
if (next_scene) {
rtcReleaseScene(next_scene);
}
next_scene = rtcNewScene(raycast_singleton->ebr_device);
rtcSetSceneBuildQuality(next_scene, RTCBuildQuality(raycast_singleton->build_quality));
const RID *inst_rid = nullptr;
while ((inst_rid = instances.next(inst_rid))) {
OccluderInstance *occ_inst = instances.getptr(*inst_rid);
Occluder *occ = raycast_singleton->occluder_owner.getornull(occ_inst->occluder);
if (!occ || !occ_inst->enabled) {
continue;
}
RTCGeometry geom = rtcNewGeometry(raycast_singleton->ebr_device, RTC_GEOMETRY_TYPE_TRIANGLE);
rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, occ_inst->xformed_vertices.ptr(), 0, sizeof(Vector3), occ_inst->xformed_vertices.size());
rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, occ_inst->indices.ptr(), 0, sizeof(uint32_t) * 3, occ_inst->indices.size() / 3);
rtcCommitGeometry(geom);
rtcAttachGeometry(next_scene, geom);
rtcReleaseGeometry(geom);
}
dirty = false;
commit_done = false;
commit_thread->start(&Scenario::_commit_scene, this);
return false;
}
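// One work item per 4x4 ray packet: intersects the packet against the currently committed
// Embree scene, with the mask disabling rays that fall outside the buffer.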
void RaycastOcclusionCull::Scenario::_raycast(uint32_t p_idx, const RaycastThreadData *p_raycast_data) const {
RTCIntersectContext ctx;
rtcInitIntersectContext(&ctx);
ctx.flags = RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
rtcIntersect16((const int *)&p_raycast_data->masks[p_idx * TILE_RAYS], ebr_scene[current_scene_idx], &ctx, &p_raycast_data->rays[p_idx]);
}
void RaycastOcclusionCull::Scenario::raycast(LocalVector<RayPacket> &r_rays, const LocalVector<uint32_t> p_valid_masks, ThreadWorkPool &p_thread_pool) const {
ERR_FAIL_COND(singleton == nullptr);
if (raycast_singleton->ebr_device == nullptr) {
return; // Embree is initialized on demand, once a scenario actually contains occluders.
}
if (ebr_scene[current_scene_idx] == nullptr) {
return;
}
RaycastThreadData td;
td.rays = r_rays.ptr();
td.masks = p_valid_masks.ptr();
p_thread_pool.do_work(r_rays.size(), this, &Scenario::_raycast, &td);
}
////////////////////////////////////////////////////////
void RaycastOcclusionCull::add_buffer(RID p_buffer) {
ERR_FAIL_COND(buffers.has(p_buffer));
buffers[p_buffer] = RaycastHZBuffer();
}
void RaycastOcclusionCull::remove_buffer(RID p_buffer) {
ERR_FAIL_COND(!buffers.has(p_buffer));
buffers.erase(p_buffer);
}
void RaycastOcclusionCull::buffer_set_scenario(RID p_buffer, RID p_scenario) {
ERR_FAIL_COND(!buffers.has(p_buffer));
ERR_FAIL_COND(p_scenario.is_valid() && !scenarios.has(p_scenario));
buffers[p_buffer].scenario_rid = p_scenario;
}
void RaycastOcclusionCull::buffer_set_size(RID p_buffer, const Vector2i &p_size) {
ERR_FAIL_COND(!buffers.has(p_buffer));
buffers[p_buffer].resize(p_size);
}
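// Per-frame occlusion update: refresh the scenario's Embree scene, generate camera rays
// for this buffer, trace them, write the resulting depths back and rebuild the mip chain.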
void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {
if (!buffers.has(p_buffer)) {
return;
}
RaycastHZBuffer &buffer = buffers[p_buffer];
if (buffer.is_empty() || !scenarios.has(buffer.scenario_rid)) {
return;
}
Scenario &scenario = scenarios[buffer.scenario_rid];
bool removed = scenario.update(p_thread_pool);
if (removed) {
scenarios.erase(buffer.scenario_rid);
return;
}
buffer.update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal, p_thread_pool);
scenario.raycast(buffer.camera_rays, buffer.camera_ray_masks, p_thread_pool);
buffer.sort_rays();
buffer.update_mips();
}
RaycastOcclusionCull::HZBuffer *RaycastOcclusionCull::buffer_get_ptr(RID p_buffer) {
if (!buffers.has(p_buffer)) {
return nullptr;
}
return &buffers[p_buffer];
}
RID RaycastOcclusionCull::buffer_get_debug_texture(RID p_buffer) {
ERR_FAIL_COND_V(!buffers.has(p_buffer), RID());
return buffers[p_buffer].get_debug_texture();
}
////////////////////////////////////////////////////////
void RaycastOcclusionCull::set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {
if (build_quality == p_quality) {
return;
}
build_quality = p_quality;
const RID *scenario_rid = nullptr;
while ((scenario_rid = scenarios.next(scenario_rid))) {
scenarios[*scenario_rid].dirty = true;
}
}
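// Enables flush-to-zero / denormals-are-zero (as recommended by Embree) and leaves two
// hardware threads free for the rest of the engine.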
void RaycastOcclusionCull::_init_embree() {
#ifdef __SSE2__
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
String settings = vformat("threads=%d", MAX(1, OS::get_singleton()->get_processor_count() - 2));
ebr_device = rtcNewDevice(settings.utf8().ptr());
}
RaycastOcclusionCull::RaycastOcclusionCull() {
raycast_singleton = this;
int default_quality = GLOBAL_GET("rendering/occlusion_culling/bvh_build_quality");
build_quality = RS::ViewportOcclusionCullingBuildQuality(default_quality);
}
RaycastOcclusionCull::~RaycastOcclusionCull() {
const RID *scenario_rid = nullptr;
while ((scenario_rid = scenarios.next(scenario_rid))) {
Scenario &scenario = scenarios[*scenario_rid];
if (scenario.commit_thread) {
scenario.commit_thread->wait_to_finish();
memdelete(scenario.commit_thread);
}
}
if (ebr_device != nullptr) {
#ifdef __SSE2__
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
#endif
rtcReleaseDevice(ebr_device);
}
raycast_singleton = nullptr;
}

View file

@ -0,0 +1,184 @@
/*************************************************************************/
/* raycast_occlusion_cull.h */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#ifndef OCCLUSION_CULL_RAYCASTER_H
#define OCCLUSION_CULL_RAYCASTER_H
#include "core/io/image.h"
#include "core/math/camera_matrix.h"
#include "core/object/object.h"
#include "core/object/reference.h"
#include "core/templates/local_vector.h"
#include "core/templates/rid_owner.h"
#include "scene/resources/mesh.h"
#include "servers/rendering/renderer_scene_occlusion_cull.h"
#include <embree3/rtcore.h>
class RaycastOcclusionCull : public RendererSceneOcclusionCull {
typedef RTCRayHit16 RayPacket;
public:
class RaycastHZBuffer : public HZBuffer {
private:
Size2i packs_size;
struct CameraRayThreadData {
CameraMatrix camera_matrix;
Transform camera_transform;
bool camera_orthogonal;
int thread_count;
Size2i buffer_size;
};
void _camera_rays_threaded(uint32_t p_thread, CameraRayThreadData *p_data);
void _generate_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, int p_from, int p_to);
public:
LocalVector<RayPacket> camera_rays;
LocalVector<uint32_t> camera_ray_masks;
RID scenario_rid;
virtual void clear() override;
virtual void resize(const Size2i &p_size) override;
void sort_rays();
void update_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool);
};
private:
struct InstanceID {
RID scenario;
RID instance;
bool operator<(const InstanceID &rhs) const {
if (instance == rhs.instance) {
return rhs.scenario < scenario;
}
return instance < rhs.instance;
}
InstanceID() {}
InstanceID(RID s, RID i) :
scenario(s), instance(i) {}
};
struct Occluder {
PackedVector3Array vertices;
PackedInt32Array indices;
Set<InstanceID> users;
};
struct OccluderInstance {
RID occluder;
LocalVector<uint32_t> indices;
LocalVector<Vector3> xformed_vertices;
Transform xform;
bool enabled = true;
bool removed = false;
};
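// A scenario keeps two Embree scenes: the current one serves occlusion queries while the
// other is rebuilt and committed on a background thread, after which the indices are swapped.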
struct Scenario {
struct RaycastThreadData {
RayPacket *rays;
const uint32_t *masks;
};
struct TransformThreadData {
uint32_t thread_count;
uint32_t vertex_count;
Transform xform;
const Vector3 *read;
Vector3 *write;
};
Thread *commit_thread = nullptr;
bool commit_done = true;
bool dirty = false;
bool removed = false;
RTCScene ebr_scene[2] = { nullptr, nullptr };
int current_scene_idx = 0;
HashMap<RID, OccluderInstance> instances;
Set<RID> dirty_instances; // To avoid duplicates
LocalVector<RID> dirty_instances_array; // To iterate and split into threads
LocalVector<RID> removed_instances;
void _update_dirty_instance_thread(int p_idx, RID *p_instances);
void _update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool);
void _transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data);
void _transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform &p_xform, int p_from, int p_to);
static void _commit_scene(void *p_ud);
bool update(ThreadWorkPool &p_thread_pool);
void _raycast(uint32_t p_thread, const RaycastThreadData *p_raycast_data) const;
void raycast(LocalVector<RayPacket> &r_rays, const LocalVector<uint32_t> p_valid_masks, ThreadWorkPool &p_thread_pool) const;
};
static RaycastOcclusionCull *raycast_singleton;
static const int TILE_SIZE = 4;
static const int TILE_RAYS = TILE_SIZE * TILE_SIZE;
RTCDevice ebr_device = nullptr;
RID_PtrOwner<Occluder> occluder_owner;
HashMap<RID, Scenario> scenarios;
HashMap<RID, RaycastHZBuffer> buffers;
RS::ViewportOcclusionCullingBuildQuality build_quality;
void _init_embree();
public:
virtual bool is_occluder(RID p_rid) override;
virtual RID occluder_allocate() override;
virtual void occluder_initialize(RID p_occluder) override;
virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) override;
virtual void free_occluder(RID p_occluder) override;
virtual void add_scenario(RID p_scenario) override;
virtual void remove_scenario(RID p_scenario) override;
virtual void scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) override;
virtual void scenario_remove_instance(RID p_scenario, RID p_instance) override;
virtual void add_buffer(RID p_buffer) override;
virtual void remove_buffer(RID p_buffer) override;
virtual HZBuffer *buffer_get_ptr(RID p_buffer) override;
virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) override;
virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) override;
virtual void buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) override;
virtual RID buffer_get_debug_texture(RID p_buffer) override;
virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) override;
RaycastOcclusionCull();
~RaycastOcclusionCull();
};
#endif // OCCLUSION_CULL_RAYCASTER_H

View file

@ -0,0 +1,49 @@
/*************************************************************************/
/* register_types.cpp */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#include "register_types.h"
#include "lightmap_raycaster.h"
#include "raycast_occlusion_cull.h"
RaycastOcclusionCull *raycast_occlusion_cull = nullptr;
void register_raycast_types() {
#ifdef TOOLS_ENABLED
LightmapRaycasterEmbree::make_default_raycaster();
#endif
raycast_occlusion_cull = memnew(RaycastOcclusionCull);
}
void unregister_raycast_types() {
if (raycast_occlusion_cull) {
memdelete(raycast_occlusion_cull);
}
}

View file

@ -0,0 +1,32 @@
/*************************************************************************/
/* register_types.h */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
void register_raycast_types();
void unregister_raycast_types();

View file

@ -619,10 +619,6 @@ void BakedLightmap::_gen_new_positions_from_octree(const GenProbesOctree *p_cell
}
BakedLightmap::BakeError BakedLightmap::bake(Node *p_from_node, String p_image_data_path, Lightmapper::BakeStepFunc p_bake_step, void *p_bake_userdata) {
if (p_image_data_path == "" && (get_light_data().is_null() || !get_light_data()->get_path().is_resource_file())) {
return BAKE_ERROR_NO_SAVE_PATH;
}
if (p_image_data_path == "") {
if (get_light_data().is_null()) {
return BAKE_ERROR_NO_SAVE_PATH;

View file

@ -39,6 +39,15 @@ Ref<LightmapDenoiser> LightmapDenoiser::create() {
return Ref<LightmapDenoiser>();
}
LightmapRaycaster *(*LightmapRaycaster::create_function)() = nullptr;
Ref<LightmapRaycaster> LightmapRaycaster::create() {
if (create_function) {
return Ref<LightmapRaycaster>(create_function());
}
return Ref<LightmapRaycaster>();
}
Lightmapper::CreateFunc Lightmapper::create_custom = nullptr;
Lightmapper::CreateFunc Lightmapper::create_gpu = nullptr;
Lightmapper::CreateFunc Lightmapper::create_cpu = nullptr;

View file

@ -34,6 +34,16 @@
#include "scene/resources/mesh.h"
#include "servers/rendering/rendering_device.h"
#if !defined(__aligned)
#if defined(_WIN32) && defined(_MSC_VER)
#define __aligned(...) __declspec(align(__VA_ARGS__))
#else
#define __aligned(...) __attribute__((aligned(__VA_ARGS__)))
#endif
#endif
class LightmapDenoiser : public Reference {
GDCLASS(LightmapDenoiser, Reference)
protected:
@ -44,6 +54,73 @@ public:
static Ref<LightmapDenoiser> create();
};
class LightmapRaycaster : public Reference {
GDCLASS(LightmapRaycaster, Reference)
protected:
static LightmapRaycaster *(*create_function)();
public:
// Compatible with the Embree 3 ray layout.
struct __aligned(16) Ray {
const static unsigned int INVALID_GEOMETRY_ID = ((unsigned int)-1); // from rtcore_common.h
/*! Default construction does nothing. */
_FORCE_INLINE_ Ray() :
geomID(INVALID_GEOMETRY_ID) {}
/*! Constructs a ray from origin, direction, and ray segment. Near
* has to be smaller than far. */
_FORCE_INLINE_ Ray(const Vector3 &org,
const Vector3 &dir,
float tnear = 0.0f,
float tfar = INFINITY) :
org(org),
tnear(tnear),
dir(dir),
time(0.0f),
tfar(tfar),
mask(-1),
u(0.0),
v(0.0),
primID(INVALID_GEOMETRY_ID),
geomID(INVALID_GEOMETRY_ID),
instID(INVALID_GEOMETRY_ID) {}
/*! Tests if we hit something. */
_FORCE_INLINE_ explicit operator bool() const { return geomID != INVALID_GEOMETRY_ID; }
public:
Vector3 org; //!< Ray origin + tnear
float tnear; //!< Start of ray segment
Vector3 dir; //!< Ray direction + tfar
float time; //!< Time of this ray for motion blur.
float tfar; //!< End of ray segment
unsigned int mask; //!< used to mask out objects during traversal
unsigned int id; //!< ray ID
unsigned int flags; //!< ray flags
Vector3 normal; //!< Not normalized geometry normal
float u; //!< Barycentric u coordinate of hit
float v; //!< Barycentric v coordinate of hit
unsigned int primID; //!< primitive ID
unsigned int geomID; //!< geometry ID
unsigned int instID; //!< instance ID
};
virtual bool intersect(Ray &p_ray) = 0;
virtual void intersect(Vector<Ray> &r_rays) = 0;
virtual void add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) = 0;
virtual void set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) = 0;
virtual void commit() = 0;
virtual void set_mesh_filter(const Set<int> &p_mesh_ids) = 0;
virtual void clear_mesh_filter() = 0;
static Ref<LightmapRaycaster> create();
};
class Lightmapper : public Reference {
GDCLASS(Lightmapper, Reference)
public:

View file

@ -356,6 +356,7 @@ Ref<Material> MeshInstance3D::get_active_material(int p_surface) const {
void MeshInstance3D::_mesh_changed() {
ERR_FAIL_COND(mesh.is_null());
surface_override_materials.resize(mesh->get_surface_count());
update_gizmo();
}
void MeshInstance3D::create_debug_tangents() {

View file

@ -0,0 +1,335 @@
/*************************************************************************/
/* occluder_instance_3d.cpp */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#include "occluder_instance_3d.h"
#include "core/core_string_names.h"
#include "scene/3d/mesh_instance_3d.h"
RID Occluder3D::get_rid() const {
if (!occluder.is_valid()) {
occluder = RS::get_singleton()->occluder_create();
RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices);
}
return occluder;
}
void Occluder3D::set_vertices(PackedVector3Array p_vertices) {
vertices = p_vertices;
if (occluder.is_valid()) {
RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices);
}
_update_changes();
}
PackedVector3Array Occluder3D::get_vertices() const {
return vertices;
}
void Occluder3D::set_indices(PackedInt32Array p_indices) {
indices = p_indices;
if (occluder.is_valid()) {
RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices);
}
_update_changes();
}
PackedInt32Array Occluder3D::get_indices() const {
return indices;
}
void Occluder3D::_update_changes() {
aabb = AABB();
const Vector3 *ptr = vertices.ptr();
for (int i = 0; i < vertices.size(); i++) {
aabb.expand_to(ptr[i]);
}
debug_lines.clear();
debug_mesh.unref();
emit_changed();
}
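// Debug helpers: lazily build a wireframe line list and a triangle mesh from the occluder
// geometry for editor visualization.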
Vector<Vector3> Occluder3D::get_debug_lines() const {
if (!debug_lines.is_empty()) {
return debug_lines;
}
if (indices.size() % 3 != 0) {
return Vector<Vector3>();
}
for (int i = 0; i < indices.size() / 3; i++) {
for (int j = 0; j < 3; j++) {
int a = indices[i * 3 + j];
int b = indices[i * 3 + (j + 1) % 3];
ERR_FAIL_INDEX_V_MSG(a, vertices.size(), Vector<Vector3>(), "Occluder indices are out of range.");
ERR_FAIL_INDEX_V_MSG(b, vertices.size(), Vector<Vector3>(), "Occluder indices are out of range.");
debug_lines.push_back(vertices[a]);
debug_lines.push_back(vertices[b]);
}
}
return debug_lines;
}
Ref<ArrayMesh> Occluder3D::get_debug_mesh() const {
if (debug_mesh.is_valid()) {
return debug_mesh;
}
if (indices.size() % 3 != 0) {
return debug_mesh;
}
Array arrays;
arrays.resize(Mesh::ARRAY_MAX);
arrays[Mesh::ARRAY_VERTEX] = vertices;
arrays[Mesh::ARRAY_INDEX] = indices;
debug_mesh.instance();
debug_mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, arrays);
return debug_mesh;
}
AABB Occluder3D::get_aabb() const {
return aabb;
}
void Occluder3D::_bind_methods() {
ClassDB::bind_method(D_METHOD("set_vertices", "vertices"), &Occluder3D::set_vertices);
ClassDB::bind_method(D_METHOD("get_vertices"), &Occluder3D::get_vertices);
ClassDB::bind_method(D_METHOD("set_indices", "indices"), &Occluder3D::set_indices);
ClassDB::bind_method(D_METHOD("get_indices"), &Occluder3D::get_indices);
ADD_PROPERTY(PropertyInfo(Variant::PACKED_VECTOR3_ARRAY, "vertices", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_vertices", "get_vertices");
ADD_PROPERTY(PropertyInfo(Variant::PACKED_INT32_ARRAY, "indices", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_indices", "get_indices");
}
Occluder3D::Occluder3D() {
}
Occluder3D::~Occluder3D() {
if (occluder.is_valid()) {
RS::get_singleton()->free(occluder);
}
}
/////////////////////////////////////////////////
AABB OccluderInstance3D::get_aabb() const {
if (occluder.is_valid()) {
return occluder->get_aabb();
}
return AABB();
}
Vector<Face3> OccluderInstance3D::get_faces(uint32_t p_usage_flags) const {
return Vector<Face3>();
}
void OccluderInstance3D::set_occluder(const Ref<Occluder3D> &p_occluder) {
if (occluder == p_occluder) {
return;
}
if (occluder.is_valid()) {
occluder->disconnect(CoreStringNames::get_singleton()->changed, callable_mp(this, &OccluderInstance3D::_occluder_changed));
}
occluder = p_occluder;
if (occluder.is_valid()) {
set_base(occluder->get_rid());
occluder->connect(CoreStringNames::get_singleton()->changed, callable_mp(this, &OccluderInstance3D::_occluder_changed));
} else {
set_base(RID());
}
update_gizmo();
}
void OccluderInstance3D::_occluder_changed() {
update_gizmo();
}
Ref<Occluder3D> OccluderInstance3D::get_occluder() const {
return occluder;
}
void OccluderInstance3D::set_bake_mask(uint32_t p_mask) {
bake_mask = p_mask;
}
uint32_t OccluderInstance3D::get_bake_mask() const {
return bake_mask;
}
void OccluderInstance3D::set_bake_mask_bit(int p_layer, bool p_enable) {
ERR_FAIL_INDEX(p_layer, 32);
if (p_enable) {
set_bake_mask(bake_mask | (1 << p_layer));
} else {
set_bake_mask(bake_mask & (~(1 << p_layer)));
}
}
bool OccluderInstance3D::get_bake_mask_bit(int p_layer) const {
ERR_FAIL_INDEX_V(p_layer, 32, false);
return (bake_mask & (1 << p_layer));
}
bool OccluderInstance3D::_bake_material_check(Ref<Material> p_material) {
StandardMaterial3D *standard_mat = Object::cast_to<StandardMaterial3D>(p_material.ptr());
if (standard_mat && standard_mat->get_transparency() != StandardMaterial3D::TRANSPARENCY_DISABLED) {
return false;
}
return true;
}
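// Recursively collects triangles from visible MeshInstance3D children: skips meshes with
// transparent materials or layers outside the bake mask, and transforms vertices into this
// occluder's local space.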
void OccluderInstance3D::_bake_node(Node *p_node, PackedVector3Array &r_vertices, PackedInt32Array &r_indices) {
MeshInstance3D *mi = Object::cast_to<MeshInstance3D>(p_node);
if (mi && mi->is_visible_in_tree()) {
Ref<Mesh> mesh = mi->get_mesh();
bool valid = true;
if (mesh.is_null()) {
valid = false;
}
if (valid && !_bake_material_check(mi->get_material_override())) {
valid = false;
}
if ((mi->get_layer_mask() & bake_mask) == 0) {
valid = false;
}
if (valid) {
Transform global_to_local = get_global_transform().affine_inverse() * mi->get_global_transform();
for (int i = 0; i < mesh->get_surface_count(); i++) {
if (mesh->surface_get_primitive_type(i) != Mesh::PRIMITIVE_TRIANGLES) {
continue;
}
if (mi->get_surface_override_material(i).is_valid()) {
if (!_bake_material_check(mi->get_surface_override_material(i))) {
continue;
}
} else {
if (!_bake_material_check(mesh->surface_get_material(i))) {
continue;
}
}
Array arrays = mesh->surface_get_arrays(i);
int vertex_offset = r_vertices.size();
PackedVector3Array vertices = arrays[Mesh::ARRAY_VERTEX];
r_vertices.resize(r_vertices.size() + vertices.size());
Vector3 *vtx_ptr = r_vertices.ptrw();
for (int j = 0; j < vertices.size(); j++) {
vtx_ptr[vertex_offset + j] = global_to_local.xform(vertices[j]);
}
int index_offset = r_indices.size();
PackedInt32Array indices = arrays[Mesh::ARRAY_INDEX];
r_indices.resize(r_indices.size() + indices.size());
int *idx_ptr = r_indices.ptrw();
for (int j = 0; j < indices.size(); j++) {
idx_ptr[index_offset + j] = vertex_offset + indices[j];
}
}
}
}
for (int i = 0; i < p_node->get_child_count(); i++) {
Node *child = p_node->get_child(i);
if (!child->get_owner()) {
continue; // Not owned by the scene (e.g. a runtime helper node); skip it.
}
_bake_node(child, r_vertices, r_indices);
}
}
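// Bakes the collected geometry into an Occluder3D resource, reusing the current occluder if
// one is set, or creating a new one at the given save path.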
OccluderInstance3D::BakeError OccluderInstance3D::bake(Node *p_from_node, String p_occluder_path) {
if (p_occluder_path == "") {
if (get_occluder().is_null()) {
return BAKE_ERROR_NO_SAVE_PATH;
}
}
PackedVector3Array vertices;
PackedInt32Array indices;
_bake_node(p_from_node, vertices, indices);
if (vertices.is_empty() || indices.is_empty()) {
return BAKE_ERROR_NO_MESHES;
}
Ref<Occluder3D> occ;
if (get_occluder().is_valid()) {
occ = get_occluder();
} else {
occ.instance();
occ->set_path(p_occluder_path);
}
occ->set_vertices(vertices);
occ->set_indices(indices);
set_occluder(occ);
return BAKE_ERROR_OK;
}
void OccluderInstance3D::_bind_methods() {
ClassDB::bind_method(D_METHOD("set_bake_mask", "mask"), &OccluderInstance3D::set_bake_mask);
ClassDB::bind_method(D_METHOD("get_bake_mask"), &OccluderInstance3D::get_bake_mask);
ClassDB::bind_method(D_METHOD("set_bake_mask_bit", "layer", "enabled"), &OccluderInstance3D::set_bake_mask_bit);
ClassDB::bind_method(D_METHOD("get_bake_mask_bit", "layer"), &OccluderInstance3D::get_bake_mask_bit);
ClassDB::bind_method(D_METHOD("set_occluder", "occluder"), &OccluderInstance3D::set_occluder);
ClassDB::bind_method(D_METHOD("get_occluder"), &OccluderInstance3D::get_occluder);
ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "occluder", PROPERTY_HINT_RESOURCE_TYPE, "Occluder3D"), "set_occluder", "get_occluder");
ADD_GROUP("Bake", "bake_");
ADD_PROPERTY(PropertyInfo(Variant::INT, "bake_mask", PROPERTY_HINT_LAYERS_3D_RENDER), "set_bake_mask", "get_bake_mask");
}
OccluderInstance3D::OccluderInstance3D() {
}
OccluderInstance3D::~OccluderInstance3D() {
}

View file

@ -0,0 +1,108 @@
/*************************************************************************/
/* occluder_instance_3d.h */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#ifndef OCCLUDER_INSTANCE_3D_H
#define OCCLUDER_INSTANCE_3D_H
#include "scene/3d/visual_instance_3d.h"
class Occluder3D : public Resource {
GDCLASS(Occluder3D, Resource);
RES_BASE_EXTENSION("occ");
mutable RID occluder;
mutable Ref<ArrayMesh> debug_mesh;
mutable Vector<Vector3> debug_lines;
AABB aabb;
PackedVector3Array vertices;
PackedInt32Array indices;
void _update_changes();
protected:
static void _bind_methods();
public:
void set_vertices(PackedVector3Array p_vertices);
PackedVector3Array get_vertices() const;
void set_indices(PackedInt32Array p_indices);
PackedInt32Array get_indices() const;
Vector<Vector3> get_debug_lines() const;
Ref<ArrayMesh> get_debug_mesh() const;
AABB get_aabb() const;
virtual RID get_rid() const override;
Occluder3D();
~Occluder3D();
};
class OccluderInstance3D : public VisualInstance3D {
GDCLASS(OccluderInstance3D, Node3D);
private:
Ref<Occluder3D> occluder;
uint32_t bake_mask = 0xFFFFFFFF;
void _occluder_changed();
bool _bake_material_check(Ref<Material> p_material);
void _bake_node(Node *p_node, PackedVector3Array &r_vertices, PackedInt32Array &r_indices);
protected:
static void _bind_methods();
public:
enum BakeError {
BAKE_ERROR_OK,
BAKE_ERROR_NO_SAVE_PATH,
BAKE_ERROR_NO_MESHES,
};
void set_occluder(const Ref<Occluder3D> &p_occluder);
Ref<Occluder3D> get_occluder() const;
virtual AABB get_aabb() const override;
virtual Vector<Face3> get_faces(uint32_t p_usage_flags) const override;
void set_bake_mask(uint32_t p_mask);
uint32_t get_bake_mask() const;
void set_bake_mask_bit(int p_layer, bool p_enable);
bool get_bake_mask_bit(int p_layer) const;
BakeError bake(Node *p_from_node, String p_occluder_path = "");
OccluderInstance3D();
~OccluderInstance3D();
};
#endif // OCCLUDER_INSTANCE_3D_H

View file

@ -338,6 +338,15 @@ GeometryInstance3D::GIMode GeometryInstance3D::get_gi_mode() const {
return gi_mode;
}
void GeometryInstance3D::set_ignore_occlusion_culling(bool p_enabled) {
ignore_occlusion_culling = p_enabled;
RS::get_singleton()->instance_geometry_set_flag(get_instance(), RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, ignore_occlusion_culling);
}
bool GeometryInstance3D::is_ignoring_occlusion_culling() {
return ignore_occlusion_culling;
}
void GeometryInstance3D::_bind_methods() {
ClassDB::bind_method(D_METHOD("set_material_override", "material"), &GeometryInstance3D::set_material_override);
ClassDB::bind_method(D_METHOD("get_material_override"), &GeometryInstance3D::get_material_override);
@ -345,21 +354,24 @@ void GeometryInstance3D::_bind_methods() {
ClassDB::bind_method(D_METHOD("set_cast_shadows_setting", "shadow_casting_setting"), &GeometryInstance3D::set_cast_shadows_setting);
ClassDB::bind_method(D_METHOD("get_cast_shadows_setting"), &GeometryInstance3D::get_cast_shadows_setting);
ClassDB::bind_method(D_METHOD("set_lod_bias", "bias"), &GeometryInstance3D::set_lod_bias);
ClassDB::bind_method(D_METHOD("get_lod_bias"), &GeometryInstance3D::get_lod_bias);
ClassDB::bind_method(D_METHOD("set_lod_max_hysteresis", "mode"), &GeometryInstance3D::set_lod_max_hysteresis);
ClassDB::bind_method(D_METHOD("get_lod_max_hysteresis"), &GeometryInstance3D::get_lod_max_hysteresis);
ClassDB::bind_method(D_METHOD("set_lod_max_distance", "mode"), &GeometryInstance3D::set_lod_max_distance);
ClassDB::bind_method(D_METHOD("get_lod_max_distance"), &GeometryInstance3D::get_lod_max_distance);
ClassDB::bind_method(D_METHOD("set_shader_instance_uniform", "uniform", "value"), &GeometryInstance3D::set_shader_instance_uniform);
ClassDB::bind_method(D_METHOD("get_shader_instance_uniform", "uniform"), &GeometryInstance3D::get_shader_instance_uniform);
ClassDB::bind_method(D_METHOD("set_lod_min_hysteresis", "mode"), &GeometryInstance3D::set_lod_min_hysteresis);
ClassDB::bind_method(D_METHOD("get_lod_min_hysteresis"), &GeometryInstance3D::get_lod_min_hysteresis);
ClassDB::bind_method(D_METHOD("set_lod_min_distance", "mode"), &GeometryInstance3D::set_lod_min_distance);
ClassDB::bind_method(D_METHOD("get_lod_min_distance"), &GeometryInstance3D::get_lod_min_distance);
ClassDB::bind_method(D_METHOD("set_shader_instance_uniform", "uniform", "value"), &GeometryInstance3D::set_shader_instance_uniform);
ClassDB::bind_method(D_METHOD("get_shader_instance_uniform", "uniform"), &GeometryInstance3D::get_shader_instance_uniform);
ClassDB::bind_method(D_METHOD("set_extra_cull_margin", "margin"), &GeometryInstance3D::set_extra_cull_margin);
ClassDB::bind_method(D_METHOD("get_extra_cull_margin"), &GeometryInstance3D::get_extra_cull_margin);
@ -369,8 +381,8 @@ void GeometryInstance3D::_bind_methods() {
ClassDB::bind_method(D_METHOD("set_gi_mode", "mode"), &GeometryInstance3D::set_gi_mode);
ClassDB::bind_method(D_METHOD("get_gi_mode"), &GeometryInstance3D::get_gi_mode);
ClassDB::bind_method(D_METHOD("set_lod_bias", "bias"), &GeometryInstance3D::set_lod_bias);
ClassDB::bind_method(D_METHOD("get_lod_bias"), &GeometryInstance3D::get_lod_bias);
ClassDB::bind_method(D_METHOD("set_ignore_occlusion_culling", "ignore_culling"), &GeometryInstance3D::set_ignore_occlusion_culling);
ClassDB::bind_method(D_METHOD("is_ignoring_occlusion_culling"), &GeometryInstance3D::is_ignoring_occlusion_culling);
ClassDB::bind_method(D_METHOD("set_custom_aabb", "aabb"), &GeometryInstance3D::set_custom_aabb);
@ -381,6 +393,7 @@ void GeometryInstance3D::_bind_methods() {
ADD_PROPERTY(PropertyInfo(Variant::INT, "cast_shadow", PROPERTY_HINT_ENUM, "Off,On,Double-Sided,Shadows Only"), "set_cast_shadows_setting", "get_cast_shadows_setting");
ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "extra_cull_margin", PROPERTY_HINT_RANGE, "0,16384,0.01"), "set_extra_cull_margin", "get_extra_cull_margin");
ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "lod_bias", PROPERTY_HINT_RANGE, "0.001,128,0.001"), "set_lod_bias", "get_lod_bias");
ADD_PROPERTY(PropertyInfo(Variant::BOOL, "ignore_occlusion_culling"), "set_ignore_occlusion_culling", "is_ignoring_occlusion_culling");
ADD_GROUP("Global Illumination", "gi_");
ADD_PROPERTY(PropertyInfo(Variant::INT, "gi_mode", PROPERTY_HINT_ENUM, "Disabled,Baked,Dynamic"), "set_gi_mode", "get_gi_mode");
ADD_PROPERTY(PropertyInfo(Variant::INT, "gi_lightmap_scale", PROPERTY_HINT_ENUM, "1x,2x,4x,8x"), "set_lightmap_scale", "get_lightmap_scale");

View file

@ -120,6 +120,7 @@ private:
float extra_cull_margin = 0.0;
LightmapScale lightmap_scale = LIGHTMAP_SCALE_1X;
GIMode gi_mode = GI_MODE_DISABLED;
bool ignore_occlusion_culling = false;
const StringName *_instance_uniform_get_remap(const StringName p_name) const;
@ -167,6 +168,9 @@ public:
void set_custom_aabb(AABB aabb);
void set_ignore_occlusion_culling(bool p_enabled);
bool is_ignoring_occlusion_culling();
GeometryInstance3D();
};

View file

@ -1378,6 +1378,9 @@ SceneTree::SceneTree() {
const bool use_debanding = GLOBAL_DEF("rendering/anti_aliasing/quality/use_debanding", false);
root->set_use_debanding(use_debanding);
const bool use_occlusion_culling = GLOBAL_DEF("rendering/occlusion_culling/use_occlusion_culling", false);
root->set_use_occlusion_culling(use_occlusion_culling);
float lod_threshold = GLOBAL_DEF("rendering/mesh_lod/lod_change/threshold_pixels", 1.0);
ProjectSettings::get_singleton()->set_custom_property_info("rendering/mesh_lod/lod_change/threshold_pixels", PropertyInfo(Variant::FLOAT, "rendering/mesh_lod/lod_change/threshold_pixels", PROPERTY_HINT_RANGE, "0,1024,0.1"));
root->set_lod_threshold(lod_threshold);

View file

@ -3242,6 +3242,21 @@ float Viewport::get_lod_threshold() const {
return lod_threshold;
}
void Viewport::set_use_occlusion_culling(bool p_use_occlusion_culling) {
if (use_occlusion_culling == p_use_occlusion_culling) {
return;
}
use_occlusion_culling = p_use_occlusion_culling;
RS::get_singleton()->viewport_set_use_occlusion_culling(viewport, p_use_occlusion_culling);
notify_property_list_changed();
}
bool Viewport::is_using_occlusion_culling() const {
return use_occlusion_culling;
}
void Viewport::set_debug_draw(DebugDraw p_debug_draw) {
debug_draw = p_debug_draw;
RS::get_singleton()->viewport_set_debug_draw(viewport, RS::ViewportDebugDraw(p_debug_draw));
@ -3331,9 +3346,6 @@ bool Viewport::is_handling_input_locally() const {
return handle_input_locally;
}
void Viewport::_validate_property(PropertyInfo &property) const {
}
void Viewport::set_default_canvas_item_texture_filter(DefaultCanvasItemTextureFilter p_filter) {
ERR_FAIL_INDEX(p_filter, DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_MAX);
@ -3478,6 +3490,9 @@ void Viewport::_bind_methods() {
ClassDB::bind_method(D_METHOD("set_use_debanding", "enable"), &Viewport::set_use_debanding);
ClassDB::bind_method(D_METHOD("is_using_debanding"), &Viewport::is_using_debanding);
ClassDB::bind_method(D_METHOD("set_use_occlusion_culling", "enable"), &Viewport::set_use_occlusion_culling);
ClassDB::bind_method(D_METHOD("is_using_occlusion_culling"), &Viewport::is_using_occlusion_culling);
ClassDB::bind_method(D_METHOD("set_debug_draw", "debug_draw"), &Viewport::set_debug_draw);
ClassDB::bind_method(D_METHOD("get_debug_draw"), &Viewport::get_debug_draw);
@ -3574,6 +3589,7 @@ void Viewport::_bind_methods() {
ADD_PROPERTY(PropertyInfo(Variant::INT, "msaa", PROPERTY_HINT_ENUM, "Disabled,2x,4x,8x,16x,AndroidVR 2x,AndroidVR 4x"), "set_msaa", "get_msaa");
ADD_PROPERTY(PropertyInfo(Variant::INT, "screen_space_aa", PROPERTY_HINT_ENUM, "Disabled,FXAA"), "set_screen_space_aa", "get_screen_space_aa");
ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_debanding"), "set_use_debanding", "is_using_debanding");
ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_occlusion_culling"), "set_use_occlusion_culling", "is_using_occlusion_culling");
ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "lod_threshold", PROPERTY_HINT_RANGE, "0,1024,0.1"), "set_lod_threshold", "get_lod_threshold");
ADD_PROPERTY(PropertyInfo(Variant::INT, "debug_draw", PROPERTY_HINT_ENUM, "Disabled,Unshaded,Overdraw,Wireframe"), "set_debug_draw", "get_debug_draw");
ADD_GROUP("Canvas Items", "canvas_item_");
@ -3655,6 +3671,7 @@ void Viewport::_bind_methods() {
BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_SPOT_LIGHTS);
BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_DECALS);
BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_REFLECTION_PROBES);
BIND_ENUM_CONSTANT(DEBUG_DRAW_OCCLUDERS);
BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_NEAREST);
BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_LINEAR);

View file

@ -147,6 +147,7 @@ public:
DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
DEBUG_DRAW_CLUSTER_DECALS,
DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
DEBUG_DRAW_OCCLUDERS,
};
enum DefaultCanvasItemTextureFilter {
@ -304,6 +305,7 @@ private:
ScreenSpaceAA screen_space_aa = SCREEN_SPACE_AA_DISABLED;
bool use_debanding = false;
float lod_threshold = 1.0;
bool use_occlusion_culling = false;
Ref<ViewportTexture> default_texture;
Set<ViewportTexture *> viewport_textures;
@ -480,7 +482,6 @@ protected:
void _notification(int p_what);
void _process_picking();
static void _bind_methods();
virtual void _validate_property(PropertyInfo &property) const override;
public:
uint64_t get_processed_events_count() const { return event_count; }
@ -556,6 +557,9 @@ public:
void set_lod_threshold(float p_pixels);
float get_lod_threshold() const;
void set_use_occlusion_culling(bool p_use_occlusion_culling);
bool is_using_occlusion_culling() const;
Vector2 get_camera_coords(const Vector2 &p_viewport_coords) const;
Vector2 get_camera_rect_size() const;

View file

@ -208,6 +208,7 @@
#include "scene/3d/navigation_agent_3d.h"
#include "scene/3d/navigation_obstacle_3d.h"
#include "scene/3d/navigation_region_3d.h"
#include "scene/3d/occluder_instance_3d.h"
#include "scene/3d/path_3d.h"
#include "scene/3d/physics_body_3d.h"
#include "scene/3d/physics_joint_3d.h"
@ -442,6 +443,8 @@ void register_scene_types() {
ClassDB::register_class<XRAnchor3D>();
ClassDB::register_class<XROrigin3D>();
ClassDB::register_class<MeshInstance3D>();
ClassDB::register_class<OccluderInstance3D>();
ClassDB::register_class<Occluder3D>();
ClassDB::register_class<ImmediateGeometry3D>();
ClassDB::register_virtual_class<SpriteBase3D>();
ClassDB::register_class<Sprite3D>();

View file

@ -321,7 +321,7 @@ void World3D::_bind_methods() {
ClassDB::bind_method(D_METHOD("get_environment"), &World3D::get_environment);
ClassDB::bind_method(D_METHOD("set_fallback_environment", "env"), &World3D::set_fallback_environment);
ClassDB::bind_method(D_METHOD("get_fallback_environment"), &World3D::get_fallback_environment);
ClassDB::bind_method(D_METHOD("set_camera_effects", "env"), &World3D::set_camera_effects);
ClassDB::bind_method(D_METHOD("set_camera_effects", "effects"), &World3D::set_camera_effects);
ClassDB::bind_method(D_METHOD("get_camera_effects"), &World3D::get_camera_effects);
ClassDB::bind_method(D_METHOD("get_direct_space_state"), &World3D::get_direct_space_state);
ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "environment", PROPERTY_HINT_RESOURCE_TYPE, "Environment"), "set_environment", "get_environment");

View file

@ -1873,7 +1873,7 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(RID p_rende
storage->render_target_disable_clear_request(rb->render_target);
}
void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas) {
void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas, RID p_occlusion_buffer) {
EffectsRD *effects = storage->get_effects();
RenderBuffers *rb = render_buffers_owner.getornull(p_render_buffers);
@ -1932,6 +1932,13 @@ void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID
RID reflection_texture = rb->reflection_buffer;
effects->copy_to_fb_rect(ambient_texture, storage->render_target_get_rd_framebuffer(rb->render_target), Rect2(Vector2(), rtsize), false, false, false, true, reflection_texture);
}
if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_OCCLUDERS) {
if (p_occlusion_buffer.is_valid()) {
Size2 rtsize = storage->render_target_get_size(rb->render_target);
effects->copy_to_fb_rect(storage->texture_get_rd_texture(p_occlusion_buffer), storage->render_target_get_rd_framebuffer(rb->render_target), Rect2i(Vector2(), rtsize), true, false);
}
}
}
void RendererSceneRenderRD::environment_set_adjustment(RID p_env, bool p_enable, float p_brightness, float p_contrast, float p_saturation, bool p_use_1d_color_correction, RID p_color_correction) {
@ -3516,7 +3523,7 @@ void RendererSceneRenderRD::_pre_opaque_render(bool p_use_ssao, bool p_use_gi, R
}
}
void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data) {
void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data) {
// getting this here now so we can direct call a bunch of things more easily
RenderBuffers *rb = nullptr;
if (p_render_buffers.is_valid()) {
@ -3643,7 +3650,7 @@ void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &
RENDER_TIMESTAMP("Tonemap");
_render_buffers_post_process_and_tonemap(p_render_buffers, p_environment, p_camera_effects, p_cam_projection);
_render_buffers_debug_draw(p_render_buffers, p_shadow_atlas);
_render_buffers_debug_draw(p_render_buffers, p_shadow_atlas, p_occluder_debug_tex);
if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_SDFGI && rb != nullptr && rb->sdfgi != nullptr) {
rb->sdfgi->debug_draw(p_cam_projection, p_cam_transform, rb->width, rb->height, rb->render_target, rb->texture);
}

View file

@ -441,7 +441,7 @@ private:
void _allocate_blur_textures(RenderBuffers *rb);
void _allocate_luminance_textures(RenderBuffers *rb);
void _render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas);
void _render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas, RID p_occlusion_buffer);
void _render_buffers_post_process_and_tonemap(RID p_render_buffers, RID p_environment, RID p_camera_effects, const CameraMatrix &p_projection);
/* Cluster */
@ -1125,7 +1125,7 @@ public:
float render_buffers_get_volumetric_fog_end(RID p_render_buffers);
float render_buffers_get_volumetric_fog_detail_spread(RID p_render_buffers);
void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr);
void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr);
void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region);

View file

@ -49,6 +49,10 @@ public:
virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable) = 0;
virtual bool is_camera(RID p_camera) const = 0;
virtual RID occluder_allocate() = 0;
virtual void occluder_initialize(RID p_occluder) = 0;
virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) = 0;
virtual RID scenario_allocate() = 0;
virtual void scenario_initialize(RID p_rid) = 0;
@ -197,8 +201,8 @@ public:
virtual void sdfgi_set_debug_probe_select(const Vector3 &p_position, const Vector3 &p_dir) = 0;
virtual void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas) = 0;
virtual void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
virtual void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
virtual void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
virtual void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
virtual void update() = 0;
virtual void render_probes() = 0;

View file

@ -109,6 +109,20 @@ bool RendererSceneCull::is_camera(RID p_camera) const {
return camera_owner.owns(p_camera);
}
/* OCCLUDER API */
RID RendererSceneCull::occluder_allocate() {
return RendererSceneOcclusionCull::get_singleton()->occluder_allocate();
}
void RendererSceneCull::occluder_initialize(RID p_rid) {
RendererSceneOcclusionCull::get_singleton()->occluder_initialize(p_rid);
}
void RendererSceneCull::occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) {
RendererSceneOcclusionCull::get_singleton()->occluder_set_mesh(p_occluder, p_vertices, p_indices);
}
/* SCENARIO API */
void RendererSceneCull::_instance_pair(Instance *p_A, Instance *p_B) {
@ -310,6 +324,8 @@ void RendererSceneCull::scenario_initialize(RID p_rid) {
scenario->instance_aabbs.set_page_pool(&instance_aabb_page_pool);
scenario->instance_data.set_page_pool(&instance_data_page_pool);
RendererSceneOcclusionCull::get_singleton()->add_scenario(p_rid);
scenario_owner.initialize_rid(p_rid, scenario);
}
@ -497,6 +513,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
scene_render->free(gi_probe->probe_instance);
} break;
case RS::INSTANCE_OCCLUDER: {
if (scenario && instance->visible) {
RendererSceneOcclusionCull::get_singleton()->scenario_remove_instance(instance->scenario->self, p_instance);
}
} break;
default: {
}
}
@ -514,6 +535,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
if (p_base.is_valid()) {
instance->base_type = RSG::storage->get_base_type(p_base);
if (instance->base_type == RS::INSTANCE_NONE && RendererSceneOcclusionCull::get_singleton()->is_occluder(p_base)) {
instance->base_type = RS::INSTANCE_OCCLUDER;
}
ERR_FAIL_COND(instance->base_type == RS::INSTANCE_NONE);
switch (instance->base_type) {
@ -588,6 +614,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
gi_probe->probe_instance = scene_render->gi_probe_instance_create(p_base);
} break;
case RS::INSTANCE_OCCLUDER: {
if (scenario) {
RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(scenario->self, p_instance, p_base, instance->transform, instance->visible);
}
} break;
default: {
}
}
@ -655,6 +686,11 @@ void RendererSceneCull::instance_set_scenario(RID p_instance, RID p_scenario) {
gi_probe_update_list.remove(&gi_probe->update_element);
}
} break;
case RS::INSTANCE_OCCLUDER: {
if (instance->visible) {
RendererSceneOcclusionCull::get_singleton()->scenario_remove_instance(instance->scenario->self, p_instance);
}
} break;
default: {
}
}
@ -684,6 +720,9 @@ void RendererSceneCull::instance_set_scenario(RID p_instance, RID p_scenario) {
gi_probe_update_list.add(&gi_probe->update_element);
}
} break;
case RS::INSTANCE_OCCLUDER: {
RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(scenario->self, p_instance, instance->base, instance->transform, instance->visible);
} break;
default: {
}
}
@ -801,6 +840,12 @@ void RendererSceneCull::instance_set_visible(RID p_instance, bool p_visible) {
InstanceParticlesCollisionData *collision = static_cast<InstanceParticlesCollisionData *>(instance->base_data);
RSG::storage->particles_collision_instance_set_active(collision->instance, p_visible);
}
if (instance->base_type == RS::INSTANCE_OCCLUDER) {
if (instance->scenario) {
RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(instance->scenario->self, p_instance, instance->base, instance->transform, p_visible);
}
}
}
inline bool is_geometry_instance(RenderingServer::InstanceType p_type) {
@ -998,6 +1043,18 @@ void RendererSceneCull::instance_geometry_set_flag(RID p_instance, RS::InstanceF
}
} break;
case RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING: {
instance->ignore_occlusion_culling = p_enabled;
if (instance->scenario && instance->array_index >= 0) {
InstanceData &idata = instance->scenario->instance_data[instance->array_index];
if (instance->ignore_occlusion_culling) {
idata.flags |= InstanceData::FLAG_IGNORE_OCCLUSION_CULLING;
} else {
idata.flags &= ~uint32_t(InstanceData::FLAG_IGNORE_OCCLUSION_CULLING);
}
}
} break;
default: {
}
}
@ -1210,6 +1267,10 @@ void RendererSceneCull::_update_instance(Instance *p_instance) {
heightfield_particle_colliders_update_list.insert(p_instance);
}
RSG::storage->particles_collision_instance_set_transform(collision->instance, p_instance->transform);
} else if (p_instance->base_type == RS::INSTANCE_OCCLUDER) {
if (p_instance->scenario) {
RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(p_instance->scenario->self, p_instance->self, p_instance->base, p_instance->transform, p_instance->visible);
}
}
if (p_instance->aabb.has_no_surface()) {
@ -1337,6 +1398,9 @@ void RendererSceneCull::_update_instance(Instance *p_instance) {
if (p_instance->mesh_instance.is_valid()) {
idata.flags |= InstanceData::FLAG_USES_MESH_INSTANCE;
}
if (p_instance->ignore_occlusion_culling) {
idata.flags |= InstanceData::FLAG_IGNORE_OCCLUSION_CULLING;
}
p_instance->scenario->instance_data.push_back(idata);
p_instance->scenario->instance_aabbs.push_back(InstanceBounds(p_instance->transformed_aabb));
@ -2119,7 +2183,7 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
return animated_material_found;
}
void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
// render to mono camera
#ifndef _3D_DISABLED
@ -2164,11 +2228,14 @@ void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_
RID environment = _render_get_environment(p_camera, p_scenario);
_render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
RENDER_TIMESTAMP("Update occlusion buffer")
RendererSceneOcclusionCull::get_singleton()->buffer_update(p_viewport, camera->transform, camera_matrix, ortho, RendererThreadPool::singleton->thread_work_pool);
_render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_viewport, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
#endif
}
void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
// render for AR/VR interface
#if 0
Camera *camera = camera_owner.getornull(p_camera);
@ -2253,7 +2320,7 @@ void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_
#endif
};
void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data) {
void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, CullData *cull_data) {
uint32_t cull_total = cull_data->scenario->instance_data.size();
uint32_t total_threads = RendererThreadPool::singleton->thread_work_pool.get_thread_count();
uint32_t cull_from = p_thread * cull_total / total_threads;
@ -2262,7 +2329,7 @@ void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullDat
_frustum_cull(*cull_data, frustum_cull_result_threads[p_thread], cull_from, cull_to);
}
void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to) {
void RendererSceneCull::_frustum_cull(CullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to) {
uint64_t frame_number = RSG::rasterizer->get_frame_number();
float lightmap_probe_update_speed = RSG::storage->lightmap_get_probe_capture_update_speed() * RSG::rasterizer->get_frame_delta_time();
@ -2271,10 +2338,14 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
RID instance_pair_buffer[MAX_INSTANCE_PAIRS];
Transform inv_cam_transform = cull_data.cam_transform.inverse();
float z_near = cull_data.camera_matrix->get_z_near();
for (uint64_t i = p_from; i < p_to; i++) {
bool mesh_visible = false;
if (cull_data.scenario->instance_aabbs[i].in_frustum(cull_data.cull->frustum)) {
if (cull_data.scenario->instance_aabbs[i].in_frustum(cull_data.cull->frustum) && (cull_data.occlusion_buffer == nullptr || cull_data.scenario->instance_data[i].flags & InstanceData::FLAG_IGNORE_OCCLUSION_CULLING ||
!cull_data.occlusion_buffer->is_occluded(cull_data.scenario->instance_aabbs[i].bounds, cull_data.cam_transform.origin, inv_cam_transform, *cull_data.camera_matrix, z_near))) {
InstanceData &idata = cull_data.scenario->instance_data[i];
uint32_t base_type = idata.flags & InstanceData::FLAG_BASE_TYPE_MASK;
@ -2469,7 +2540,7 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
}
}
void RendererSceneCull::_render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) {
void RendererSceneCull::_render_scene(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) {
// Note, in stereo rendering:
// - p_cam_transform will be a transform in the middle of our two eyes
// - p_cam_projection is a wider frustum that encompasses both eyes
@ -2566,7 +2637,7 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
uint64_t cull_from = 0;
uint64_t cull_to = scenario->instance_data.size();
FrustumCullData cull_data;
CullData cull_data;
//prepare for eventual thread usage
cull_data.cull = &cull;
@ -2575,6 +2646,8 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
cull_data.cam_transform = p_cam_transform;
cull_data.visible_layers = p_visible_layers;
cull_data.render_reflection_probe = render_reflection_probe;
cull_data.occlusion_buffer = RendererSceneOcclusionCull::get_singleton()->buffer_get_ptr(p_viewport);
cull_data.camera_matrix = &p_cam_projection;
//#define DEBUG_CULL_TIME
#ifdef DEBUG_CULL_TIME
uint64_t time_from = OS::get_singleton()->get_ticks_usec();
@ -2781,8 +2854,13 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
}
/* PROCESS GEOMETRY AND DRAW SCENE */
RID occluders_tex;
if (p_viewport.is_valid()) {
occluders_tex = RSG::viewport->viewport_get_occluder_debug_texture(p_viewport);
}
RENDER_TIMESTAMP("Render Scene ");
scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data);
scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, occluders_tex, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data);
for (uint32_t i = 0; i < max_shadows_used; i++) {
render_shadow_data[i].instances.clear();
@ -2829,7 +2907,7 @@ void RendererSceneCull::render_empty_scene(RID p_render_buffers, RID p_scenario,
environment = scenario->fallback_environment;
}
RENDER_TIMESTAMP("Render Empty Scene ");
scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray<RendererSceneRender::GeometryInstance *>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), RID(), RID(), p_shadow_atlas, scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr);
scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray<RendererSceneRender::GeometryInstance *>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), RID(), RID(), p_shadow_atlas, RID(), scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr);
#endif
}
@ -2899,7 +2977,7 @@ bool RendererSceneCull::_render_reflection_probe_step(Instance *p_instance, int
}
RENDER_TIMESTAMP("Render Reflection Probe, Step " + itos(p_step));
_render_scene(xform, cm, false, false, RID(), environment, RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows);
_render_scene(xform, cm, false, false, RID(), environment, RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, RID(), shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows);
} else {
//do roughness postprocess step until it believes it's done
@ -3473,8 +3551,11 @@ bool RendererSceneCull::free(RID p_rid) {
scene_render->free(scenario->reflection_probe_shadow_atlas);
scene_render->free(scenario->reflection_atlas);
scenario_owner.free(p_rid);
RendererSceneOcclusionCull::get_singleton()->remove_scenario(p_rid);
memdelete(scenario);
} else if (RendererSceneOcclusionCull::get_singleton()->is_occluder(p_rid)) {
RendererSceneOcclusionCull::get_singleton()->free_occluder(p_rid);
} else if (instance_owner.owns(p_rid)) {
// delete the instance
@ -3543,6 +3624,8 @@ RendererSceneCull::RendererSceneCull() {
indexer_update_iterations = GLOBAL_GET("rendering/limits/spatial_indexer/update_iterations_per_frame");
thread_cull_threshold = GLOBAL_GET("rendering/limits/spatial_indexer/threaded_cull_minimum_instances");
thread_cull_threshold = MAX(thread_cull_threshold, (uint32_t)RendererThreadPool::singleton->thread_work_pool.get_thread_count()); //make sure there is at least one thread per CPU
dummy_occlusion_culling = memnew(RendererSceneOcclusionCull);
}
RendererSceneCull::~RendererSceneCull() {
@ -3561,4 +3644,8 @@ RendererSceneCull::~RendererSceneCull() {
frustum_cull_result_threads[i].reset();
}
frustum_cull_result_threads.clear();
if (dummy_occlusion_culling) {
memdelete(dummy_occlusion_culling);
}
}
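A minimal usage sketch (not part of this patch) of the occluder functions added above, as reached through the RenderingServer front end (see the occluder_create()/occluder_set_mesh() bindings further down in this commit). The vertex/index arrays and the scenario RID are assumed to exist already; in practice this is driven by the OccluderInstance3D node rather than by hand:
RenderingServer *rs = RenderingServer::get_singleton();
// Build an occluder resource from raw geometry (arrays assumed to exist).
RID occluder = rs->occluder_create();
rs->occluder_set_mesh(occluder, vertices, indices); // PackedVector3Array / PackedInt32Array
// Expose it to a scenario through a regular instance; instance_set_base()
// resolves the base type to INSTANCE_OCCLUDER as shown in the hunk above.
RID instance = rs->instance_create();
rs->instance_set_scenario(instance, scenario);
rs->instance_set_base(instance, occluder);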

View file

@ -45,8 +45,10 @@
#include "core/templates/rid_owner.h"
#include "core/templates/self_list.h"
#include "servers/rendering/renderer_scene.h"
#include "servers/rendering/renderer_scene_occlusion_cull.h"
#include "servers/rendering/renderer_scene_render.h"
#include "servers/xr/xr_interface.h"
class RendererSceneCull : public RendererScene {
public:
RendererSceneRender *scene_render;
@ -109,6 +111,14 @@ public:
virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable);
virtual bool is_camera(RID p_camera) const;
/* OCCLUDER API */
virtual RID occluder_allocate();
virtual void occluder_initialize(RID p_occluder);
virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices);
RendererSceneOcclusionCull *dummy_occlusion_culling;
/* SCENARIO API */
struct Instance;
@ -248,6 +258,7 @@ public:
FLAG_USES_BAKED_LIGHT = (1 << 16),
FLAG_USES_MESH_INSTANCE = (1 << 17),
FLAG_REFLECTION_PROBE_DIRTY = (1 << 18),
FLAG_IGNORE_OCCLUSION_CULLING = (1 << 19),
};
uint32_t flags = 0;
@ -346,6 +357,8 @@ public:
float lod_bias;
bool ignore_occlusion_culling;
Vector<RID> materials;
RS::ShadowCastingSetting cast_shadows;
@ -470,6 +483,7 @@ public:
lightmap = nullptr;
lightmap_cull_index = 0;
lod_bias = 1.0;
ignore_occlusion_culling = false;
scenario = nullptr;
@ -921,24 +935,26 @@ public:
Frustum frustum;
} cull;
struct FrustumCullData {
struct CullData {
Cull *cull;
Scenario *scenario;
RID shadow_atlas;
Transform cam_transform;
uint32_t visible_layers;
Instance *render_reflection_probe;
const RendererSceneOcclusionCull::HZBuffer *occlusion_buffer;
const CameraMatrix *camera_matrix;
};
void _frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data);
void _frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to);
void _frustum_cull_threaded(uint32_t p_thread, CullData *cull_data);
void _frustum_cull(CullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to);
bool _render_reflection_probe_step(Instance *p_instance, int p_step);
void _render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true);
void _render_scene(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true);
void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas);
void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
void update_dirty_instances();
void render_particle_colliders();
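Condensed to plain booleans, the per-instance visibility test applied by _frustum_cull() (see the combined condition in the .cpp above) reads as the sketch below; this is a restatement for readability, not new logic:
// Visible iff inside the frustum AND (the viewport has no occlusion buffer
// OR the instance opts out via FLAG_IGNORE_OCCLUSION_CULLING
// OR the hierarchical depth test cannot prove it hidden).
static bool instance_visible(bool in_frustum, bool has_occlusion_buffer,
        bool ignores_occlusion_culling, bool hzb_says_occluded) {
    return in_frustum && (!has_occlusion_buffer || ignores_occlusion_culling || !hzb_says_occluded);
}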

View file

@ -0,0 +1,192 @@
/*************************************************************************/
/* renderer_scene_occlusion_cull.cpp */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#include "renderer_scene_occlusion_cull.h"
RendererSceneOcclusionCull *RendererSceneOcclusionCull::singleton = nullptr;
const Vector3 RendererSceneOcclusionCull::HZBuffer::corners[8] = {
Vector3(0, 0, 0),
Vector3(0, 0, 1),
Vector3(0, 1, 0),
Vector3(0, 1, 1),
Vector3(1, 0, 0),
Vector3(1, 0, 1),
Vector3(1, 1, 0),
Vector3(1, 1, 1)
};
bool RendererSceneOcclusionCull::HZBuffer::is_empty() const {
return sizes.is_empty();
}
void RendererSceneOcclusionCull::HZBuffer::clear() {
if (sizes.is_empty()) {
return; // Already cleared
}
data.clear();
sizes.clear();
mips.clear();
debug_data.clear();
if (debug_image.is_valid()) {
debug_image.unref();
}
RS::get_singleton()->free(debug_texture);
}
void RendererSceneOcclusionCull::HZBuffer::resize(const Size2i &p_size) {
if (p_size == Size2i()) {
clear();
return;
}
if (!sizes.is_empty() && p_size == sizes[0]) {
return; // Size didn't change
}
int mip_count = 0;
int data_size = 0;
int w = p_size.x;
int h = p_size.y;
while (true) {
data_size += h * w;
w = MAX(1, w >> 1);
h = MAX(1, h >> 1);
mip_count++;
if (w == 1U && h == 1U) {
data_size += 1U;
mip_count++;
break;
}
}
data.resize(data_size);
mips.resize(mip_count);
sizes.resize(mip_count);
w = p_size.x;
h = p_size.y;
float *ptr = data.ptr();
for (int i = 0; i < mip_count; i++) {
sizes[i] = Size2i(w, h);
mips[i] = ptr;
ptr = &ptr[w * h];
w = MAX(1, w >> 1);
h = MAX(1, h >> 1);
}
for (int i = 0; i < data_size; i++) {
data[i] = FLT_MAX;
}
debug_data.resize(sizes[0].x * sizes[0].y);
if (debug_texture.is_valid()) {
RS::get_singleton()->free(debug_texture);
debug_texture = RID();
}
}
void RendererSceneOcclusionCull::HZBuffer::update_mips() {
if (sizes.is_empty()) {
return;
}
for (uint32_t mip = 1; mip < mips.size(); mip++) {
for (int y = 0; y < sizes[mip].y; y++) {
for (int x = 0; x < sizes[mip].x; x++) {
int prev_x = x * 2;
int prev_y = y * 2;
int prev_w = sizes[mip - 1].width;
int prev_h = sizes[mip - 1].height;
bool odd_w = (prev_w % 2) != 0;
bool odd_h = (prev_h % 2) != 0;
#define CHECK_OFFSET(xx, yy) max_depth = MAX(max_depth, mips[mip - 1][MIN(prev_h - 1, prev_y + (yy)) * prev_w + MIN(prev_w - 1, prev_x + (xx))])
float max_depth = mips[mip - 1][prev_y * sizes[mip - 1].x + prev_x];
CHECK_OFFSET(0, 1);
CHECK_OFFSET(1, 0);
CHECK_OFFSET(1, 1);
if (odd_w) {
CHECK_OFFSET(2, 0);
CHECK_OFFSET(2, 1);
}
if (odd_h) {
CHECK_OFFSET(0, 2);
CHECK_OFFSET(1, 2);
}
if (odd_w && odd_h) {
CHECK_OFFSET(2, 2);
}
mips[mip][y * sizes[mip].x + x] = max_depth;
#undef CHECK_OFFSET
}
}
}
}
RID RendererSceneOcclusionCull::HZBuffer::get_debug_texture() {
if (sizes.is_empty() || sizes[0] == Size2i()) {
return RID();
}
if (debug_image.is_null()) {
debug_image.instance();
}
unsigned char *ptrw = debug_data.ptrw();
for (int i = 0; i < debug_data.size(); i++) {
ptrw[i] = MIN(mips[0][i] / debug_tex_range, 1.0) * 255;
}
debug_image->create(sizes[0].x, sizes[0].y, false, Image::FORMAT_L8, debug_data);
if (debug_texture.is_null()) {
debug_texture = RS::get_singleton()->texture_2d_create(debug_image);
} else {
RenderingServer::get_singleton()->texture_2d_update_immediate(debug_texture, debug_image);
}
return debug_texture;
}
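The per-texel rule implemented by update_mips() above is a conservative max-reduction. A minimal standalone sketch of the 2x2 case, assuming even parent dimensions (the real code additionally folds in the extra row/column when the parent mip has an odd size):
#include <algorithm>
// One texel of mip N takes the MAXIMUM depth of the 2x2 block it covers in
// mip N - 1. Keeping the farthest depth makes the buffer conservative: a
// coarse texel can only hide an AABB if even the farthest occluder it covers
// is still closer than the AABB's nearest point.
static float downsample_2x2(const float *prev_mip, int prev_w, int x, int y) {
    float d = prev_mip[(2 * y) * prev_w + (2 * x)];
    d = std::max(d, prev_mip[(2 * y) * prev_w + (2 * x + 1)]);
    d = std::max(d, prev_mip[(2 * y + 1) * prev_w + (2 * x)]);
    d = std::max(d, prev_mip[(2 * y + 1) * prev_w + (2 * x + 1)]);
    return d;
}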

View file

@ -0,0 +1,201 @@
/*************************************************************************/
/* renderer_scene_occlusion_cull.h */
/*************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/*************************************************************************/
#ifndef RENDERER_SCENE_OCCLUSION_CULL_H
#define RENDERER_SCENE_OCCLUSION_CULL_H
#include "core/math/camera_matrix.h"
#include "core/templates/local_vector.h"
#include "servers/rendering_server.h"
class RendererSceneOcclusionCull {
protected:
static RendererSceneOcclusionCull *singleton;
public:
class HZBuffer {
protected:
static const Vector3 corners[8];
LocalVector<float> data;
LocalVector<Size2i> sizes;
LocalVector<float *> mips;
RID debug_texture;
Ref<Image> debug_image;
PackedByteArray debug_data;
float debug_tex_range = 0.0f;
public:
bool is_empty() const;
virtual void clear();
virtual void resize(const Size2i &p_size);
void update_mips();
_FORCE_INLINE_ bool is_occluded(const float p_bounds[6], const Vector3 &p_cam_position, const Transform &p_cam_inv_transform, const CameraMatrix &p_cam_projection, float p_near) const {
if (is_empty()) {
return false;
}
Vector3 closest_point = Vector3(CLAMP(p_cam_position.x, p_bounds[0], p_bounds[3]), CLAMP(p_cam_position.y, p_bounds[1], p_bounds[4]), CLAMP(p_cam_position.z, p_bounds[2], p_bounds[5]));
if (closest_point == p_cam_position) {
return false;
}
Vector3 closest_point_view = p_cam_inv_transform.xform(closest_point);
if (closest_point_view.z > -p_near) {
return false;
}
float min_depth;
if (p_cam_projection.is_orthogonal()) {
min_depth = (-closest_point_view.z) - p_near;
} else {
float r = -p_near / closest_point_view.z;
Vector3 closest_point_proj = Vector3(closest_point_view.x * r, closest_point_view.y * r, -p_near);
min_depth = closest_point_proj.distance_to(closest_point_view);
}
Vector2 rect_min = Vector2(FLT_MAX, FLT_MAX);
Vector2 rect_max = Vector2(FLT_MIN, FLT_MIN);
for (int j = 0; j < 8; j++) {
Vector3 c = RendererSceneOcclusionCull::HZBuffer::corners[j];
Vector3 nc = Vector3(1, 1, 1) - c;
Vector3 corner = Vector3(p_bounds[0] * c.x + p_bounds[3] * nc.x, p_bounds[1] * c.y + p_bounds[4] * nc.y, p_bounds[2] * c.z + p_bounds[5] * nc.z);
Vector3 view = p_cam_inv_transform.xform(corner);
Vector3 projected = p_cam_projection.xform(view);
Vector2 normalized = Vector2(projected.x * 0.5f + 0.5f, projected.y * 0.5f + 0.5f);
rect_min = rect_min.min(normalized);
rect_max = rect_max.max(normalized);
}
rect_max = rect_max.min(Vector2(1, 1));
rect_min = rect_min.max(Vector2(0, 0));
int mip_count = mips.size();
Vector2 screen_diagonal = (rect_max - rect_min) * sizes[0];
float size = MAX(screen_diagonal.x, screen_diagonal.y);
float l = Math::ceil(Math::log2(size));
int lod = CLAMP(l, 0, mip_count - 1);
const int max_samples = 512;
int sample_count = 0;
bool visible = true;
for (; lod >= 0; lod--) {
int w = sizes[lod].x;
int h = sizes[lod].y;
int minx = CLAMP(rect_min.x * w - 1, 0, w - 1);
int maxx = CLAMP(rect_max.x * w + 1, 0, w - 1);
int miny = CLAMP(rect_min.y * h - 1, 0, h - 1);
int maxy = CLAMP(rect_max.y * h + 1, 0, h - 1);
sample_count += (maxx - minx + 1) * (maxy - miny + 1);
if (sample_count > max_samples) {
return false;
}
visible = false;
for (int y = miny; y <= maxy; y++) {
for (int x = minx; x <= maxx; x++) {
float depth = mips[lod][y * w + x];
if (depth > min_depth) {
visible = true;
break;
}
}
if (visible) {
break;
}
}
if (!visible) {
return true;
}
}
return !visible;
}
RID get_debug_texture();
virtual ~HZBuffer(){};
};
static RendererSceneOcclusionCull *get_singleton() { return singleton; }
void _print_warning() {
WARN_PRINT_ONCE("Occlusion culling is disabled at build time.");
}
virtual bool is_occluder(RID p_rid) { return false; }
virtual RID occluder_allocate() { return RID(); }
virtual void occluder_initialize(RID p_occluder) {}
virtual void free_occluder(RID p_occluder) { _print_warning(); }
virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) { _print_warning(); }
virtual void add_scenario(RID p_scenario) {}
virtual void remove_scenario(RID p_scenario) {}
virtual void scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) { _print_warning(); }
virtual void scenario_remove_instance(RID p_scenario, RID p_instance) { _print_warning(); }
virtual void add_buffer(RID p_buffer) { _print_warning(); }
virtual void remove_buffer(RID p_buffer) { _print_warning(); }
virtual HZBuffer *buffer_get_ptr(RID p_buffer) {
return nullptr;
}
virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) { _print_warning(); }
virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) { _print_warning(); }
virtual void buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {}
virtual RID buffer_get_debug_texture(RID p_buffer) {
_print_warning();
return RID();
}
virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {}
RendererSceneOcclusionCull() {
singleton = this;
};
virtual ~RendererSceneOcclusionCull() {
singleton = nullptr;
};
};
#endif //RENDERER_SCENE_OCCLUSION_CULL_H
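The mip level that is_occluded() starts sampling from is simply the level whose texels are roughly the size of the occludee's screen rectangle. A small sketch of that selection, assuming the rectangle is already measured in mip-0 texels:
#include <algorithm>
#include <cmath>
// Example: a 96 x 40 texel rectangle starts at ceil(log2(96)) = 7, clamped to
// the available mip count; is_occluded() then walks back toward mip 0 until
// its 512-sample budget is exhausted.
static int starting_mip(float rect_w, float rect_h, int mip_count) {
    float size = std::max(rect_w, rect_h);
    int lod = int(std::ceil(std::log2(size)));
    return std::clamp(lod, 0, mip_count - 1);
}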

View file

@ -216,7 +216,7 @@ public:
uint32_t positional_light_count;
};
virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0;
virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0;
virtual void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) = 0;
virtual void render_particle_collider_heightfield(RID p_collider, const Transform &p_transform, const PagedArray<GeometryInstance *> &p_instances) = 0;

View file

@ -79,11 +79,26 @@ void RendererViewport::_draw_3d(Viewport *p_viewport, XRInterface::Eyes p_eye) {
xr_interface = XRServer::get_singleton()->get_primary_interface();
}
if (p_viewport->use_occlusion_culling) {
if (p_viewport->occlusion_buffer_dirty) {
float aspect = p_viewport->size.aspect();
int max_size = occlusion_rays_per_thread * RendererThreadPool::singleton->thread_work_pool.get_thread_count();
int viewport_size = p_viewport->size.width * p_viewport->size.height;
max_size = CLAMP(max_size, viewport_size / (32 * 32), viewport_size / (2 * 2)); // At least one depth pixel for every 32x32 region. At most one depth pixel for every 2x2 region.
float height = Math::sqrt(max_size / aspect);
Size2i new_size = Size2i(height * aspect, height);
RendererSceneOcclusionCull::get_singleton()->buffer_set_size(p_viewport->self, new_size);
p_viewport->occlusion_buffer_dirty = false;
}
}
float screen_lod_threshold = p_viewport->lod_threshold / float(p_viewport->size.width);
if (p_viewport->use_xr && xr_interface.is_valid()) {
RSG::scene->render_camera(p_viewport->render_buffers, xr_interface, p_eye, p_viewport->camera, p_viewport->scenario, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
RSG::scene->render_camera(p_viewport->render_buffers, xr_interface, p_eye, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
} else {
RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
}
RENDER_TIMESTAMP("<End Rendering 3D Scene");
}
@ -647,6 +662,8 @@ void RendererViewport::viewport_set_size(RID p_viewport, int p_width, int p_heig
RSG::scene->render_buffers_configure(viewport->render_buffers, viewport->render_target, viewport->size.width, viewport->size.height, viewport->msaa, viewport->screen_space_aa, viewport->use_debanding);
}
}
viewport->occlusion_buffer_dirty = true;
}
void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) {
@ -655,6 +672,7 @@ void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) {
if (p_active) {
ERR_FAIL_COND(active_viewports.find(viewport) != -1); //already active
viewport->occlusion_buffer_dirty = true;
active_viewports.push_back(viewport);
} else {
active_viewports.erase(viewport);
@ -739,6 +757,16 @@ RID RendererViewport::viewport_get_texture(RID p_viewport) const {
return RSG::storage->render_target_get_texture(viewport->render_target);
}
RID RendererViewport::viewport_get_occluder_debug_texture(RID p_viewport) const {
const Viewport *viewport = viewport_owner.getornull(p_viewport);
ERR_FAIL_COND_V(!viewport, RID());
if (viewport->use_occlusion_culling && viewport->debug_draw == RenderingServer::VIEWPORT_DEBUG_DRAW_OCCLUDERS) {
return RendererSceneOcclusionCull::get_singleton()->buffer_get_debug_texture(p_viewport);
}
return RID();
}
void RendererViewport::viewport_set_hide_scenario(RID p_viewport, bool p_hide) {
Viewport *viewport = viewport_owner.getornull(p_viewport);
ERR_FAIL_COND(!viewport);
@ -772,6 +800,9 @@ void RendererViewport::viewport_set_scenario(RID p_viewport, RID p_scenario) {
ERR_FAIL_COND(!viewport);
viewport->scenario = p_scenario;
if (viewport->use_occlusion_culling) {
RendererSceneOcclusionCull::get_singleton()->buffer_set_scenario(p_viewport, p_scenario);
}
}
void RendererViewport::viewport_attach_canvas(RID p_viewport, RID p_canvas) {
@ -888,6 +919,41 @@ void RendererViewport::viewport_set_use_debanding(RID p_viewport, bool p_use_deb
}
}
void RendererViewport::viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling) {
Viewport *viewport = viewport_owner.getornull(p_viewport);
ERR_FAIL_COND(!viewport);
if (viewport->use_occlusion_culling == p_use_occlusion_culling) {
return;
}
viewport->use_occlusion_culling = p_use_occlusion_culling;
if (viewport->use_occlusion_culling) {
RendererSceneOcclusionCull::get_singleton()->add_buffer(p_viewport);
RendererSceneOcclusionCull::get_singleton()->buffer_set_scenario(p_viewport, viewport->scenario);
} else {
RendererSceneOcclusionCull::get_singleton()->remove_buffer(p_viewport);
}
viewport->occlusion_buffer_dirty = true;
}
void RendererViewport::viewport_set_occlusion_rays_per_thread(int p_rays_per_thread) {
if (occlusion_rays_per_thread == p_rays_per_thread) {
return;
}
occlusion_rays_per_thread = p_rays_per_thread;
for (int i = 0; i < active_viewports.size(); i++) {
active_viewports[i]->occlusion_buffer_dirty = true;
}
}
void RendererViewport::viewport_set_occlusion_culling_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {
RendererSceneOcclusionCull::get_singleton()->set_build_quality(p_quality);
}
void RendererViewport::viewport_set_lod_threshold(RID p_viewport, float p_pixels) {
Viewport *viewport = viewport_owner.getornull(p_viewport);
ERR_FAIL_COND(!viewport);
@ -985,6 +1051,10 @@ bool RendererViewport::free(RID p_rid) {
viewport_set_scenario(p_rid, RID());
active_viewports.erase(viewport);
if (viewport->use_occlusion_culling) {
RendererSceneOcclusionCull::get_singleton()->remove_buffer(p_rid);
}
viewport_owner.free(p_rid);
memdelete(viewport);
@ -1026,4 +1096,5 @@ void RendererViewport::call_set_use_vsync(bool p_enable) {
}
RendererViewport::RendererViewport() {
occlusion_rays_per_thread = GLOBAL_GET("rendering/occlusion_culling/occlusion_rays_per_thread");
}
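As a worked example of the sizing block added to _draw_3d() above (a sketch with plain ints and floats standing in for the engine types): with 8 worker threads at the default 512 rays per thread and a 1920x1080 viewport, max_size is 4096, which already lies inside the clamp range [2025, 518400], so the occlusion buffer ends up at roughly 85 x 48 depth samples.
#include <algorithm>
#include <cmath>
static void occlusion_buffer_size(int vp_w, int vp_h, int threads, int rays_per_thread, int &out_w, int &out_h) {
    float aspect = float(vp_w) / float(vp_h);
    int max_size = rays_per_thread * threads; // 512 * 8 = 4096 depth samples
    int viewport_size = vp_w * vp_h; // 1920 * 1080 = 2073600 pixels
    // At least one depth sample per 32x32 pixel block, at most one per 2x2 block.
    max_size = std::clamp(max_size, viewport_size / (32 * 32), viewport_size / (2 * 2));
    float height = std::sqrt(max_size / aspect); // sqrt(4096 / 1.777...) = 48
    out_w = int(height * aspect); // 85
    out_h = int(height); // 48
}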

View file

@ -31,9 +31,9 @@
#ifndef VISUALSERVERVIEWPORT_H
#define VISUALSERVERVIEWPORT_H
#include "core/templates/local_vector.h"
#include "core/templates/rid_owner.h"
#include "core/templates/self_list.h"
#include "renderer_compositor.h"
#include "servers/rendering_server.h"
#include "servers/xr/xr_interface.h"
@ -61,6 +61,9 @@ public:
RS::ViewportScreenSpaceAA screen_space_aa;
bool use_debanding;
bool use_occlusion_culling;
bool occlusion_buffer_dirty;
DisplayServer::WindowID viewport_to_screen;
Rect2 viewport_to_screen_rect;
bool viewport_render_direct_to_screen;
@ -143,6 +146,8 @@ public:
msaa = RS::VIEWPORT_MSAA_DISABLED;
screen_space_aa = RS::VIEWPORT_SCREEN_SPACE_AA_DISABLED;
use_debanding = false;
use_occlusion_culling = false;
occlusion_buffer_dirty = true;
snap_2d_transforms_to_pixel = false;
snap_2d_vertices_to_pixel = false;
@ -185,6 +190,10 @@ private:
void _draw_3d(Viewport *p_viewport, XRInterface::Eyes p_eye);
void _draw_viewport(Viewport *p_viewport, XRInterface::Eyes p_eye = XRInterface::EYE_MONO);
int occlusion_rays_per_thread = 512;
void _resize_occlusion_culling_buffer(const Size2i &p_size);
public:
RID viewport_allocate();
void viewport_initialize(RID p_rid);
@ -204,6 +213,7 @@ public:
void viewport_set_clear_mode(RID p_viewport, RS::ViewportClearMode p_clear_mode);
RID viewport_get_texture(RID p_viewport) const;
RID viewport_get_occluder_debug_texture(RID p_viewport) const;
void viewport_set_hide_scenario(RID p_viewport, bool p_hide);
void viewport_set_hide_canvas(RID p_viewport, bool p_hide);
@ -225,7 +235,9 @@ public:
void viewport_set_msaa(RID p_viewport, RS::ViewportMSAA p_msaa);
void viewport_set_screen_space_aa(RID p_viewport, RS::ViewportScreenSpaceAA p_mode);
void viewport_set_use_debanding(RID p_viewport, bool p_use_debanding);
void viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling);
void viewport_set_occlusion_rays_per_thread(int p_rays_per_thread);
void viewport_set_occlusion_culling_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality);
void viewport_set_lod_threshold(RID p_viewport, float p_pixels);
virtual int viewport_get_render_info(RID p_viewport, RS::ViewportRenderInfo p_info);

View file

@ -540,6 +540,10 @@ public:
FUNC2(camera_set_camera_effects, RID, RID)
FUNC2(camera_set_use_vertical_aspect, RID, bool)
/* OCCLUDER */
FUNCRIDSPLIT(occluder)
FUNC3(occluder_set_mesh, RID, const PackedVector3Array &, const PackedInt32Array &);
#undef server_name
#undef ServerName
//from now on, calls forwarded to this singleton
@ -590,6 +594,9 @@ public:
FUNC2(viewport_set_msaa, RID, ViewportMSAA)
FUNC2(viewport_set_screen_space_aa, RID, ViewportScreenSpaceAA)
FUNC2(viewport_set_use_debanding, RID, bool)
FUNC2(viewport_set_use_occlusion_culling, RID, bool)
FUNC1(viewport_set_occlusion_rays_per_thread, int)
FUNC1(viewport_set_occlusion_culling_build_quality, ViewportOcclusionCullingBuildQuality)
FUNC2(viewport_set_lod_threshold, RID, float)
FUNC2R(int, viewport_get_render_info, RID, ViewportRenderInfo)

View file

@ -1594,7 +1594,7 @@ void RenderingServer::_bind_methods() {
ClassDB::bind_method(D_METHOD("gi_probe_set_compress", "probe", "enable"), &RenderingServer::gi_probe_set_compress);
ClassDB::bind_method(D_METHOD("gi_probe_is_compressed", "probe"), &RenderingServer::gi_probe_is_compressed);
#endif
/*
/*
ClassDB::bind_method(D_METHOD("lightmap_create()"), &RenderingServer::lightmap_capture_create);
ClassDB::bind_method(D_METHOD("lightmap_capture_set_bounds", "capture", "bounds"), &RenderingServer::lightmap_capture_set_bounds);
ClassDB::bind_method(D_METHOD("lightmap_capture_get_bounds", "capture"), &RenderingServer::lightmap_capture_get_bounds);
@ -1607,6 +1607,10 @@ void RenderingServer::_bind_methods() {
ClassDB::bind_method(D_METHOD("lightmap_capture_set_energy", "capture", "energy"), &RenderingServer::lightmap_capture_set_energy);
ClassDB::bind_method(D_METHOD("lightmap_capture_get_energy", "capture"), &RenderingServer::lightmap_capture_get_energy);
*/
ClassDB::bind_method(D_METHOD("occluder_create"), &RenderingServer::occluder_create);
ClassDB::bind_method(D_METHOD("occluder_set_mesh"), &RenderingServer::occluder_set_mesh);
#endif
ClassDB::bind_method(D_METHOD("particles_create"), &RenderingServer::particles_create);
ClassDB::bind_method(D_METHOD("particles_set_emitting", "particles", "emitting"), &RenderingServer::particles_set_emitting);
@ -1667,6 +1671,9 @@ void RenderingServer::_bind_methods() {
ClassDB::bind_method(D_METHOD("viewport_set_shadow_atlas_quadrant_subdivision", "viewport", "quadrant", "subdivision"), &RenderingServer::viewport_set_shadow_atlas_quadrant_subdivision);
ClassDB::bind_method(D_METHOD("viewport_set_msaa", "viewport", "msaa"), &RenderingServer::viewport_set_msaa);
ClassDB::bind_method(D_METHOD("viewport_set_use_debanding", "viewport", "enable"), &RenderingServer::viewport_set_use_debanding);
ClassDB::bind_method(D_METHOD("viewport_set_use_occlusion_culling", "viewport", "enable"), &RenderingServer::viewport_set_use_occlusion_culling);
ClassDB::bind_method(D_METHOD("viewport_set_occlusion_rays_per_thread", "rays_per_thread"), &RenderingServer::viewport_set_occlusion_rays_per_thread);
ClassDB::bind_method(D_METHOD("viewport_set_occlusion_culling_build_quality", "quality"), &RenderingServer::viewport_set_occlusion_culling_build_quality);
ClassDB::bind_method(D_METHOD("viewport_get_render_info", "viewport", "info"), &RenderingServer::viewport_get_render_info);
ClassDB::bind_method(D_METHOD("viewport_set_debug_draw", "viewport", "draw"), &RenderingServer::viewport_set_debug_draw);
@ -1694,6 +1701,7 @@ void RenderingServer::_bind_methods() {
ClassDB::bind_method(D_METHOD("scenario_create"), &RenderingServer::scenario_create);
ClassDB::bind_method(D_METHOD("scenario_set_debug", "scenario", "debug_mode"), &RenderingServer::scenario_set_debug);
ClassDB::bind_method(D_METHOD("scenario_set_environment", "scenario", "environment"), &RenderingServer::scenario_set_environment);
ClassDB::bind_method(D_METHOD("scenario_set_camera_effects", "scenario", "effects"), &RenderingServer::scenario_set_camera_effects);
ClassDB::bind_method(D_METHOD("scenario_set_fallback_environment", "scenario", "environment"), &RenderingServer::scenario_set_fallback_environment);
#ifndef _3D_DISABLED
@ -2024,6 +2032,7 @@ void RenderingServer::_bind_methods() {
BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_SDFGI);
BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_SDFGI_PROBES);
BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_GI_BUFFER);
BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_OCCLUDERS);
BIND_ENUM_CONSTANT(SKY_MODE_QUALITY);
BIND_ENUM_CONSTANT(SKY_MODE_REALTIME);
@ -2093,6 +2102,10 @@ void RenderingServer::_bind_methods() {
BIND_ENUM_CONSTANT(SCENARIO_DEBUG_OVERDRAW);
BIND_ENUM_CONSTANT(SCENARIO_DEBUG_SHADELESS);
BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW);
BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM);
BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH);
BIND_ENUM_CONSTANT(INSTANCE_NONE);
BIND_ENUM_CONSTANT(INSTANCE_MESH);
BIND_ENUM_CONSTANT(INSTANCE_MULTIMESH);
@ -2104,12 +2117,14 @@ void RenderingServer::_bind_methods() {
BIND_ENUM_CONSTANT(INSTANCE_DECAL);
BIND_ENUM_CONSTANT(INSTANCE_GI_PROBE);
BIND_ENUM_CONSTANT(INSTANCE_LIGHTMAP);
BIND_ENUM_CONSTANT(INSTANCE_OCCLUDER);
BIND_ENUM_CONSTANT(INSTANCE_MAX);
BIND_ENUM_CONSTANT(INSTANCE_GEOMETRY_MASK);
BIND_ENUM_CONSTANT(INSTANCE_FLAG_USE_BAKED_LIGHT);
BIND_ENUM_CONSTANT(INSTANCE_FLAG_USE_DYNAMIC_GI);
BIND_ENUM_CONSTANT(INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE);
BIND_ENUM_CONSTANT(INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING);
BIND_ENUM_CONSTANT(INSTANCE_FLAG_MAX);
BIND_ENUM_CONSTANT(SHADOW_CASTING_SETTING_OFF);
@ -2340,6 +2355,10 @@ RenderingServer::RenderingServer() {
ProjectSettings::get_singleton()->set_custom_property_info("rendering/anti_aliasing/screen_space_roughness_limiter/amount", PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/amount", PROPERTY_HINT_RANGE, "0.01,4.0,0.01"));
ProjectSettings::get_singleton()->set_custom_property_info("rendering/anti_aliasing/screen_space_roughness_limiter/limit", PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/limit", PROPERTY_HINT_RANGE, "0.01,1.0,0.01"));
GLOBAL_DEF_RST("rendering/occlusion_culling/occlusion_rays_per_thread", 512);
GLOBAL_DEF_RST("rendering/occlusion_culling/bvh_build_quality", 2);
ProjectSettings::get_singleton()->set_custom_property_info("rendering/occlusion_culling/bvh_build_quality", PropertyInfo(Variant::INT, "rendering/occlusion_culling/bvh_build_quality", PROPERTY_HINT_ENUM, "Low,Medium,High"));
GLOBAL_DEF("rendering/environment/glow/upscale_mode", 1);
ProjectSettings::get_singleton()->set_custom_property_info("rendering/environment/glow/upscale_mode", PropertyInfo(Variant::INT, "rendering/environment/glow/upscale_mode", PROPERTY_HINT_ENUM, "Linear (Fast),Bicubic (Slow)"));
GLOBAL_DEF("rendering/environment/glow/upscale_mode.mobile", 0);

View file

@ -713,6 +713,11 @@ public:
virtual void camera_set_camera_effects(RID p_camera, RID p_camera_effects) = 0;
virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable) = 0;
/* OCCLUDER API */
virtual RID occluder_create() = 0;
virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) = 0;
/* VIEWPORT TARGET API */
enum CanvasItemTextureFilter {
@ -826,6 +831,17 @@ public:
virtual void viewport_set_lod_threshold(RID p_viewport, float p_pixels) = 0;
virtual void viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling) = 0;
virtual void viewport_set_occlusion_rays_per_thread(int p_rays_per_thread) = 0;
enum ViewportOcclusionCullingBuildQuality {
VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW = 0,
VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM = 1,
VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH = 2,
};
virtual void viewport_set_occlusion_culling_build_quality(ViewportOcclusionCullingBuildQuality p_quality) = 0;
enum ViewportRenderInfo {
VIEWPORT_RENDER_INFO_OBJECTS_IN_FRAME,
VIEWPORT_RENDER_INFO_VERTICES_IN_FRAME,
@ -862,6 +878,7 @@ public:
VIEWPORT_DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
VIEWPORT_DEBUG_DRAW_CLUSTER_DECALS,
VIEWPORT_DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
VIEWPORT_DEBUG_DRAW_OCCLUDERS,
};
virtual void viewport_set_debug_draw(RID p_viewport, ViewportDebugDraw p_draw) = 0;
@ -1109,6 +1126,7 @@ public:
INSTANCE_DECAL,
INSTANCE_GI_PROBE,
INSTANCE_LIGHTMAP,
INSTANCE_OCCLUDER,
INSTANCE_MAX,
INSTANCE_GEOMETRY_MASK = (1 << INSTANCE_MESH) | (1 << INSTANCE_MULTIMESH) | (1 << INSTANCE_IMMEDIATE) | (1 << INSTANCE_PARTICLES)
@ -1147,6 +1165,7 @@ public:
INSTANCE_FLAG_USE_BAKED_LIGHT,
INSTANCE_FLAG_USE_DYNAMIC_GI,
INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE,
INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING,
INSTANCE_FLAG_MAX
};
@ -1505,6 +1524,7 @@ VARIANT_ENUM_CAST(RenderingServer::ViewportMSAA);
VARIANT_ENUM_CAST(RenderingServer::ViewportScreenSpaceAA);
VARIANT_ENUM_CAST(RenderingServer::ViewportRenderInfo);
VARIANT_ENUM_CAST(RenderingServer::ViewportDebugDraw);
VARIANT_ENUM_CAST(RenderingServer::ViewportOcclusionCullingBuildQuality);
VARIANT_ENUM_CAST(RenderingServer::SkyMode);
VARIANT_ENUM_CAST(RenderingServer::EnvironmentBG);
VARIANT_ENUM_CAST(RenderingServer::EnvironmentAmbientSource);
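A minimal sketch of driving the new per-viewport occlusion settings through the server API declared above; the viewport RID is assumed to come from an existing Viewport:
RenderingServer *rs = RenderingServer::get_singleton();
rs->viewport_set_use_occlusion_culling(viewport, true);
rs->viewport_set_occlusion_rays_per_thread(512); // matches the project setting default
rs->viewport_set_occlusion_culling_build_quality(RenderingServer::VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM);
// Optional: visualize the hierarchical depth buffer that drives the culling.
rs->viewport_set_debug_draw(viewport, RenderingServer::VIEWPORT_DEBUG_DRAW_OCCLUDERS);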

thirdparty/README.md
View file

@ -62,6 +62,26 @@ Files extracted from upstream source:
Extracted from .zip provided. Extracted license and header only.
## embree-aarch64
- Upstream: https://github.com/lighttransport/embree-aarch64
- Version: 3.12.1 (6ef362f99af80c9dfe8dd2bfc582d9067897edc6, 2020)
- License: Apache 2.0
Files extracted from upstream:
- All cpp files listed in `modules/raycast/godot_update_embree.py`
- All header files in the directories listed in `modules/raycast/godot_update_embree.py`
The `modules/raycast/godot_update_embree.py` script can be used to pull the
relevant files from the latest Embree-aarch64 release and apply some automatic changes.
Some changes have been made in order to remove exceptions and fix minor build errors.
They are marked with `// -- GODOT start --` and `// -- GODOT end --`
comments. Apply the patches in the `patches/` folder when syncing on newer upstream
commits.
## enet
- Upstream: http://enet.bespin.org

View file

@ -0,0 +1,55 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <functional>
#include "parallel_reduce.h"
namespace embree
{
template<typename Index, class UnaryPredicate>
__forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
{
bool ret = false;
#if defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
if (context.is_group_execution_cancelled()) return;
for (size_t i = r.begin(); i != r.end(); ++i) {
if (pred(i)) {
ret = true;
context.cancel_group_execution();
}
}
});
#else
tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
if (tbb::task::self().is_cancelled()) return;
for (size_t i = r.begin(); i != r.end(); ++i) {
if (pred(i)) {
ret = true;
tbb::task::self().cancel_group_execution();
}
}
});
#endif
#else
ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
bool localret = false;
for (auto i=r.begin(); i<r.end(); ++i) {
localret |= pred(i);
}
return localret;
},
std::bit_or<bool>()
);
#endif
return ret;
}
} // end namespace
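A short usage sketch, assuming the embree tasking headers are on the include path: the helper scans an index range in parallel and cancels outstanding work as soon as the predicate holds for any element.
#include <cstddef>
#include <vector>
// #include "parallel_any_of.h" // resolved inside the embree source tree
static bool contains_negative(const std::vector<int> &values) {
    return embree::parallel_any_of(size_t(0), values.size(), [&](size_t i) {
        return values[i] < 0;
    });
}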

View file

@ -0,0 +1,56 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_filter.h"
#include "../sys/regression.h"
#include <map>
namespace embree
{
struct parallel_filter_regression_test : public RegressionTest
{
parallel_filter_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; };
for (size_t N=10; N<1000000; N=size_t(2.1*N))
{
size_t N0 = rand() % N;
/* initialize array with random numbers */
std::vector<uint32_t> src(N);
std::map<uint32_t,int> m;
for (size_t i=0; i<N; i++) src[i] = rand();
/* count elements up */
for (size_t i=N0; i<N; i++)
if (pred(src[i]))
m[src[i]] = 0;
for (size_t i=N0; i<N; i++)
if (pred(src[i]))
m[src[i]]++;
/* filter array */
//size_t M = sequential_filter(src.data(),N0,N,pred);
size_t M = parallel_filter(src.data(),N0,N,size_t(1024),pred);
/* check if filtered data is correct */
for (size_t i=N0; i<M; i++) {
passed &= pred(src[i]);
m[src[i]]--;
}
for (size_t i=N0; i<M; i++)
passed &= (m[src[i]] == 0);
}
return passed;
}
};
parallel_filter_regression_test parallel_filter_regression("parallel_filter_regression");
}

View file

@ -0,0 +1,93 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename Ty, typename Index, typename Predicate>
inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
{
Index j = first;
for (Index i=first; i<last; i++)
if (predicate(data[i]))
data[j++] = data[i];
return j;
}
template<typename Ty, typename Index, typename Predicate>
inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
{
/* sequential fallback */
if (end-begin <= minStepSize)
return sequential_filter(data,begin,end,predicate);
/* calculate number of tasks to use */
enum { MAX_TASKS = 64 };
const Index numThreads = TaskScheduler::threadCount();
const Index numBlocks = (end-begin+minStepSize-1)/minStepSize;
const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS);
/* filter blocks */
Index nused[MAX_TASKS];
Index nfree[MAX_TASKS];
parallel_for(taskCount, [&](const Index taskIndex)
{
const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
const Index i2 = sequential_filter(data,i0,i1,predicate);
nused[taskIndex] = i2-i0;
nfree[taskIndex] = i1-i2;
});
/* calculate offsets */
Index sused=0;
Index sfree=0;
Index pfree[MAX_TASKS];
for (Index i=0; i<taskCount; i++)
{
sused+=nused[i];
Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
}
/* return if we did not filter out any element */
assert(sfree <= end-begin);
assert(sused <= end-begin);
if (sused == end-begin)
return end;
/* otherwise we have to copy misplaced elements around */
parallel_for(taskCount, [&](const Index taskIndex)
{
/* destination to write elements to */
Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
Index dst_end = min(dst+nfree[taskIndex],begin+sused);
if (dst_end <= dst) return;
/* range of misplaced elements to copy to destination */
Index r0 = pfree[taskIndex];
Index r1 = r0+dst_end-dst;
/* find range in misplaced elements in back to front order */
Index k0=0;
for (Index i=taskCount-1; i>0; i--)
{
if (k0 > r1) break;
Index k1 = k0+nused[i];
Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
for (Index i=max(r0,k0); i<min(r1,k1); i++) {
Index isrc = src-i+k0-1;
assert(dst >= begin && dst < end);
assert(isrc >= begin && isrc < end);
data[dst++] = data[isrc];
}
k0 = k1;
}
});
return begin+sused;
}
}
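A short usage sketch along the same lines; note that the compaction keeps membership but not necessarily the original element order, which is also all the regression test above verifies.
#include <cstddef>
#include <cstdint>
#include <vector>
// #include "parallel_filter.h" // resolved inside the embree source tree
static size_t keep_even(std::vector<uint32_t> &v) {
    // Compacts the elements matching the predicate to the front and returns
    // the new logical size; ranges below 1024 elements use the sequential path.
    return embree::parallel_filter(v.data(), size_t(0), v.size(), size_t(1024),
            [](uint32_t x) { return (x & 1u) == 0u; });
}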

View file

@ -0,0 +1,48 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_for.h"
#include "../sys/regression.h"
namespace embree
{
struct parallel_for_regression_test : public RegressionTest
{
parallel_for_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
const size_t M = 10;
for (size_t N=10; N<10000000; N=size_t(2.1*N))
{
/* sequentially calculate sum of squares */
size_t sum0 = 0;
for (size_t i=0; i<N; i++) {
sum0 += i*i;
}
/* parallel calculation of sum of squares */
for (size_t m=0; m<M; m++)
{
std::atomic<size_t> sum1(0);
parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range<size_t>& r)
{
size_t s = 0;
for (size_t i=r.begin(); i<r.end(); i++)
s += i*i;
sum1 += s;
});
passed = sum0 == sum1;
}
}
return passed;
}
};
parallel_for_regression_test parallel_for_regression("parallel_for_regression_test");
}

View file

@ -0,0 +1,229 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../tasking/taskscheduler.h"
#include "../sys/array.h"
#include "../math/math.h"
#include "../math/range.h"
#if defined(TASKING_GCD) && defined(BUILD_IOS)
#include <dispatch/dispatch.h>
#include <algorithm>
#include <type_traits>
#endif
namespace embree
{
/* parallel_for without range */
template<typename Index, typename Func>
__forceinline void parallel_for( const Index N, const Func& func)
{
#if defined(TASKING_INTERNAL)
if (N) {
TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
assert(r.size() == 1);
func(r.begin());
});
if (!TaskScheduler::wait())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
}
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1;
const size_t length = N;
const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks;
const size_t numBlocks = (length + blockSize-1) / blockSize;
dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
const size_t start = (currentBlock * blockSize);
const size_t blockLength = std::min(length - start, blockSize);
const size_t end = start + blockLength;
for(size_t i=start; i < end; i++)
{
func(i);
}
});
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},context);
if (context.is_group_execution_cancelled())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
});
if (tbb::task::self().is_cancelled())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#endif
#elif defined(TASKING_PPL)
concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
});
#else
# error "no tasking system enabled"
#endif
}
  /* parallel for with range and granularity */
template<typename Index, typename Func>
__forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
{
assert(first <= last);
#if defined(TASKING_INTERNAL)
TaskScheduler::spawn(first,last,minStepSize,func);
if (!TaskScheduler::wait())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#elif defined(TASKING_GCD) && defined(BUILD_IOS)
const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1;
const size_t length = last - first;
const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks;
size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads);
blockSize += blockSize % 4;
const size_t numBlocks = (length + blockSize-1) / blockSize;
dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
const size_t start = first + (currentBlock * blockSize);
const size_t end = std::min<size_t>(last, start + blockSize);
func( embree::range<Index>(start,end) );
});
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
func(range<Index>(r.begin(),r.end()));
},context);
if (context.is_group_execution_cancelled())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#else
tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
func(range<Index>(r.begin(),r.end()));
});
if (tbb::task::self().is_cancelled())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#endif
#elif defined(TASKING_PPL)
concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) {
func(range<Index>(i,i+1));
});
#else
# error "no tasking system enabled"
#endif
}
/* parallel for with range */
template<typename Index, typename Func>
__forceinline void parallel_for( const Index first, const Index last, const Func& func)
{
assert(first <= last);
parallel_for(first,last,(Index)1,func);
}
#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
template<typename Index, typename Func>
__forceinline void parallel_for_static( const Index N, const Func& func)
{
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},tbb::simple_partitioner(),context);
if (context.is_group_execution_cancelled())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},tbb::simple_partitioner());
if (tbb::task::self().is_cancelled())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#endif
}
typedef tbb::affinity_partitioner affinity_partitioner;
template<typename Index, typename Func>
__forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
{
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},ap,context);
if (context.is_group_execution_cancelled())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#else
tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
func(i);
},ap);
if (tbb::task::self().is_cancelled())
// -- GODOT start --
// throw std::runtime_error("task cancelled");
abort();
// -- GODOT end --
#endif
}
#else
template<typename Index, typename Func>
__forceinline void parallel_for_static( const Index N, const Func& func)
{
parallel_for(N,func);
}
struct affinity_partitioner {
};
template<typename Index, typename Func>
__forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap)
{
parallel_for(N,func);
}
#endif
}
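A minimal usage sketch of the two parallel_for overloads above: the index form invokes the functor once per index, the blocked-range form hands it chunks of at least minStepSize indices. The include path and an initialized tasking backend are assumptions.
#include "parallel_for.h"   // header shown above; adjust the include path to the source layout
#include <atomic>
#include <cstdio>
int main() {
  std::atomic<size_t> sum(0);
  // index form: called once per i in [0,1000)
  embree::parallel_for(size_t(1000), [&](size_t i) { sum += i; });
  // range form: called on blocks of at least 1024 indices
  embree::parallel_for(size_t(0), size_t(1000000), size_t(1024), [&](const embree::range<size_t>& r) {
    size_t local = 0;
    for (size_t i = r.begin(); i < r.end(); i++) local += i;
    sum += local;   // one atomic update per block
  });
  std::printf("sum = %zu\n", (size_t)sum);
  return 0;
}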

View file

@ -0,0 +1,63 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_for_for.h"
#include "../sys/regression.h"
namespace embree
{
struct parallel_for_for_regression_test : public RegressionTest
{
parallel_for_for_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
/* create vector with random numbers */
size_t sum0 = 0;
size_t K = 0;
const size_t M = 1000;
std::vector<std::vector<size_t>* > array2(M);
for (size_t i=0; i<M; i++) {
const size_t N = rand() % 1024;
K+=N;
array2[i] = new std::vector<size_t>(N);
for (size_t j=0; j<N; j++)
sum0 += (*array2[i])[j] = rand();
}
/* array to test global index */
std::vector<atomic<size_t>> verify_k(K);
for (size_t i=0; i<K; i++) verify_k[i].store(0);
/* add all numbers using parallel_for_for */
std::atomic<size_t> sum1(0);
parallel_for_for( array2, size_t(1), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k) -> size_t
{
size_t s = 0;
for (size_t i=r.begin(); i<r.end(); i++) {
s += (*v)[i];
verify_k[k++]++;
}
sum1 += s;
return sum1;
});
passed &= (sum0 == sum1);
/* check global index */
for (size_t i=0; i<K; i++)
passed &= (verify_k[i] == 1);
/* delete vectors again */
for (size_t i=0; i<array2.size(); i++)
delete array2[i];
return passed;
}
};
parallel_for_for_regression_test parallel_for_for_regression("parallel_for_for_regression_test");
}

View file

@ -0,0 +1,149 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename ArrayArray, typename Func>
__forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
{
size_t k=0;
for (size_t i=0; i!=array2.size(); ++i) {
const size_t N = array2[i]->size();
if (N) func(array2[i],range<size_t>(0,N),k);
k+=N;
}
}
class ParallelForForState
{
public:
enum { MAX_TASKS = 64 };
__forceinline ParallelForForState ()
: taskCount(0) {}
template<typename ArrayArray>
__forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
init(array2,minStepSize);
}
template<typename ArrayArray>
__forceinline void init ( ArrayArray& array2, const size_t minStepSize )
{
/* first calculate total number of elements */
size_t N = 0;
for (size_t i=0; i<array2.size(); i++) {
N += array2[i] ? array2[i]->size() : 0;
}
this->N = N;
/* calculate number of tasks to use */
const size_t numThreads = TaskScheduler::threadCount();
const size_t numBlocks = (N+minStepSize-1)/minStepSize;
taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
/* calculate start (i,j) for each task */
size_t taskIndex = 0;
i0[taskIndex] = 0;
j0[taskIndex] = 0;
size_t k0 = (++taskIndex)*N/taskCount;
for (size_t i=0, k=0; taskIndex < taskCount; i++)
{
assert(i<array2.size());
size_t j=0, M = array2[i] ? array2[i]->size() : 0;
while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
assert(taskIndex<taskCount);
i0[taskIndex] = i;
j0[taskIndex] = j += k0-k;
k=k0;
k0 = (++taskIndex)*N/taskCount;
}
k+=M-j;
}
}
__forceinline size_t size() const {
return N;
}
public:
size_t i0[MAX_TASKS];
size_t j0[MAX_TASKS];
size_t taskCount;
size_t N;
};
template<typename ArrayArray, typename Func>
__forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
{
ParallelForForState state(array2,minStepSize);
parallel_for(state.taskCount, [&](const size_t taskIndex)
{
/* calculate range */
const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
size_t i0 = state.i0[taskIndex];
size_t j0 = state.j0[taskIndex];
/* iterate over arrays */
size_t k=k0;
for (size_t i=i0; k<k1; i++) {
const size_t N = array2[i] ? array2[i]->size() : 0;
const size_t r0 = j0, r1 = min(N,r0+k1-k);
if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
k+=r1-r0; j0 = 0;
}
});
}
template<typename ArrayArray, typename Func>
__forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
{
parallel_for_for(array2,1,func);
}
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
ParallelForForState state(array2,minStepSize);
Value temp[ParallelForForState::MAX_TASKS];
for (size_t i=0; i<state.taskCount; i++)
temp[i] = identity;
parallel_for(state.taskCount, [&](const size_t taskIndex)
{
/* calculate range */
const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
size_t i0 = state.i0[taskIndex];
size_t j0 = state.j0[taskIndex];
/* iterate over arrays */
size_t k=k0;
for (size_t i=i0; k<k1; i++) {
const size_t N = array2[i] ? array2[i]->size() : 0;
const size_t r0 = j0, r1 = min(N,r0+k1-k);
if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
k+=r1-r0; j0 = 0;
}
});
Value ret = identity;
for (size_t i=0; i<state.taskCount; i++)
ret = reduction(ret,temp[i]);
return ret;
}
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
{
return parallel_for_for_reduce(array2,1,identity,func,reduction);
}
}
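A minimal usage sketch of parallel_for_for above, which walks an array of array pointers and hands each task a contiguous range of one inner array together with the global index k of that range's first element. The include path and an initialized tasking backend are assumptions.
#include "parallel_for_for.h"   // header shown above; adjust the include path to the source layout
#include <atomic>
#include <vector>
#include <cstdio>
int main() {
  // build a small array-of-arrays (inner containers are accessed through pointers, as the helpers expect)
  std::vector<std::vector<int>*> array2;
  for (int i = 0; i < 8; i++) array2.push_back(new std::vector<int>(100, i));
  // sum all inner elements
  std::atomic<long> sum(0);
  embree::parallel_for_for(array2, size_t(16), [&](std::vector<int>* v, const embree::range<size_t>& r, size_t k) {
    long s = 0;
    for (size_t i = r.begin(); i < r.end(); i++) s += (*v)[i];
    sum += s;
  });
  std::printf("sum = %ld\n", (long)sum);
  for (auto* v : array2) delete v;
  return 0;
}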

View file

@ -0,0 +1,85 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_for_for_prefix_sum.h"
#include "../sys/regression.h"
namespace embree
{
struct parallel_for_for_prefix_sum_regression_test : public RegressionTest
{
parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
/* create vector with random numbers */
const size_t M = 10;
std::vector<atomic<size_t>> flattened;
typedef std::vector<std::vector<size_t>* > ArrayArray;
ArrayArray array2(M);
size_t K = 0;
for (size_t i=0; i<M; i++) {
const size_t N = rand() % 10;
K += N;
array2[i] = new std::vector<size_t>(N);
for (size_t j=0; j<N; j++)
(*array2[i])[j] = rand() % 10;
}
/* array to test global index */
std::vector<atomic<size_t>> verify_k(K);
for (size_t i=0; i<K; i++) verify_k[i].store(0);
ParallelForForPrefixSumState<size_t> state(array2,size_t(1));
/* dry run only counts */
size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i) -> size_t
{
size_t s = 0;
for (size_t i=r.begin(); i<r.end(); i++) {
s += (*v)[i];
verify_k[k++]++;
}
return s;
}, [](size_t v0, size_t v1) { return v0+v1; });
/* create properly sized output array */
flattened.resize(S);
for (auto& a : flattened) a.store(0);
/* now we actually fill the flattened array */
parallel_for_for_prefix_sum1( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i, const size_t base) -> size_t
{
size_t s = 0;
for (size_t i=r.begin(); i<r.end(); i++) {
for (size_t j=0; j<(*v)[i]; j++) {
flattened[base+s+j]++;
}
s += (*v)[i];
verify_k[k++]++;
}
return s;
}, [](size_t v0, size_t v1) { return v0+v1; });
/* check global index */
for (size_t i=0; i<K; i++)
passed &= (verify_k[i] == 2);
/* check if each element was assigned exactly once */
for (size_t i=0; i<flattened.size(); i++)
passed &= (flattened[i] == 1);
/* delete arrays again */
for (size_t i=0; i<array2.size(); i++)
delete array2[i];
return passed;
}
};
parallel_for_for_prefix_sum_regression_test parallel_for_for_prefix_sum_regression("parallel_for_for_prefix_sum_regression_test");
}

View file

@ -0,0 +1,112 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for_for.h"
#include "parallel_prefix_sum.h"
namespace embree
{
template<typename Value>
struct ParallelForForPrefixSumState : public ParallelForForState
{
__forceinline ParallelForForPrefixSumState () {}
template<typename ArrayArray>
__forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
: ParallelForForState(array2,minStepSize) {}
ParallelPrefixSumState<Value> prefix_state;
};
template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
const Value& identity, const Func& func, const Reduction& reduction)
{
/* calculate number of tasks to use */
const size_t taskCount = state.taskCount;
/* perform parallel prefix sum */
parallel_for(taskCount, [&](const size_t taskIndex)
{
const size_t k0 = (taskIndex+0)*state.size()/taskCount;
const size_t k1 = (taskIndex+1)*state.size()/taskCount;
size_t i0 = state.i0[taskIndex];
size_t j0 = state.j0[taskIndex];
/* iterate over arrays */
size_t k=k0;
Value N=identity;
for (size_t i=i0; k<k1; i++) {
const size_t size = array2[i] ? array2[i]->size() : 0;
const size_t r0 = j0, r1 = min(size,r0+k1-k);
if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i));
k+=r1-r0; j0 = 0;
}
state.prefix_state.counts[taskIndex] = N;
});
/* calculate prefix sum */
Value sum=identity;
for (size_t i=0; i<taskCount; i++)
{
const Value c = state.prefix_state.counts[i];
state.prefix_state.sums[i] = sum;
sum=reduction(sum,c);
}
return sum;
}
template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
const Value& identity, const Func& func, const Reduction& reduction)
{
/* calculate number of tasks to use */
const size_t taskCount = state.taskCount;
/* perform parallel prefix sum */
parallel_for(taskCount, [&](const size_t taskIndex)
{
const size_t k0 = (taskIndex+0)*state.size()/taskCount;
const size_t k1 = (taskIndex+1)*state.size()/taskCount;
size_t i0 = state.i0[taskIndex];
size_t j0 = state.j0[taskIndex];
/* iterate over arrays */
size_t k=k0;
Value N=identity;
for (size_t i=i0; k<k1; i++) {
const size_t size = array2[i] ? array2[i]->size() : 0;
const size_t r0 = j0, r1 = min(size,r0+k1-k);
if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
k+=r1-r0; j0 = 0;
}
state.prefix_state.counts[taskIndex] = N;
});
/* calculate prefix sum */
Value sum=identity;
for (size_t i=0; i<taskCount; i++)
{
const Value c = state.prefix_state.counts[i];
state.prefix_state.sums[i] = sum;
sum=reduction(sum,c);
}
return sum;
}
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
const Value& identity, const Func& func, const Reduction& reduction)
{
return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
}
template<typename ArrayArray, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
const Value& identity, const Func& func, const Reduction& reduction)
{
return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
}
}
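A minimal sketch of the two-pass pattern these helpers support, mirroring the regression test earlier in the diff: pass 0 only counts output elements, pass 1 repeats the same traversal and additionally receives each block's global output offset. The include path and an initialized tasking backend are assumptions.
#include "parallel_for_for_prefix_sum.h"   // header shown above; adjust the include path to the source layout
#include <functional>
#include <vector>
#include <cstdio>
int main() {
  // nested input: each inner element says how many output slots it needs
  std::vector<std::vector<int>*> counts;
  for (int i = 0; i < 4; i++) counts.push_back(new std::vector<int>(10, i + 1));
  embree::ParallelForForPrefixSumState<size_t> state(counts, size_t(4));
  // pass 0: count only; returns the total number of output slots
  const size_t total = embree::parallel_for_for_prefix_sum0(state, counts, size_t(0),
    [&](std::vector<int>* v, const embree::range<size_t>& r, size_t /*k*/, size_t /*i*/) -> size_t {
      size_t s = 0;
      for (size_t j = r.begin(); j < r.end(); j++) s += size_t((*v)[j]);
      return s;
    }, std::plus<size_t>());
  // pass 1: same traversal, but each block also gets its global output offset 'base'
  std::vector<int> out(total, 0);
  embree::parallel_for_for_prefix_sum1(state, counts, size_t(0),
    [&](std::vector<int>* v, const embree::range<size_t>& r, size_t /*k*/, size_t /*i*/, size_t base) -> size_t {
      size_t s = 0;
      for (size_t j = r.begin(); j < r.end(); j++) {
        for (int n = 0; n < (*v)[j]; n++) out[base + s + size_t(n)] = 1;   // write into the reserved slots
        s += size_t((*v)[j]);
      }
      return s;
    }, std::plus<size_t>());
  std::printf("emitted %zu elements\n", total);
  for (auto* v : counts) delete v;
  return 0;
}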

View file

@ -0,0 +1,47 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_map.h"
#include "../sys/regression.h"
namespace embree
{
struct parallel_map_regression_test : public RegressionTest
{
parallel_map_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
/* create key/value vectors with random numbers */
const size_t N = 10000;
std::vector<uint32_t> keys(N);
std::vector<uint32_t> vals(N);
for (size_t i=0; i<N; i++) keys[i] = 2*unsigned(i)*647382649;
for (size_t i=0; i<N; i++) std::swap(keys[i],keys[rand()%N]);
for (size_t i=0; i<N; i++) vals[i] = 2*rand();
/* create map */
parallel_map<uint32_t,uint32_t> map;
map.init(keys,vals);
/* check that all keys are properly mapped */
for (size_t i=0; i<N; i++) {
const uint32_t* val = map.lookup(keys[i]);
passed &= val && (*val == vals[i]);
}
/* check that these keys are not in the map */
for (size_t i=0; i<N; i++) {
passed &= !map.lookup(keys[i]+1);
}
return passed;
}
};
parallel_map_regression_test parallel_map_regression("parallel_map_regression_test");
}

View file

@ -0,0 +1,85 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_sort.h"
namespace embree
{
/*! implementation of a key/value map with parallel construction */
template<typename Key, typename Val>
class parallel_map
{
/* key/value pair to build the map */
struct KeyValue
{
__forceinline KeyValue () {}
__forceinline KeyValue (const Key key, const Val val)
: key(key), val(val) {}
__forceinline operator Key() const {
return key;
}
public:
Key key;
Val val;
};
public:
/*! parallel map constructors */
parallel_map () {}
/*! construction from pair of vectors */
template<typename KeyVector, typename ValVector>
parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
    /*! initializes the parallel map from vectors of keys and values */
template<typename KeyVector, typename ValVector>
void init(const KeyVector& keys, const ValVector& values)
{
/* reserve sufficient space for all data */
assert(keys.size() == values.size());
vec.resize(keys.size());
/* generate key/value pairs */
parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
for (size_t i=r.begin(); i<r.end(); i++)
vec[i] = KeyValue((Key)keys[i],values[i]);
});
/* perform parallel radix sort of the key/value pairs */
std::vector<KeyValue> temp(keys.size());
radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
}
    /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr if the key is not contained in the map. */
__forceinline const Val* lookup(const Key& key) const
{
typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
if (i == vec.end()) return nullptr;
if (i->key != key) return nullptr;
return &i->val;
}
/*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
__forceinline Val lookup(const Key& key, const Val& def) const
{
typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
if (i == vec.end()) return def;
if (i->key != key) return def;
return i->val;
}
/*! clears all state */
void clear() {
vec.clear();
}
private:
std::vector<KeyValue> vec; //!< vector containing sorted elements
};
}
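A minimal usage sketch of parallel_map above: keys and values are packed into key/value pairs, radix-sorted at init time, and looked up by binary search. The include path and an initialized tasking backend are assumptions.
#include "parallel_map.h"   // header shown above; adjust the include path to the source layout
#include <cstdint>
#include <vector>
#include <cstdio>
int main() {
  std::vector<uint32_t> keys = { 5, 1, 9, 3 };
  std::vector<uint32_t> vals = { 50, 10, 90, 30 };
  embree::parallel_map<uint32_t, uint32_t> map;
  map.init(keys, vals);
  // lookup returns a pointer (nullptr if the key is missing) or a caller-supplied default
  const uint32_t* p = map.lookup(9u);
  std::printf("9 -> %u, 7 -> %u\n", p ? *p : 0u, map.lookup(7u, 0u));
  return 0;
}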

View file

@ -0,0 +1,53 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_partition.h"
#include "../sys/regression.h"
namespace embree
{
struct parallel_partition_regression_test : public RegressionTest
{
parallel_partition_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
for (size_t i=0; i<100; i++)
{
/* create random permutation */
size_t N = std::rand() % 1000000;
std::vector<unsigned> array(N);
for (unsigned i=0; i<N; i++) array[i] = i;
for (auto& v : array) std::swap(v,array[std::rand()%array.size()]);
size_t split = std::rand() % (N+1);
/* perform parallel partitioning */
size_t left_sum = 0, right_sum = 0;
size_t mid = parallel_partitioning(array.data(),0,array.size(),0,left_sum,right_sum,
[&] ( size_t i ) { return i < split; },
[] ( size_t& sum, unsigned v) { sum += v; },
[] ( size_t& sum, size_t v) { sum += v; },
128);
/*serial_partitioning(array.data(),0,array.size(),left_sum,right_sum,
[&] ( size_t i ) { return i < split; },
[] ( size_t& left_sum, int v) { left_sum += v; });*/
/* verify result */
passed &= mid == split;
passed &= left_sum == split*(split-1)/2;
passed &= right_sum == N*(N-1)/2-left_sum;
for (size_t i=0; i<split; i++) passed &= array[i] < split;
for (size_t i=split; i<N; i++) passed &= array[i] >= split;
}
return passed;
}
};
parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test");
}

View file

@ -0,0 +1,283 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
#include "../math/range.h"
namespace embree
{
/* serial partitioning */
template<typename T, typename V, typename IsLeft, typename Reduction_T>
__forceinline size_t serial_partitioning(T* array,
const size_t begin,
const size_t end,
V& leftReduction,
V& rightReduction,
const IsLeft& is_left,
const Reduction_T& reduction_t)
{
T* l = array + begin;
T* r = array + end - 1;
while(1)
{
/* *l < pivot */
while (likely(l <= r && is_left(*l) ))
{
//prefetchw(l+4); // FIXME: enable?
reduction_t(leftReduction,*l);
++l;
}
      /* *r >= pivot */
while (likely(l <= r && !is_left(*r)))
{
//prefetchw(r-4); FIXME: enable?
reduction_t(rightReduction,*r);
--r;
}
if (r<l) break;
reduction_t(leftReduction ,*r);
reduction_t(rightReduction,*l);
xchg(*l,*r);
l++; r--;
}
return l - array;
}
template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
class __aligned(64) parallel_partition_task
{
ALIGNED_CLASS_(64);
private:
static const size_t MAX_TASKS = 64;
T* array;
size_t N;
const IsLeft& is_left;
const Reduction_T& reduction_t;
const Reduction_V& reduction_v;
const Vi& identity;
size_t numTasks;
__aligned(64) size_t counter_start[MAX_TASKS+1];
__aligned(64) size_t counter_left[MAX_TASKS+1];
__aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
__aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
__aligned(64) V leftReductions[MAX_TASKS];
__aligned(64) V rightReductions[MAX_TASKS];
public:
__forceinline parallel_partition_task(T* array,
const size_t N,
const Vi& identity,
const IsLeft& is_left,
const Reduction_T& reduction_t,
const Reduction_V& reduction_v,
const size_t BLOCK_SIZE)
: array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
__forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
{
size_t i = 0;
while(index >= (size_t)r[i].size())
{
assert(i < numRanges);
index -= (size_t)r[i].size();
i++;
}
return &r[i];
}
__forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
const size_t numRightMisplacedRanges,
const size_t startID,
const size_t endID)
{
size_t leftLocalIndex = startID;
size_t rightLocalIndex = startID;
const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
size_t l_left = l_range->size() - leftLocalIndex;
size_t r_left = r_range->size() - rightLocalIndex;
T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
size_t size = endID - startID;
size_t items = min(size,min(l_left,r_left));
while (size)
{
if (unlikely(l_left == 0))
{
l_range++;
l_left = l_range->size();
l = &array[l_range->begin()];
items = min(size,min(l_left,r_left));
}
if (unlikely(r_left == 0))
{
r_range++;
r_left = r_range->size();
r = &array[r_range->begin()];
items = min(size,min(l_left,r_left));
}
size -= items;
l_left -= items;
r_left -= items;
while(items) {
items--;
xchg(*l++,*r++);
}
}
}
__forceinline size_t partition(V& leftReduction, V& rightReduction)
{
/* partition the individual ranges for each task */
parallel_for(numTasks,[&] (const size_t taskID) {
const size_t startID = (taskID+0)*N/numTasks;
const size_t endID = (taskID+1)*N/numTasks;
V local_left(identity);
V local_right(identity);
const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
counter_start[taskID] = startID;
counter_left [taskID] = mid-startID;
leftReductions[taskID] = local_left;
rightReductions[taskID] = local_right;
});
counter_start[numTasks] = N;
counter_left[numTasks] = 0;
/* finalize the reductions */
for (size_t i=0; i<numTasks; i++) {
reduction_v(leftReduction,leftReductions[i]);
reduction_v(rightReduction,rightReductions[i]);
}
/* calculate mid point for partitioning */
size_t mid = counter_left[0];
for (size_t i=1; i<numTasks; i++)
mid += counter_left[i];
const range<ssize_t> globalLeft (0,mid);
const range<ssize_t> globalRight(mid,N);
/* calculate all left and right ranges that are on the wrong global side */
size_t numMisplacedRangesLeft = 0;
size_t numMisplacedRangesRight = 0;
size_t numMisplacedItemsLeft = 0;
size_t numMisplacedItemsRight = 0;
for (size_t i=0; i<numTasks; i++)
{
const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
const range<ssize_t> left_misplaced = globalLeft. intersect(right_range);
const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
if (!left_misplaced.empty())
{
numMisplacedItemsLeft += left_misplaced.size();
leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
}
if (!right_misplaced.empty())
{
numMisplacedItemsRight += right_misplaced.size();
rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
}
}
assert( numMisplacedItemsLeft == numMisplacedItemsRight );
/* if no items are misplaced we are done */
if (numMisplacedItemsLeft == 0)
return mid;
/* otherwise we copy the items to the right place in parallel */
parallel_for(numTasks,[&] (const size_t taskID) {
const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;
swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
});
return mid;
}
};
template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
__noinline size_t parallel_partitioning(T* array,
const size_t begin,
const size_t end,
const Vi &identity,
V &leftReduction,
V &rightReduction,
const IsLeft& is_left,
const Reduction_T& reduction_t,
const Reduction_V& reduction_v,
size_t BLOCK_SIZE = 128)
{
/* fall back to single threaded partitioning for small N */
if (unlikely(end-begin < BLOCK_SIZE))
return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
/* otherwise use parallel code */
else {
typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
return begin+p->partition(leftReduction,rightReduction);
}
}
template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
__noinline size_t parallel_partitioning(T* array,
const size_t begin,
const size_t end,
const Vi &identity,
V &leftReduction,
V &rightReduction,
const IsLeft& is_left,
const Reduction_T& reduction_t,
const Reduction_V& reduction_v,
size_t BLOCK_SIZE,
size_t PARALLEL_THRESHOLD)
{
/* fall back to single threaded partitioning for small N */
if (unlikely(end-begin < PARALLEL_THRESHOLD))
return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
/* otherwise use parallel code */
else {
typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
return begin+p->partition(leftReduction,rightReduction);
}
}
template<typename T, typename IsLeft>
inline size_t parallel_partitioning(T* array,
const size_t begin,
const size_t end,
const IsLeft& is_left,
size_t BLOCK_SIZE = 128)
{
size_t leftReduction = 0;
size_t rightReduction = 0;
return parallel_partitioning(
array,begin,end,0,leftReduction,rightReduction,is_left,
[] (size_t& t,const T& ref) { },
[] (size_t& t0,size_t& t1) { },
BLOCK_SIZE);
}
}
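A minimal usage sketch of the reduction-free parallel_partitioning overload above, which moves all elements satisfying the predicate in front of the returned split index. The include path and an initialized tasking backend are assumptions.
#include "parallel_partition.h"   // header shown above; adjust the include path to the source layout
#include <vector>
#include <cstdio>
int main() {
  // values 0..999 in scrambled order; partition them around a pivot of 500
  std::vector<unsigned> data(1000);
  for (unsigned i = 0; i < data.size(); i++) data[i] = (i * 7919u) % 1000u;
  // simple overload: no per-side reductions, default BLOCK_SIZE of 128
  const size_t mid = embree::parallel_partitioning(data.data(), size_t(0), data.size(),
                                                   [](const unsigned& v) { return v < 500u; });
  // after the call, data[0..mid) satisfies the predicate and data[mid..end) does not
  std::printf("split index = %zu\n", mid);
  return 0;
}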

View file

@ -0,0 +1,48 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_prefix_sum.h"
#include "../sys/regression.h"
namespace embree
{
struct parallel_prefix_sum_regression_test : public RegressionTest
{
parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
const size_t M = 10;
for (size_t N=10; N<10000000; N=size_t(2.1*N))
{
/* initialize array with random numbers */
uint32_t sum0 = 0;
std::vector<uint32_t> src(N);
for (size_t i=0; i<N; i++) {
sum0 += src[i] = rand();
}
/* calculate parallel prefix sum */
std::vector<uint32_t> dst(N);
for (auto& v : dst) v = 0;
for (size_t i=0; i<M; i++) {
uint32_t sum1 = parallel_prefix_sum(src,dst,N,0,std::plus<uint32_t>());
passed &= (sum0 == sum1);
}
/* check if prefix sum is correct */
for (size_t i=0, sum=0; i<N; sum+=src[i++])
passed &= ((uint32_t)sum == dst[i]);
}
return passed;
}
};
parallel_prefix_sum_regression_test parallel_prefix_sum_regression("parallel_prefix_sum_regression");
}

View file

@ -0,0 +1,85 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename Value>
struct ParallelPrefixSumState
{
enum { MAX_TASKS = 64 };
Value counts[MAX_TASKS];
Value sums [MAX_TASKS];
};
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
{
/* calculate number of tasks to use */
const size_t numThreads = TaskScheduler::threadCount();
const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
/* perform parallel prefix sum */
parallel_for(taskCount, [&](const size_t taskIndex)
{
const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
});
/* calculate prefix sum */
Value sum=identity;
for (size_t i=0; i<taskCount; i++)
{
const Value c = state.counts[i];
state.sums[i] = sum;
sum=reduction(sum,c);
}
return sum;
}
/*! parallel calculation of prefix sums */
template<typename SrcArray, typename DstArray, typename Value, typename Add>
__forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
{
/* perform single threaded prefix operation for small N */
if (N < SINGLE_THREAD_THRESHOLD)
{
Value sum=identity;
for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
return sum;
}
/* perform parallel prefix operation for large N */
else
{
ParallelPrefixSumState<Value> state;
/* initial run just sets up start values for subtasks */
parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
Value s = identity;
for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
return s;
}, add);
/* final run calculates prefix sum */
return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
Value s = identity;
for (size_t i=r.begin(); i<r.end(); i++) {
dst[i] = add(sum,s);
s = add(s,src[i]);
}
return s;
}, add);
}
}
}
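A minimal usage sketch of the array form of parallel_prefix_sum above, which writes the exclusive prefix sum into dst and returns the total. The include path and an initialized tasking backend are assumptions.
#include "parallel_prefix_sum.h"   // header shown above; adjust the include path to the source layout
#include <cstdint>
#include <functional>
#include <vector>
#include <cstdio>
int main() {
  // dst[i] receives the sum of src[0..i); the grand total is returned
  std::vector<uint32_t> src(100000, 1u);
  std::vector<uint32_t> dst(src.size(), 0u);
  const uint32_t total = embree::parallel_prefix_sum(src, dst, src.size(), 0u, std::plus<uint32_t>());
  std::printf("total = %u, dst[10] = %u\n", total, dst[10]);   // expect 100000 and 10
  return 0;
}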

View file

@ -0,0 +1,49 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_reduce.h"
#include "../sys/regression.h"
namespace embree
{
struct parallel_reduce_regression_test : public RegressionTest
{
parallel_reduce_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
const size_t M = 10;
for (size_t N=10; N<10000000; N=size_t(2.1*N))
{
/* sequentially calculate sum of squares */
size_t sum0 = 0;
for (size_t i=0; i<N; i++) {
sum0 += i*i;
}
/* parallel calculation of sum of squares */
for (size_t m=0; m<M; m++)
{
size_t sum1 = parallel_reduce( size_t(0), size_t(N), size_t(1024), size_t(0), [&](const range<size_t>& r) -> size_t
{
size_t s = 0;
for (size_t i=r.begin(); i<r.end(); i++)
s += i*i;
return s;
},
[](const size_t v0, const size_t v1) {
return v0+v1;
});
passed = sum0 == sum1;
}
}
return passed;
}
};
parallel_reduce_regression_test parallel_reduce_regression("parallel_reduce_regression_test");
}

View file

@ -0,0 +1,150 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
{
return func(range<Index>(first,last));
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
return func(range<Index>(first,last));
}
template<typename Index, typename Value, typename Func, typename Reduction>
__noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
const Index maxTasks = 512;
const Index threadCount = (Index) TaskScheduler::threadCount();
taskCount = min(taskCount,threadCount,maxTasks);
    /* parallel invocation of all tasks */
dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
parallel_for(taskCount, [&](const Index taskIndex) {
const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
values[taskIndex] = func(range<Index>(k0,k1));
});
/* perform reduction over all tasks */
Value v = identity;
for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
return v;
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS))
/* fast path for small number of iterations */
Index taskCount = (last-first+minStepSize-1)/minStepSize;
if (likely(taskCount == 1)) {
return func(range<Index>(first,last));
}
return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction,context);
// -- GODOT start --
// if (context.is_group_execution_cancelled())
// throw std::runtime_error("task cancelled");
// -- GODOT end --
return v;
#else
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction);
// -- GODOT start --
// if (tbb::task::self().is_cancelled())
// throw std::runtime_error("task cancelled");
// -- GODOT end --
return v;
#endif
#else // TASKING_PPL
struct AlignedValue
{
char storage[__alignof(Value)+sizeof(Value)];
static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
operator Value() const { return *getValuePtr(); }
};
struct Iterator_Index
{
Index v;
typedef std::forward_iterator_tag iterator_category;
typedef AlignedValue value_type;
typedef Index difference_type;
typedef Index distance_type;
typedef AlignedValue* pointer;
typedef AlignedValue& reference;
__forceinline Iterator_Index() {}
__forceinline Iterator_Index(Index v) : v(v) {}
__forceinline bool operator== (Iterator_Index other) { return v == other.v; }
__forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
__forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
__forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
};
auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
assert(begin.v < end.v);
return reduction(start, func(range<Index>(begin.v, end.v)));
};
const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
return v;
#endif
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
{
if (likely(last-first < parallel_threshold)) {
return func(range<Index>(first,last));
} else {
return parallel_reduce(first,last,minStepSize,identity,func,reduction);
}
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
{
return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
{
auto funcr = [&] ( const range<Index> r ) {
Value v = identity;
for (Index i=r.begin(); i<r.end(); i++)
v = reduction(v,func(i));
return v;
};
return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
{
return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
}
}
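A minimal usage sketch of the blocked parallel_reduce overload above: each task reduces one range, then the per-task results are combined with the reduction functor. The include path and an initialized tasking backend are assumptions.
#include "parallel_reduce.h"   // header shown above; adjust the include path to the source layout
#include <cstdio>
int main() {
  // sum of squares over [0,100000), computed block-wise and then reduced
  const size_t sum = embree::parallel_reduce(size_t(0), size_t(100000), size_t(1024), size_t(0),
    [](const embree::range<size_t>& r) {
      size_t s = 0;
      for (size_t i = r.begin(); i < r.end(); i++) s += i * i;
      return s;
    },
    [](size_t a, size_t b) { return a + b; });
  std::printf("sum of squares = %zu\n", sum);
  return 0;
}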

View file

@ -0,0 +1,43 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_set.h"
#include "../sys/regression.h"
namespace embree
{
struct parallel_set_regression_test : public RegressionTest
{
parallel_set_regression_test(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
/* create vector with random numbers */
const size_t N = 10000;
std::vector<uint32_t> unsorted(N);
for (size_t i=0; i<N; i++) unsorted[i] = 2*rand();
      /* create a set from the numbers */
parallel_set<uint32_t> sorted;
sorted.init(unsorted);
/* check that all elements are in the set */
for (size_t i=0; i<N; i++) {
passed &= sorted.lookup(unsorted[i]);
}
/* check that these elements are not in the set */
for (size_t i=0; i<N; i++) {
passed &= !sorted.lookup(unsorted[i]+1);
}
return passed;
}
};
parallel_set_regression_test parallel_set_regression("parallel_set_regression_test");
}

View file

@ -0,0 +1,52 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_sort.h"
namespace embree
{
/* implementation of a set of values with parallel construction */
template<typename T>
class parallel_set
{
public:
/*! default constructor for the parallel set */
parallel_set () {}
/*! construction from vector */
template<typename Vector>
parallel_set (const Vector& in) { init(in); }
    /*! initializes the parallel set from a vector */
template<typename Vector>
void init(const Vector& in)
{
/* copy data to internal vector */
vec.resize(in.size());
parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
for (size_t i=r.begin(); i<r.end(); i++)
vec[i] = in[i];
});
/* sort the data */
std::vector<T> temp(in.size());
radix_sort<T>(vec.data(),temp.data(),vec.size());
}
/*! tests if some element is in the set */
__forceinline bool lookup(const T& elt) const {
return std::binary_search(vec.begin(), vec.end(), elt);
}
/*! clears all state */
void clear() {
vec.clear();
}
private:
std::vector<T> vec; //!< vector containing sorted elements
};
}
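A minimal usage sketch of parallel_set above: the input is copied and radix-sorted at init time, and membership queries use binary search. The include path and an initialized tasking backend are assumptions.
#include "parallel_set.h"   // header shown above; adjust the include path to the source layout
#include <cstdint>
#include <vector>
#include <cstdio>
int main() {
  // build a sorted set from an unsorted vector, then run membership queries
  std::vector<uint32_t> values = { 42u, 7u, 1000u, 3u };
  embree::parallel_set<uint32_t> set(values);
  std::printf("7: %d, 8: %d\n", (int)set.lookup(7u), (int)set.lookup(8u));   // expect 1, 0
  return 0;
}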

View file

@ -0,0 +1,50 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "parallel_sort.h"
#include "../sys/regression.h"
namespace embree
{
template<typename Key>
struct RadixSortRegressionTest : public RegressionTest
{
RadixSortRegressionTest(const char* name) : RegressionTest(name) {
registerRegressionTest(this);
}
bool run ()
{
bool passed = true;
const size_t M = 10;
for (size_t N=10; N<1000000; N=size_t(2.1*N))
{
std::vector<Key> src(N); memset(src.data(),0,N*sizeof(Key));
std::vector<Key> tmp(N); memset(tmp.data(),0,N*sizeof(Key));
for (size_t i=0; i<N; i++) src[i] = uint64_t(rand())*uint64_t(rand());
/* calculate checksum */
Key sum0 = 0; for (size_t i=0; i<N; i++) sum0 += src[i];
/* sort numbers */
for (size_t i=0; i<M; i++) {
radix_sort<Key>(src.data(),tmp.data(),N);
}
/* calculate checksum */
Key sum1 = 0; for (size_t i=0; i<N; i++) sum1 += src[i];
if (sum0 != sum1) passed = false;
/* check if numbers are sorted */
for (size_t i=1; i<N; i++)
passed &= src[i-1] <= src[i];
}
return passed;
}
};
RadixSortRegressionTest<uint32_t> test_u32("RadixSortRegressionTestU32");
RadixSortRegressionTest<uint64_t> test_u64("RadixSortRegressionTestU64");
}

View file

@ -0,0 +1,457 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../simd/simd.h"
#include "parallel_for.h"
#if defined(TASKING_GCD) && defined(BUILD_IOS)
#include "../sys/alloc.h"
#endif
#include <algorithm>
namespace embree
{
template<class T>
__forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
{
for(size_t i = 1;i<length;++i)
{
T v = array[i];
size_t j = i;
while(j > 0 && v < array[j-1])
{
array[j] = array[j-1];
--j;
}
array[j] = v;
}
}
template<class T>
__forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
{
for(size_t i = 1;i<length;++i)
{
T v = array[i];
size_t j = i;
while(j > 0 && v > array[j-1])
{
array[j] = array[j-1];
--j;
}
array[j] = v;
}
}
template<class T>
void quicksort_ascending(T *__restrict__ t,
const ssize_t begin,
const ssize_t end)
{
if (likely(begin < end))
{
const T pivotvalue = t[begin];
ssize_t left = begin - 1;
ssize_t right = end + 1;
while(1)
{
while (t[--right] > pivotvalue);
while (t[++left] < pivotvalue);
if (left >= right) break;
const T temp = t[right];
t[right] = t[left];
t[left] = temp;
}
const int pivot = right;
quicksort_ascending(t, begin, pivot);
quicksort_ascending(t, pivot + 1, end);
}
}
template<class T>
void quicksort_decending(T *__restrict__ t,
const ssize_t begin,
const ssize_t end)
{
if (likely(begin < end))
{
const T pivotvalue = t[begin];
ssize_t left = begin - 1;
ssize_t right = end + 1;
while(1)
{
while (t[--right] < pivotvalue);
while (t[++left] > pivotvalue);
if (left >= right) break;
const T temp = t[right];
t[right] = t[left];
t[left] = temp;
}
const int pivot = right;
quicksort_decending(t, begin, pivot);
quicksort_decending(t, pivot + 1, end);
}
}
template<class T, ssize_t THRESHOLD>
void quicksort_insertionsort_ascending(T *__restrict__ t,
const ssize_t begin,
const ssize_t end)
{
if (likely(begin < end))
{
const ssize_t size = end-begin+1;
if (likely(size <= THRESHOLD))
{
insertionsort_ascending<T>(&t[begin],size);
}
else
{
const T pivotvalue = t[begin];
ssize_t left = begin - 1;
ssize_t right = end + 1;
while(1)
{
while (t[--right] > pivotvalue);
while (t[++left] < pivotvalue);
if (left >= right) break;
const T temp = t[right];
t[right] = t[left];
t[left] = temp;
}
const ssize_t pivot = right;
quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
}
}
}
template<class T, ssize_t THRESHOLD>
void quicksort_insertionsort_decending(T *__restrict__ t,
const ssize_t begin,
const ssize_t end)
{
if (likely(begin < end))
{
const ssize_t size = end-begin+1;
if (likely(size <= THRESHOLD))
{
insertionsort_decending<T>(&t[begin],size);
}
else
{
const T pivotvalue = t[begin];
ssize_t left = begin - 1;
ssize_t right = end + 1;
while(1)
{
while (t[--right] < pivotvalue);
while (t[++left] > pivotvalue);
if (left >= right) break;
const T temp = t[right];
t[right] = t[left];
t[left] = temp;
}
const ssize_t pivot = right;
quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
}
}
}
template<typename T>
static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
{
static const unsigned int BITS = 8;
static const unsigned int BUCKETS = (1 << BITS);
static const unsigned int CMP_SORT_THRESHOLD = 16;
__aligned(64) unsigned int count[BUCKETS];
/* clear buckets */
for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
/* count buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
for (size_t i=0;i<num;i++)
count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
/* prefix sums */
__aligned(64) unsigned int head[BUCKETS];
__aligned(64) unsigned int tail[BUCKETS];
head[0] = 0;
for (size_t i=1; i<BUCKETS; i++)
head[i] = head[i-1] + count[i-1];
for (size_t i=0; i<BUCKETS-1; i++)
tail[i] = head[i+1];
tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
assert(tail[BUCKETS-1] == num);
/* in-place swap */
for (size_t i=0;i<BUCKETS;i++)
{
/* process bucket */
while(head[i] < tail[i])
{
T v = morton[head[i]];
while(1)
{
const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
if (b == i) break;
std::swap(v,morton[head[b]++]);
}
assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
morton[head[i]++] = v;
}
}
if (shift == 0) return;
size_t offset = 0;
for (size_t i=0;i<BUCKETS;i++)
if (count[i])
{
for (size_t j=offset;j<offset+count[i]-1;j++)
assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
if (unlikely(count[i] < CMP_SORT_THRESHOLD))
insertionsort_ascending(morton + offset, count[i]);
else
radixsort32(morton + offset, count[i], shift-BITS);
for (size_t j=offset;j<offset+count[i]-1;j++)
assert(morton[j] <= morton[j+1]);
offset += count[i];
}
}
template<typename Ty, typename Key>
class ParallelRadixSort
{
static const size_t MAX_TASKS = 64;
static const size_t BITS = 8;
static const size_t BUCKETS = (1 << BITS);
typedef unsigned int TyRadixCount[BUCKETS];
template<typename T>
static bool compare(const T& v0, const T& v1) {
return (Key)v0 < (Key)v1;
}
private:
ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
public:
ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
: radixCount(nullptr), src(src), tmp(tmp), N(N) {}
void sort(const size_t blockSize)
{
assert(blockSize > 0);
/* perform single threaded sort for small N */
      if (N<=blockSize) // also handles the special case of N == 0
{
        /* do in-place sort inside the destination array */
std::sort(src,src+N,compare<Ty>);
}
/* perform parallel sort for large N */
else
{
const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
tbbRadixSort(numThreads);
}
}
~ParallelRadixSort()
{
alignedFree(radixCount);
radixCount = nullptr;
}
private:
void tbbRadixIteration0(const Key shift,
const Ty* __restrict const src,
Ty* __restrict const dst,
const size_t threadIndex, const size_t threadCount)
{
const size_t startID = (threadIndex+0)*N/threadCount;
const size_t endID = (threadIndex+1)*N/threadCount;
/* mask to extract some number of bits */
const Key mask = BUCKETS-1;
/* count how many items go into the buckets */
for (size_t i=0; i<BUCKETS; i++)
radixCount[threadIndex][i] = 0;
/* iterate over src array and count buckets */
unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
for (size_t i=startID; i<endID; i++) {
#if defined(__X86_64__) || defined(__aarch64__)
const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
const Key index = ((Key)src[i] >> shift) & mask;
#endif
count[index]++;
}
}
void tbbRadixIteration1(const Key shift,
const Ty* __restrict const src,
Ty* __restrict const dst,
const size_t threadIndex, const size_t threadCount)
{
const size_t startID = (threadIndex+0)*N/threadCount;
const size_t endID = (threadIndex+1)*N/threadCount;
/* mask to extract some number of bits */
const Key mask = BUCKETS-1;
/* calculate total number of items for each bucket */
__aligned(64) unsigned int total[BUCKETS];
/*
for (size_t i=0; i<BUCKETS; i++)
total[i] = 0;
*/
for (size_t i=0; i<BUCKETS; i+=VSIZEX)
vintx::store(&total[i], zero);
for (size_t i=0; i<threadCount; i++)
{
/*
for (size_t j=0; j<BUCKETS; j++)
total[j] += radixCount[i][j];
*/
for (size_t j=0; j<BUCKETS; j+=VSIZEX)
vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
}
/* calculate start offset of each bucket */
__aligned(64) unsigned int offset[BUCKETS];
offset[0] = 0;
for (size_t i=1; i<BUCKETS; i++)
offset[i] = offset[i-1] + total[i-1];
/* calculate start offset of each bucket for this thread */
for (size_t i=0; i<threadIndex; i++)
{
/*
for (size_t j=0; j<BUCKETS; j++)
offset[j] += radixCount[i][j];
*/
for (size_t j=0; j<BUCKETS; j+=VSIZEX)
vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
}
/* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
for (size_t i=startID; i<endID; i++) {
const Ty elt = src[i];
#if defined(__X86_64__) || defined(__aarch64__)
const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
const size_t index = ((Key)src[i] >> shift) & mask;
#endif
dst[offset[index]++] = elt;
}
}
void tbbRadixIteration(const Key shift, const bool last,
const Ty* __restrict src, Ty* __restrict dst,
const size_t numTasks)
{
affinity_partitioner ap;
parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
}
void tbbRadixSort(const size_t numTasks)
{
radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
if (sizeof(Key) == sizeof(uint32_t)) {
tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
}
else if (sizeof(Key) == sizeof(uint64_t))
{
tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
}
}
private:
TyRadixCount* radixCount;
Ty* const src;
Ty* const tmp;
const size_t N;
};
template<typename Ty>
void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
{
ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
}
template<typename Ty, typename Key>
void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
{
ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
}
template<typename Ty>
void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
}
template<typename Ty>
void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
}
}
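A minimal usage sketch of the radix_sort helpers above: the sort needs a scratch buffer of the same size and leaves the sorted result back in the source array. The include path and an initialized tasking backend are assumptions.
#include "parallel_sort.h"   // header shown above; adjust the include path to the source layout
#include <cstdint>
#include <vector>
#include <cstdio>
int main() {
  // radix_sort needs a temporary buffer of the same size as the input
  std::vector<uint64_t> keys(1 << 20);
  std::vector<uint64_t> tmp(keys.size());
  for (size_t i = 0; i < keys.size(); i++) keys[i] = (keys.size() - i) * 2654435761ull;
  embree::radix_sort_u64(keys.data(), tmp.data(), keys.size());   // sorted result ends up in 'keys'
  std::printf("min = %llu, max = %llu\n",
              (unsigned long long)keys.front(), (unsigned long long)keys.back());
  return 0;
}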

View file

@ -0,0 +1,101 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "stringstream.h"
#include "../sys/filename.h"
#include "../math/vec2.h"
#include "../math/vec3.h"
#include "../math/col3.h"
#include "../math/color.h"
namespace embree
{
/*! helper class for simple command line parsing */
class ParseStream : public Stream<std::string>
{
public:
ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {}
ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
const std::string& endl = "", bool multiLine = false)
: cin(new StringStream(cin,seps,endl,multiLine)) {}
public:
ParseLocation location() { return cin->loc(); }
std::string next() { return cin->get(); }
void force(const std::string& next) {
std::string token = getString();
if (token != next)
THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found");
}
std::string getString() {
return get();
}
FileName getFileName() {
return FileName(get());
}
int getInt () {
return atoi(get().c_str());
}
Vec2i getVec2i() {
int x = atoi(get().c_str());
int y = atoi(get().c_str());
return Vec2i(x,y);
}
Vec3ia getVec3ia() {
int x = atoi(get().c_str());
int y = atoi(get().c_str());
int z = atoi(get().c_str());
return Vec3ia(x,y,z);
}
float getFloat() {
return (float)atof(get().c_str());
}
Vec2f getVec2f() {
float x = (float)atof(get().c_str());
float y = (float)atof(get().c_str());
return Vec2f(x,y);
}
Vec3f getVec3f() {
float x = (float)atof(get().c_str());
float y = (float)atof(get().c_str());
float z = (float)atof(get().c_str());
return Vec3f(x,y,z);
}
Vec3fa getVec3fa() {
float x = (float)atof(get().c_str());
float y = (float)atof(get().c_str());
float z = (float)atof(get().c_str());
return Vec3fa(x,y,z);
}
Col3f getCol3f() {
float x = (float)atof(get().c_str());
float y = (float)atof(get().c_str());
float z = (float)atof(get().c_str());
return Col3f(x,y,z);
}
Color getColor() {
float r = (float)atof(get().c_str());
float g = (float)atof(get().c_str());
float b = (float)atof(get().c_str());
return Color(r,g,b);
}
private:
Ref<Stream<std::string> > cin;
};
}
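A minimal usage sketch of ParseStream above, fed by the StdStream wrapper defined in the next file. Ref, StringStream and Vec3f come from headers not shown in this hunk, so treat the exact spelling here as an assumption; the tokenizer splits on whitespace by default.
#include "parsestream.h"   // header shown above; adjust the include path to the source layout
#include <sstream>
#include <cstdio>
int main() {
  // tokenize a small command string and read typed values from it
  std::istringstream source("translate 1.0 2.0 3.0");
  embree::Ref<embree::Stream<int> > chars = new embree::StdStream(source, "example");
  embree::ParseStream tokens(chars);            // default separators: whitespace
  tokens.force("translate");                    // consume and verify the keyword
  const embree::Vec3f v = tokens.getVec3f();    // read three floats
  std::printf("%f %f %f\n", v.x, v.y, v.z);
  return 0;
}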

View file

@ -0,0 +1,215 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../sys/platform.h"
#include "../sys/ref.h"
#include "../sys/filename.h"
#include "../sys/string.h"
#include <vector>
#include <iostream>
#include <cstdio>
#include <string.h>
namespace embree
{
/*! stores the location of a stream element in the source */
class ParseLocation
{
public:
ParseLocation () : lineNumber(-1), colNumber(-1) {}
ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/)
: fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {}
std::string str() const
{
std::string str = "unknown";
if (fileName) str = *fileName;
if (lineNumber >= 0) str += " line " + toString(lineNumber);
if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber);
return str;
}
private:
std::shared_ptr<std::string> fileName; /// name of the file (or stream) the token is from
ssize_t lineNumber; /// the line number the token is from
ssize_t colNumber; /// the character number in the current line
};
/*! a stream class templated over the stream elements */
template<typename T> class Stream : public RefCount
{
enum { BUF_SIZE = 1024 };
private:
virtual T next() = 0;
virtual ParseLocation location() = 0;
__forceinline std::pair<T,ParseLocation> nextHelper() {
ParseLocation l = location();
T v = next();
return std::pair<T,ParseLocation>(v,l);
}
__forceinline void push_back(const std::pair<T,ParseLocation>& v) {
if (past+future == BUF_SIZE) pop_front();
size_t end = (start+past+future++)%BUF_SIZE;
buffer[end] = v;
}
__forceinline void pop_front() {
if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty");
start = (start+1)%BUF_SIZE; past--;
}
public:
Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {}
virtual ~Stream() {}
public:
const ParseLocation& loc() {
if (future == 0) push_back(nextHelper());
return buffer[(start+past)%BUF_SIZE].second;
}
T get() {
if (future == 0) push_back(nextHelper());
T t = buffer[(start+past)%BUF_SIZE].first;
past++; future--;
return t;
}
const T& peek() {
if (future == 0) push_back(nextHelper());
return buffer[(start+past)%BUF_SIZE].first;
}
const T& unget(size_t n = 1) {
if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items");
past -= n; future += n;
return peek();
}
void drop() {
if (future == 0) push_back(nextHelper());
past++; future--;
}
private:
size_t start,past,future; //!< ring buffer state: index of oldest buffered entry, #entries already consumed (kept for unget), #entries read ahead
std::vector<std::pair<T,ParseLocation> > buffer;
};
/*! wraps an iostream stream */
class StdStream : public Stream<int>
{
public:
StdStream (std::istream& cin, const std::string& name = "std::stream")
: cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
~StdStream() {}
ParseLocation location() {
return ParseLocation(name,lineNumber,colNumber,charNumber);
}
int next() {
int c = cin.get();
if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
charNumber++;
return c;
}
private:
std::istream& cin;
ssize_t lineNumber; /// the line number the token is from
ssize_t colNumber; /// the character number in the current line
ssize_t charNumber; /// the character in the file
std::shared_ptr<std::string> name; /// name of buffer
};
/*! creates a stream from a file */
class FileStream : public Stream<int>
{
public:
FileStream (FILE* file, const std::string& name = "file")
: file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
FileStream (const FileName& fileName)
: lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str())))
{
file = fopen(fileName.c_str(),"r");
if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str());
}
~FileStream() { if (file) fclose(file); }
public:
ParseLocation location() {
return ParseLocation(name,lineNumber,colNumber,charNumber);
}
int next() {
int c = fgetc(file);
if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
charNumber++;
return c;
}
private:
FILE* file;
ssize_t lineNumber; /// the line number the token is from
ssize_t colNumber; /// the character number in the current line
ssize_t charNumber; /// the character in the file
std::shared_ptr<std::string> name; /// name of buffer
};
/*! creates a stream from a string */
class StrStream : public Stream<int>
{
public:
StrStream (const char* str)
: str(str), lineNumber(1), colNumber(0), charNumber(0) {}
public:
ParseLocation location() {
return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber);
}
int next() {
int c = str[charNumber];
if (c == 0) return EOF;
if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
charNumber++;
return c;
}
private:
const char* str;
ssize_t lineNumber; /// the line number the token is from
ssize_t colNumber; /// the character number in the current line
ssize_t charNumber; /// the character in the file
};
/*! creates a character stream from a command line */
class CommandLineStream : public Stream<int>
{
public:
CommandLineStream (int argc, char** argv, const std::string& name = "command line")
: i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name)))
{
if (argc > 0) {
for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++;
charNumber++;
}
for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]);
}
~CommandLineStream() {}
public:
ParseLocation location() {
return ParseLocation(name,0,charNumber,charNumber);
}
int next() {
if (i == args.size()) return EOF;
if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; }
charNumber++;
return args[i][j++];
}
private:
size_t i,j;
std::vector<std::string> args;
ssize_t charNumber; /// the character in the file
std::shared_ptr<std::string> name; /// name of buffer
};
}
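To make the buffering above concrete, a small illustrative sketch (not part of the vendored code) of the peek/get/unget contract:
// Illustrative sketch, not part of the patch: the ring buffer lets a parser
// look ahead and back up without re-reading the source.
void stream_example()
{
  using namespace embree;
  Ref<Stream<int> > s = new StrStream("abc");
  int a = s->peek();   // 'a', not consumed
  int b = s->get();    // consumes 'a' (a == b)
  s->unget();          // back up one element
  int c = s->get();    // 'a' again, served from the buffer
  (void)a; (void)b; (void)c;
}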

View file

@ -0,0 +1,39 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "stream.h"
namespace embree
{
/* removes all line comments from a stream */
class LineCommentFilter : public Stream<int>
{
public:
LineCommentFilter (const FileName& fileName, const std::string& lineComment)
: cin(new FileStream(fileName)), lineComment(lineComment) {}
LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment)
: cin(cin), lineComment(lineComment) {}
ParseLocation location() { return cin->loc(); }
int next()
{
/* look if the line comment starts here */
for (size_t j=0; j<lineComment.size(); j++) {
if (cin->peek() != lineComment[j]) { cin->unget(j); goto not_found; }
cin->get();
}
/* eat all characters until the end of the line (or file) */
while (cin->peek() != '\n' && cin->peek() != EOF) cin->get();
not_found:
return cin->get();
}
private:
Ref<Stream<int> > cin;
std::string lineComment;
};
}
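An illustrative sketch (not part of the vendored code) of chaining the filter in front of the tokenizer; the file name and comment marker are placeholders:
// Illustrative sketch, not part of the patch: strip '#' line comments from a
// text file, then tokenize what is left. "config.txt" is a placeholder name.
void filter_example()
{
  using namespace embree;
  Ref<Stream<int> > raw      = new FileStream(FileName("config.txt"));
  Ref<Stream<int> > filtered = new LineCommentFilter(raw, "#");
  Ref<Stream<std::string> > tokens = new StringStream(filtered);
  while (tokens->peek() != "")          // StringStream yields "" at end of input
    printf("%s\n", tokens->get().c_str());
}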

View file

@ -0,0 +1,51 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "stringstream.h"
namespace embree
{
static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
/* creates map for fast categorization of characters */
static void createCharMap(bool map[256], const std::string& chrs) {
for (size_t i=0; i<256; i++) map[i] = false;
for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true;
}
/* simple tokenizer */
StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine)
: cin(cin), endl(endl), multiLine(multiLine)
{
createCharMap(isSepMap,seps);
createCharMap(isValidCharMap,stringChars);
}
std::string StringStream::next()
{
/* skip separators */
while (cin->peek() != EOF) {
if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; }
if (multiLine && cin->peek() == '\\') {
cin->drop();
if (cin->peek() == '\n') { cin->drop(); continue; }
cin->unget();
}
if (!isSeparator(cin->peek())) break;
cin->drop();
}
/* parse everything until the next separator */
std::vector<char> str; str.reserve(64);
while (cin->peek() != EOF && !isSeparator(cin->peek())) {
int c = cin->get();
// -- GODOT start --
// if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
if (!isValidChar(c)) abort();
// -- GODOT end --
str.push_back((char)c);
}
str.push_back(0);
return std::string(str.data());
}
}

View file

@ -0,0 +1,29 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "stream.h"
namespace embree
{
/*! simple tokenizer that produces a string stream */
class StringStream : public Stream<std::string>
{
public:
StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
const std::string& endl = "", bool multiLine = false);
public:
ParseLocation location() { return cin->loc(); }
std::string next();
private:
__forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
__forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; }
private:
Ref<Stream<int> > cin; /*! source character stream */
bool isSepMap[256]; /*! map for fast classification of separators */
bool isValidCharMap[256]; /*! map for valid characters */
std::string endl; /*! the token of the end of line */
bool multiLine; /*! whether to parse lines wrapped with \ */
};
}

View file

@ -0,0 +1,181 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "tokenstream.h"
#include "../math/math.h"
namespace embree
{
/* shorthands for common sets of characters */
const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz";
const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
const std::string TokenStream::numbers = "0123456789";
const std::string TokenStream::separators = "\n\t\r ";
const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
/* creates map for fast categorization of characters */
static void createCharMap(bool map[256], const std::string& chrs) {
for (size_t i=0; i<256; i++) map[i] = false;
for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true;
}
/* build full tokenizer that takes list of valid characters and keywords */
TokenStream::TokenStream(const Ref<Stream<int> >& cin, //< stream to read from
const std::string& alpha, //< valid characters for identifiers
const std::string& seps, //< characters that act as separators
const std::vector<std::string>& symbols) //< symbols
: cin(cin), symbols(symbols)
{
createCharMap(isAlphaMap,alpha);
createCharMap(isSepMap,seps);
createCharMap(isStringCharMap,stringChars);
}
bool TokenStream::decDigits(std::string& str_o)
{
bool ok = false;
std::string str;
if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get();
while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); }
if (ok) str_o += str;
else cin->unget(str.size());
return ok;
}
bool TokenStream::decDigits1(std::string& str_o)
{
bool ok = false;
std::string str;
while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); }
if (ok) str_o += str; else cin->unget(str.size());
return ok;
}
bool TokenStream::trySymbol(const std::string& symbol)
{
size_t pos = 0;
while (pos < symbol.size()) {
if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; }
cin->drop(); pos++;
}
return true;
}
bool TokenStream::trySymbols(Token& token, const ParseLocation& loc)
{
for (size_t i=0; i<symbols.size(); i++) {
if (!trySymbol(symbols[i])) continue;
token = Token(symbols[i],Token::TY_SYMBOL,loc);
return true;
}
return false;
}
bool TokenStream::tryFloat(Token& token, const ParseLocation& loc)
{
bool ok = false;
std::string str;
if (trySymbol("nan")) {
token = Token(float(nan));
return true;
}
if (trySymbol("+inf")) {
token = Token(float(pos_inf));
return true;
}
if (trySymbol("-inf")) {
token = Token(float(neg_inf));
return true;
}
if (decDigits(str))
{
if (cin->peek() == '.') {
str += (char)cin->get();
decDigits(str);
if (cin->peek() == 'e' || cin->peek() == 'E') {
str += (char)cin->get();
if (decDigits(str)) ok = true; // 1.[2]E2
}
else ok = true; // 1.[2]
}
else if (cin->peek() == 'e' || cin->peek() == 'E') {
str += (char)cin->get();
if (decDigits(str)) ok = true; // 1E2
}
}
else
{
if (cin->peek() == '.') {
str += (char)cin->get();
if (decDigits(str)) {
if (cin->peek() == 'e' || cin->peek() == 'E') {
str += (char)cin->get();
if (decDigits(str)) ok = true; // .3E2
}
else ok = true; // .3
}
}
}
if (ok) {
token = Token((float)atof(str.c_str()),loc);
}
else cin->unget(str.size());
return ok;
}
bool TokenStream::tryInt(Token& token, const ParseLocation& loc) {
std::string str;
if (decDigits(str)) {
token = Token(atoi(str.c_str()),loc);
return true;
}
return false;
}
bool TokenStream::tryString(Token& token, const ParseLocation& loc)
{
std::string str;
if (cin->peek() != '\"') return false;
cin->drop();
while (cin->peek() != '\"') {
const int c = cin->get();
if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str());
str += (char)c;
}
cin->drop();
token = Token(str,Token::TY_STRING,loc);
return true;
}
bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc)
{
std::string str;
if (!isAlpha(cin->peek())) return false;
str += (char)cin->get();
while (isAlphaNum(cin->peek())) str += (char)cin->get();
token = Token(str,Token::TY_IDENTIFIER,loc);
return true;
}
void TokenStream::skipSeparators()
{
/* skip separators */
while (cin->peek() != EOF && isSeparator(cin->peek()))
cin->drop();
}
Token TokenStream::next()
{
Token token;
skipSeparators();
ParseLocation loc = cin->loc();
if (trySymbols (token,loc)) return token; /**< try to parse a symbol */
if (tryFloat (token,loc)) return token; /**< try to parse float */
if (tryInt (token,loc)) return token; /**< try to parse integer */
if (tryString (token,loc)) return token; /**< try to parse string */
if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */
if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */
return Token((char)cin->get(),loc); /**< return invalid character token */
}
}

View file

@ -0,0 +1,164 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "stream.h"
#include <string>
#include <vector>
namespace embree
{
/*! token class */
class Token
{
public:
enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL };
Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {}
Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {}
Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {}
Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {}
Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {}
static Token Eof() { return Token(); }
static Token Sym(std::string str) { return Token(str,TY_SYMBOL); }
static Token Str(std::string str) { return Token(str,TY_STRING); }
static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); }
char Char() const {
if (ty == TY_CHAR) return c;
THROW_RUNTIME_ERROR(loc.str()+": character expected");
}
int Int() const {
if (ty == TY_INT) return i;
THROW_RUNTIME_ERROR(loc.str()+": integer expected");
}
float Float(bool cast = true) const {
if (ty == TY_FLOAT) return f;
if (ty == TY_INT && cast) return (float)i;
THROW_RUNTIME_ERROR(loc.str()+": float expected");
}
std::string Identifier() const {
if (ty == TY_IDENTIFIER) return str;
THROW_RUNTIME_ERROR(loc.str()+": identifier expected");
}
std::string String() const {
if (ty == TY_STRING) return str;
THROW_RUNTIME_ERROR(loc.str()+": string expected");
}
std::string Symbol() const {
if (ty == TY_SYMBOL) return str;
THROW_RUNTIME_ERROR(loc.str()+": symbol expected");
}
const ParseLocation& Location() const { return loc; }
friend bool operator==(const Token& a, const Token& b)
{
if (a.ty != b.ty) return false;
if (a.ty == TY_CHAR) return a.c == b.c;
if (a.ty == TY_INT) return a.i == b.i;
if (a.ty == TY_FLOAT) return a.f == b.f;
if (a.ty == TY_IDENTIFIER) return a.str == b.str;
if (a.ty == TY_STRING) return a.str == b.str;
if (a.ty == TY_SYMBOL) return a.str == b.str;
return true;
}
friend bool operator!=(const Token& a, const Token& b) {
return !(a == b);
}
friend bool operator <( const Token& a, const Token& b ) {
if (a.ty != b.ty) return (int)a.ty < (int)b.ty;
if (a.ty == TY_CHAR) return a.c < b.c;
if (a.ty == TY_INT) return a.i < b.i;
if (a.ty == TY_FLOAT) return a.f < b.f;
if (a.ty == TY_IDENTIFIER) return a.str < b.str;
if (a.ty == TY_STRING) return a.str < b.str;
if (a.ty == TY_SYMBOL) return a.str < b.str;
return false;
}
friend std::ostream& operator<<(std::ostream& cout, const Token& t)
{
if (t.ty == TY_EOF) return cout << "eof";
if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")";
if (t.ty == TY_INT) return cout << "Int(" << t.i << ")";
if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")";
if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")";
if (t.ty == TY_STRING) return cout << "String(" << t.str << ")";
if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")";
return cout << "unknown";
}
private:
Type ty; //< the type of the token
union {
char c; //< data for char tokens
int i; //< data for int tokens
float f; //< data for float tokens
};
std::string str; //< data for string and identifier tokens
ParseLocation loc; //< the location the token is from
};
/*! build full tokenizer that takes list of valid characters and keywords */
class TokenStream : public Stream<Token>
{
public:
/*! shorthands for common sets of characters */
static const std::string alpha;
static const std::string ALPHA;
static const std::string numbers;
static const std::string separators;
static const std::string stringChars;
public:
TokenStream(const Ref<Stream<int> >& cin,
const std::string& alpha, //< valid characters for identifiers
const std::string& seps, //< characters that act as separators
const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols
public:
ParseLocation location() { return cin->loc(); }
Token next();
bool trySymbol(const std::string& symbol);
private:
void skipSeparators();
bool decDigits(std::string& str);
bool decDigits1(std::string& str);
bool trySymbols(Token& token, const ParseLocation& loc);
bool tryFloat(Token& token, const ParseLocation& loc);
bool tryInt(Token& token, const ParseLocation& loc);
bool tryString(Token& token, const ParseLocation& loc);
bool tryIdentifier(Token& token, const ParseLocation& loc);
Ref<Stream<int> > cin;
bool isSepMap[256];
bool isAlphaMap[256];
bool isStringCharMap[256];
std::vector<std::string> symbols;
/*! checks if a character is a separator */
__forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
/*! checks if a character is a number */
__forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; }
/*! checks if a character is valid inside a string */
__forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; }
/*! checks if a character is legal for an identifier */
__forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; }
__forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); }
};
}
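A hedged sketch (not part of the vendored code) of driving the tokenizer above on a made-up input:
// Illustrative sketch, not part of the patch: tokenizing a small expression.
void token_example()
{
  using namespace embree;
  std::vector<std::string> symbols;
  symbols.push_back("=");
  Ref<Stream<int> > chars = new StrStream("radius = 2.5");
  Ref<TokenStream> tokens = new TokenStream(chars,
                                            TokenStream::alpha + TokenStream::ALPHA + "_",
                                            TokenStream::separators,
                                            symbols);
  std::string name  = tokens->get().Identifier();  // "radius"
  tokens->get();                                   // the "=" symbol
  float       value = tokens->get().Float();       // 2.5f
  (void)name; (void)value;
}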

View file

@ -0,0 +1,986 @@
#pragma once
#include "SSE2NEON.h"
#define AVX2NEON_ABI static inline __attribute__((always_inline))
struct __m256d;
struct __m256 {
__m128 lo,hi;
__m256() {}
};
struct __m256i {
__m128i lo,hi;
explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}
operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}
__m256i() {}
};
struct __m256d {
float64x2_t lo,hi;
__m256d() {}
__m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
__m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
};
#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}
#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}
#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}
#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}
#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;}
#define _mm_stream_load_si128 _mm_load_si128
#define _mm256_stream_load_si256 _mm256_load_si256
AVX2NEON_ABI
__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
{
__m128 res;
for (int i=0;i<4;i++)
{
if (imm8 & (1<<i))
{
res[i] = b[i];
}
else{
res[i] = a[i];
}
}
return res;
}
AVX2NEON_ABI
__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
{
__m128i res;
for (int i=0;i<4;i++)
{
if (imm8 & (1<<i))
{
res[i] = b[i];
}
else{
res[i] = a[i];
}
}
return res;
}
AVX2NEON_ABI
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
{
return __m128(vmvnq_s32(__m128i(_mm_cmpgt_ps(a,b))));
}
AVX2NEON_ABI
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
{
int64x2_t y;
y[0] = *(int64_t *)mem_addr;
y[1] = 0;
return __m128i(y);
}
AVX2NEON_ABI
int _mm_movemask_popcnt(__m128 a)
{
return __builtin_popcount(_mm_movemask_ps(a));
}
AVX2NEON_ABI
__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
{
__m128 res;
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0;
}
return res;
}
AVX2NEON_ABI
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
{
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) mem_addr[i] = a[i];
}
}
AVX2NEON_ABI
void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)
{
for (int i=0;i<4;i++) {
if (mask[i] & 0x80000000) mem_addr[i] = a[i];
}
}
AVX2NEON_ABI
__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)
{
return vnegq_f32(vfmaq_f32(c,a,b));
}
#define _mm_fnmsub_ss _mm_fnmsub_ps
AVX2NEON_ABI
__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)
{
return vfmsq_f32(c,a,b);
}
#define _mm_fnmadd_ss _mm_fnmadd_ps
AVX2NEON_ABI
__m128 _mm_broadcast_ss (float const * mem_addr)
{
return vdupq_n_f32(*mem_addr);
}
AVX2NEON_ABI
__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
{
return vfmaq_f32(vnegq_f32(c),a,b);
}
#define _mm_fmsub_ss _mm_fmsub_ps
#define _mm_fmadd_ps _mm_madd_ps
#define _mm_fmadd_ss _mm_madd_ps
template<int code>
AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)
{
float v;
v = 0;
v += (code & 0x10) ? a[0]*b[0] : 0;
v += (code & 0x20) ? a[1]*b[1] : 0;
v += (code & 0x40) ? a[2]*b[2] : 0;
v += (code & 0x80) ? a[3]*b[3] : 0;
float32x4_t res;
res[0] = (code & 0x1) ? v : 0;
res[1] = (code & 0x2) ? v : 0;
res[2] = (code & 0x4) ? v : 0;
res[3] = (code & 0x8) ? v : 0;
return res;
}
template<>
inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)
{
float v;
float32x4_t m = _mm_mul_ps(a,b);
m[3] = 0;
v = vaddvq_f32(m);
return _mm_set1_ps(v);
}
template<>
inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)
{
float v;
float32x4_t m = _mm_mul_ps(a,b);
v = vaddvq_f32(m);
return _mm_set1_ps(v);
}
#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))
AVX2NEON_ABI
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
{
return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b))));
}
AVX2NEON_ABI
__m128 _mm_permutevar_ps (__m128 a, __m128i b)
{
__m128 x;
for (int i=0;i<4;i++)
{
x[i] = a[b[i&3]];
}
return x;
}
AVX2NEON_ABI
__m256i _mm256_setzero_si256()
{
__m256i res;
res.lo = res.hi = vdupq_n_s32(0);
return res;
}
AVX2NEON_ABI
__m256 _mm256_setzero_ps()
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(0.0f);
return res;
}
AVX2NEON_ABI
__m256i _mm256_undefined_si256()
{
return _mm256_setzero_si256();
}
AVX2NEON_ABI
__m256 _mm256_undefined_ps()
{
return _mm256_setzero_ps();
}
CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t)
CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i)
CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128)
CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128)
CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t)
CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i)
AVX2NEON_ABI
__m128 _mm256_castps256_ps128 (__m256 a)
{
return a.lo;
}
AVX2NEON_ABI
__m256i _mm256_castsi128_si256 (__m128i a)
{
__m256i res;
res.lo = a ;
res.hi = vdupq_n_s32(0);
return res;
}
AVX2NEON_ABI
__m128i _mm256_castsi256_si128 (__m256i a)
{
return a.lo;
}
AVX2NEON_ABI
__m256 _mm256_castps128_ps256 (__m128 a)
{
__m256 res;
res.lo = a;
res.hi = vdupq_n_f32(0);
return res;
}
AVX2NEON_ABI
__m256 _mm256_broadcast_ss (float const * mem_addr)
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(*mem_addr);
return res;
}
AVX2NEON_ABI
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
__m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7};
__m256i res;
res.lo = lo; res.hi = hi;
return res;
}
AVX2NEON_ABI
__m256i _mm256_set1_epi32 (int a)
{
__m256i res;
res.lo = res.hi = vdupq_n_s32(a);
return res;
}
AVX2NEON_ABI
int _mm256_movemask_ps(const __m256& v)
{
return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo);
}
template<int imm8>
AVX2NEON_ABI
__m256 __mm256_permute_ps (const __m256& a)
{
__m256 res;
res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);
res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);
return res;
}
#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)
template<int imm8>
AVX2NEON_ABI
__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)
{
__m256 res;
res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8);
res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8);
return res;
}
#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
AVX2NEON_ABI
__m256i _mm256_set1_epi64x (long long a)
{
__m256i res;
int64x2_t t = vdupq_n_s64(a);
res.lo = res.hi = __m128i(t);
return res;
}
AVX2NEON_ABI
__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
{
__m256 res;
__m128 tmp;
switch (imm8 & 0x7)
{
case 0: tmp = a.lo; break;
case 1: tmp = a.hi; break;
case 2: tmp = b.lo; break;
case 3: tmp = b.hi; break;
}
if (imm8 & 0x8)
tmp = _mm_setzero_ps();
res.lo = tmp;
imm8 >>= 4;
switch (imm8 & 0x7)
{
case 0: tmp = a.lo; break;
case 1: tmp = a.hi; break;
case 2: tmp = b.lo; break;
case 3: tmp = b.hi; break;
}
if (imm8 & 0x8)
tmp = _mm_setzero_ps();
res.hi = tmp;
return res;
}
AVX2NEON_ABI
__m256 _mm256_moveldup_ps (__m256 a)
{
__m256 res;
res.lo[0] = res.lo[1] = a.lo[0];
res.lo[2] = res.lo[3] = a.lo[2];
res.hi[0] = res.hi[1] = a.hi[0];
res.hi[2] = res.hi[3] = a.hi[2];
return res;
}
AVX2NEON_ABI
__m256 _mm256_movehdup_ps (__m256 a)
{
__m256 res;
res.lo[0] = res.lo[1] = a.lo[1];
res.lo[2] = res.lo[3] = a.lo[3];
res.hi[0] = res.hi[1] = a.hi[1];
res.hi[2] = res.hi[3] = a.hi[3];
return res;
}
AVX2NEON_ABI
__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
{
__m256 res = a;
if (imm8 & 1) res.hi = b;
else res.lo = b;
return res;
}
AVX2NEON_ABI
__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
{
if (imm8 & 1) return a.hi;
return a.lo;
}
AVX2NEON_ABI
__m256d _mm256_movedup_pd (__m256d a)
{
__m256d res;
res.hi = a.hi;
res.lo[0] = res.lo[1] = a.lo[0];
return res;
}
AVX2NEON_ABI
__m256i _mm256_abs_epi32(__m256i a)
{
__m256i res;
res.lo = vabsq_s32(a.lo);
res.hi = vabsq_s32(a.hi);
return res;
}
UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)
UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)
UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)
BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)
BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)
BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)
BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)
BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)
BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)
BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)
BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)
BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)
BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)
BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)
BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)
BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)
BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)
BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)
BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)
BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)
BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)
BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps)
BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)
TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)
TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps)
TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)
TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)
TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)
BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)
BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)
BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32)
BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)
BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)
BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)
BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)
BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)
BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)
BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)
AVX2NEON_ABI
__m256i _mm256_cvtps_epi32 (__m256 a)
{
__m256i res;
res.lo = _mm_cvtps_epi32(a.lo);
res.hi = _mm_cvtps_epi32(a.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvttps_epi32 (__m256 a)
{
__m256i res;
res.lo = _mm_cvttps_epi32(a.lo);
res.hi = _mm_cvttps_epi32(a.hi);
return res;
}
AVX2NEON_ABI
__m256 _mm256_loadu_ps (float const * mem_addr)
{
__m256 res;
res.lo = *(__m128 *)(mem_addr + 0);
res.hi = *(__m128 *)(mem_addr + 4);
return res;
}
#define _mm256_load_ps _mm256_loadu_ps
AVX2NEON_ABI
int _mm256_testz_ps (const __m256& a, const __m256& b)
{
__m256 t = a;
if (&a != &b)
t = _mm256_and_ps(a,b);
__m128i l = vshrq_n_s32(__m128i(t.lo),31);
__m128i h = vshrq_n_s32(__m128i(t.hi),31);
return vaddvq_s32(vaddq_s32(l,h)) == 0;
}
AVX2NEON_ABI
__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)
{
__m256i res;
int64x2_t t0 = {e0,e1};
int64x2_t t1 = {e2,e3};
res.lo = __m128i(t0);
res.hi = __m128i(t1);
return res;
}
AVX2NEON_ABI
__m256d _mm256_setzero_pd ()
{
__m256d res;
res.lo = res.hi = vdupq_n_f64(0);
return res;
}
AVX2NEON_ABI
int _mm256_movemask_pd (__m256d a)
{
int res = 0;
uint64x2_t x;
x = uint64x2_t(a.lo);
res |= (x[0] >> 63) ? 1 : 0;
res |= (x[1] >> 63) ? 2 : 0;
x = uint64x2_t(a.hi);
res |= (x[0] >> 63) ? 4 : 0;
res |= (x[1] >> 63) ? 8 : 0;
return res;
}
AVX2NEON_ABI
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
{
__m256i res;
res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo)));
res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi)));
return res;
}
AVX2NEON_ABI
__m256i _mm256_cmpeq_pd (__m256d a, __m256d b)
{
__m256i res;
res.lo = __m128i(vceqq_f64(a.lo,b.lo));
res.hi = __m128i(vceqq_f64(a.hi,b.hi));
return res;
}
AVX2NEON_ABI
int _mm256_testz_pd (const __m256d& a, const __m256d& b)
{
__m256d t = a;
if (&a != &b)
t = _mm256_and_pd(a,b);
return _mm256_movemask_pd(t) == 0;
}
AVX2NEON_ABI
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
{
__m256d res;
uint64x2_t t = uint64x2_t(mask.lo);
res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0];
res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1];
t = uint64x2_t(mask.hi);
res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0];
res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1];
return res;
}
template<int imm8>
__m256 __mm256_dp_ps (__m256 a, __m256 b)
{
__m256 res;
res.lo = _mm_dp_ps(a.lo,b.lo,imm8);
res.hi = _mm_dp_ps(a.hi,b.hi,imm8);
return res;
}
#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)
AVX2NEON_ABI
double _mm256_permute4x64_pd_select(__m256d a, const int imm8)
{
switch (imm8 & 3) {
case 0:
return a.lo[0];
case 1:
return a.lo[1];
case 2:
return a.hi[0];
case 3:
return a.hi[1];
}
__builtin_unreachable();
return 0;
}
AVX2NEON_ABI
__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
{
__m256d res;
res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0);
res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2);
res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4);
res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6);
return res;
}
AVX2NEON_ABI
__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
{
return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8));
}
AVX2NEON_ABI
__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
{
__m256i res;
res.lo = *(__m128i *)((int32_t *)mem_addr + 0);
res.hi = *(__m128i *)((int32_t *)mem_addr + 4);
return res;
}
#define _mm256_load_si256 _mm256_loadu_si256
AVX2NEON_ABI
void _mm256_storeu_ps (float * mem_addr, __m256 a)
{
*(__m128 *)(mem_addr + 0) = a.lo;
*(__m128 *)(mem_addr + 4) = a.hi;
}
#define _mm256_store_ps _mm256_storeu_ps
#define _mm256_stream_ps _mm256_storeu_ps
AVX2NEON_ABI
void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
{
*(__m128i *)((int *)mem_addr + 0) = a.lo;
*(__m128i *)((int *)mem_addr + 4) = a.hi;
}
#define _mm256_store_si256 _mm256_storeu_si256
AVX2NEON_ABI
__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
{
__m256 res;
res.lo = _mm_maskload_ps(mem_addr,mask.lo);
res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepu8_epi32 (__m128i a)
{
__m256i res;
uint8x16_t x = uint8x16_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepi8_epi32 (__m128i a)
{
__m256i res;
int8x16_t x = int8x16_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepu16_epi32 (__m128i a)
{
__m256i res;
uint16x8_t x = uint16x8_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_cvtepi16_epi32 (__m128i a)
{
__m256i res;
int16x8_t x = int16x8_t(a);
for (int i=0;i<4;i++)
{
res.lo[i] = x[i];
res.hi[i] = x[i+4];
}
return res;
}
AVX2NEON_ABI
void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
{
_mm_maskstore_epi32(mem_addr,mask.lo,a.lo);
_mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi);
}
AVX2NEON_ABI
__m256i _mm256_slli_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_slli_epi32(a.lo,imm8);
res.hi = _mm_slli_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srli_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_srli_epi32(a.lo,imm8);
res.hi = _mm_srli_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srai_epi32 (__m256i a, int imm8)
{
__m256i res;
res.lo = _mm_srai_epi32(a.lo,imm8);
res.hi = _mm_srai_epi32(a.hi,imm8);
return res;
}
AVX2NEON_ABI
__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = vshlq_s32(a.lo,count.lo);
res.hi = vshlq_s32(a.hi,count.hi);
return res;
}
AVX2NEON_ABI
__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo));
res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi));
return res;
}
AVX2NEON_ABI
__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
{
__m256i res;
res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo)));
res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi)));
return res;
}
AVX2NEON_ABI
__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
{
return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
}
AVX2NEON_ABI
__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
{
if (imm8 & 1) return a.hi;
return a.lo;
}
AVX2NEON_ABI
__m256 _mm256_set1_ps(float x)
{
__m256 res;
res.lo = res.hi = vdupq_n_f32(x);
return res;
}
AVX2NEON_ABI
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
__m256 res;
res.lo = _mm_set_ps(e3,e2,e1,e0);
res.hi = _mm_set_ps(e7,e6,e5,e4);
return res;
}
AVX2NEON_ABI
__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
{
__m256 res;
res.lo = res.hi = *mem_addr;
return res;
}
AVX2NEON_ABI
__m256 _mm256_cvtepi32_ps (__m256i a)
{
__m256 res;
res.lo = _mm_cvtepi32_ps(a.lo);
res.hi = _mm_cvtepi32_ps(a.hi);
return res;
}
AVX2NEON_ABI
void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
{
for (int i=0;i<4;i++) {
if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i];
if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i];
}
}
AVX2NEON_ABI
__m256d _mm256_andnot_pd (__m256d a, __m256d b)
{
__m256d res;
res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo)));
res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi)));
return res;
}
AVX2NEON_ABI
__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
{
__m256 res;
res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf);
res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4);
return res;
}
AVX2NEON_ABI
__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
{
__m256i res;
res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf);
res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4);
return res;
}
AVX2NEON_ABI
__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
{
__m256i res;
for (int i=0;i<4;i++)
{
res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
}
return res;
}
AVX2NEON_ABI
__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
{
__m256i res = _mm256_setzero_si256();
for (int i=0;i<4;i++)
{
if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
}
return res;
}
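For context, not part of the vendored code: the emulation above stores a 256-bit vector as two NEON 128-bit halves, so an 8-wide operation is simply two 4-wide ones:
// Illustrative sketch (ARM builds only; relies on the SSE2NEON.h shim included above):
// an 8-float add through the emulated AVX types.
static inline __m256 add8(const float* a, const float* b)
{
    return _mm256_add_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b)); // expands to _mm_add_ps on .lo and .hi
}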

File diff suppressed because it is too large

View file

@ -0,0 +1,361 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "linearspace2.h"
#include "linearspace3.h"
#include "quaternion.h"
#include "bbox.h"
#include "vec4.h"
namespace embree
{
#define VectorT typename L::Vector
#define ScalarT typename L::Vector::Scalar
////////////////////////////////////////////////////////////////////////////////
// Affine Space
////////////////////////////////////////////////////////////////////////////////
template<typename L>
struct AffineSpaceT
{
L l; /*< linear part of affine space */
VectorT p; /*< affine part of affine space */
////////////////////////////////////////////////////////////////////////////////
// Constructors, Assignment, Cast, Copy Operations
////////////////////////////////////////////////////////////////////////////////
__forceinline AffineSpaceT ( ) { }
__forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; }
__forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); }
__forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; }
__forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {}
__forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {}
template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {}
////////////////////////////////////////////////////////////////////////////////
// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {}
__forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {}
/*! return matrix for scaling */
static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); }
/*! return matrix for translation */
static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); }
/*! return matrix for rotation, only in 2D */
static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); }
/*! return matrix for rotation around arbitrary point (2D) or axis (3D) */
static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); }
/*! return matrix for rotation around arbitrary axis and point, only in 3D */
static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); }
/*! return matrix for looking at given point, only in 3D */
static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) {
VectorT Z = normalize(point-eye);
VectorT U = normalize(cross(up,Z));
VectorT V = normalize(cross(Z,U));
return AffineSpaceT(L(U,V,Z),eye);
}
};
// template specialization to get correct identity matrix for type AffineSpace3fa
template<>
__forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {}
////////////////////////////////////////////////////////////////////////////////
// Unary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); }
template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); }
template<typename L> __forceinline AffineSpaceT<L> rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); }
////////////////////////////////////////////////////////////////////////////////
// Binary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); }
template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); }
template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); }
template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); }
template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); }
template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT & b ) { return a * rcp(b); }
template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; }
template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a * b; }
template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; }
template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a / b; }
template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); }
template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); }
template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); }
__forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b)
{
BBox3fa dst = empty;
const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0));
const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1));
const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2));
const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3));
const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4));
const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5));
const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6));
const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7));
return dst;
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; }
template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; }
////////////////////////////////////////////////////////////////////////////////
/// Select
////////////////////////////////////////////////////////////////////////////////
template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) {
return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p));
}
////////////////////////////////////////////////////////////////////////////////
// Output Operators
////////////////////////////////////////////////////////////////////////////////
template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) {
return cout << "{ l = " << m.l << ", p = " << m.p << " }";
}
////////////////////////////////////////////////////////////////////////////////
// Template Instantiations
////////////////////////////////////////////////////////////////////////////////
typedef AffineSpaceT<LinearSpace2f> AffineSpace2f;
typedef AffineSpaceT<LinearSpace3f> AffineSpace3f;
typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa;
typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx;
typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff;
typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f;
template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>;
typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>> AffineSpace3vf4;
typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>> AffineSpace3vf8;
typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16;
template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>;
typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>> AffineSpace3vfa4;
typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>> AffineSpace3vfa8;
typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16;
//////////////////////////////////////////////////////////////////////////////
/// Interpolation
//////////////////////////////////////////////////////////////////////////////
template<typename T, typename R>
__forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0,
const AffineSpaceT<T>& M1,
const R& t)
{
return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t));
}
// slerp interprets the 16 floats of the matrix M = D * R * S as components of
// three matrices (D, R, S) that are interpolated individually.
template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>>
slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0,
const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1,
const T& t)
{
QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
QuaternionT<T> q = slerp(q0, q1, t);
AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t);
AffineSpaceT<LinearSpace3<Vec3<T>>> D(one);
D.p.x = S.l.vx.y;
D.p.y = S.l.vx.z;
D.p.z = S.l.vy.z;
S.l.vx.y = 0;
S.l.vx.z = 0;
S.l.vy.z = 0;
AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q);
return D * R * S;
}
// this is a specialized version for Vec3fa because that does
// not play along nicely with the other templated Vec3/Vec4 types
__forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0,
const AffineSpace3ff& M1,
const float& t)
{
Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
Quaternion3f q = slerp(q0, q1, t);
AffineSpace3fa S = lerp(M0, M1, t);
AffineSpace3fa D(one);
D.p.x = S.l.vx.y;
D.p.y = S.l.vx.z;
D.p.z = S.l.vy.z;
S.l.vx.y = 0;
S.l.vx.z = 0;
S.l.vy.z = 0;
AffineSpace3fa R = LinearSpace3fa(q);
return D * R * S;
}
__forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd)
{
// compute affine transform from quaternion decomposition
Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
AffineSpace3fa M = qd;
AffineSpace3fa D(one);
D.p.x = M.l.vx.y;
D.p.y = M.l.vx.z;
D.p.z = M.l.vy.z;
M.l.vx.y = 0;
M.l.vx.z = 0;
M.l.vy.z = 0;
AffineSpace3fa R = LinearSpace3fa(q);
return D * R * M;
}
__forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S)
{
q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
S = qd;
T.x = qd.l.vx.y;
T.y = qd.l.vx.z;
T.z = qd.l.vy.z;
S.l.vx.y = 0;
S.l.vx.z = 0;
S.l.vy.z = 0;
}
__forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S)
{
AffineSpace3ff M = S;
M.l.vx.w = q.i;
M.l.vy.w = q.j;
M.l.vz.w = q.k;
M.p.w = q.r;
M.l.vx.y = T.x;
M.l.vx.z = T.y;
M.l.vy.z = T.z;
return M;
}
struct __aligned(16) QuaternionDecomposition
{
float scale_x = 1.f;
float scale_y = 1.f;
float scale_z = 1.f;
float skew_xy = 0.f;
float skew_xz = 0.f;
float skew_yz = 0.f;
float shift_x = 0.f;
float shift_y = 0.f;
float shift_z = 0.f;
float quaternion_r = 1.f;
float quaternion_i = 0.f;
float quaternion_j = 0.f;
float quaternion_k = 0.f;
float translation_x = 0.f;
float translation_y = 0.f;
float translation_z = 0.f;
};
__forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M)
{
QuaternionDecomposition qd;
qd.scale_x = M.l.vx.x;
qd.scale_y = M.l.vy.y;
qd.scale_z = M.l.vz.z;
qd.shift_x = M.p.x;
qd.shift_y = M.p.y;
qd.shift_z = M.p.z;
qd.translation_x = M.l.vx.y;
qd.translation_y = M.l.vx.z;
qd.translation_z = M.l.vy.z;
qd.skew_xy = M.l.vy.x;
qd.skew_xz = M.l.vz.x;
qd.skew_yz = M.l.vz.y;
qd.quaternion_r = M.p.w;
qd.quaternion_i = M.l.vx.w;
qd.quaternion_j = M.l.vy.w;
qd.quaternion_k = M.l.vz.w;
return qd;
}
////////////////////////////////////////////////////////////////////////////////
/*
* ! Template Specialization for 2D: return matrix for rotation around point
* (rotation around an arbitrary vector is not meaningful in 2D)
*/
template<> __forceinline
AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) {
return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p);
}
////////////////////////////////////////////////////////////////////////////////
// Similarity Transform
//
// checks if M is a similarity transformation, i.e. if there exists a factor D
// such that for all x,y: distance(Mx, My) = D * distance(x, y)
////////////////////////////////////////////////////////////////////////////////
__forceinline bool similarityTransform(const AffineSpace3fa& M, float* D)
{
if (D) *D = 0.f;
if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false;
if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false;
if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false;
const float D_x = dot(M.l.vx, M.l.vx);
const float D_y = dot(M.l.vy, M.l.vy);
const float D_z = dot(M.l.vz, M.l.vz);
if (abs(D_x - D_y) > 1e-5f ||
abs(D_x - D_z) > 1e-5f ||
abs(D_y - D_z) > 1e-5f)
return false;
if (D) *D = sqrtf(D_x);
return true;
}
__forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr)
{
Vec3fa::storeu(&ptr->l.vx, source.l.vx);
Vec3fa::storeu(&ptr->l.vy, source.l.vy);
Vec3fa::storeu(&ptr->l.vz, source.l.vz);
Vec3fa::storeu(&ptr->p, source.p);
}
__forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr)
{
AffineSpace3fa space;
space.l.vx = Vec3fa::loadu(&ptr->l.vx);
space.l.vy = Vec3fa::loadu(&ptr->l.vy);
space.l.vz = Vec3fa::loadu(&ptr->l.vz);
space.p = Vec3fa::loadu(&ptr->p);
return space;
}
#undef VectorT
#undef ScalarT
}
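A short illustrative sketch (not part of the vendored code) of composing the transforms defined above; it assumes the usual LinearSpace3f scale/rotate helpers from the included linearspace3.h:
// Illustrative sketch, not part of the patch: build a transform and apply it.
inline embree::Vec3f transform_example(const embree::Vec3f& p)
{
  using namespace embree;
  AffineSpace3f M = AffineSpace3f::translate(Vec3f(1.0f, 0.0f, 0.0f))
                  * AffineSpace3f::rotate(Vec3f(0.0f, 1.0f, 0.0f), 0.5f)   // 0.5 rad about +Y
                  * AffineSpace3f::scale(Vec3f(2.0f, 2.0f, 2.0f));
  return xfmPoint(M, p);   // scales, rotates, then translates p
}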

View file

@ -0,0 +1,331 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "vec2.h"
#include "vec3.h"
namespace embree
{
namespace internal {
template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); }
template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; }
template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; }
} // namespace internal
template<typename T>
struct BBox
{
T lower, upper;
////////////////////////////////////////////////////////////////////////////////
/// Construction
////////////////////////////////////////////////////////////////////////////////
__forceinline BBox ( ) { }
template<typename T1>
__forceinline BBox ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {}
__forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; }
__forceinline BBox ( const T& v ) : lower(v), upper(v) {}
__forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {}
////////////////////////////////////////////////////////////////////////////////
/// Extending Bounds
////////////////////////////////////////////////////////////////////////////////
__forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
__forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; }
/*! tests if box is empty */
__forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; }
/*! computes the size of the box */
__forceinline T size() const { return upper - lower; }
/*! computes the center of the box */
__forceinline T center() const { return internal::divideByTwo<T>(lower+upper); }
/*! computes twice the center of the box */
__forceinline T center2() const { return lower+upper; }
/*! merges two boxes */
__forceinline static const BBox merge (const BBox& a, const BBox& b) {
return BBox(min(a.lower, b.lower), max(a.upper, b.upper));
}
/*! enlarge box by some scaling factor */
__forceinline BBox enlarge_by(const float a) const {
return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper));
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
__forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {}
__forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {}
__forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {}
__forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {}
__forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {}
};
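An illustrative sketch (not part of the vendored code) of the box operations above; the Vec3fa constructors used are assumed from the included math headers:
// Illustrative sketch, not part of the patch: grow a box around points and merge.
inline embree::BBox<embree::Vec3fa> bounds_example()
{
  using namespace embree;
  BBox<Vec3fa> b(empty);                 // lower=+inf, upper=-inf
  b.extend(Vec3fa(0.0f, 0.0f, 0.0f));
  b.extend(Vec3fa(1.0f, 2.0f, 3.0f));
  BBox<Vec3fa> c = BBox<Vec3fa>::merge(b, BBox<Vec3fa>(Vec3fa(-1.0f)));
  return c;                              // spans (-1,-1,-1) .. (1,2,3)
}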
template<> __forceinline bool BBox<float>::empty() const {
return lower > upper;
}
#if defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline bool BBox<Vec3fa>::empty() const {
return !all(le_mask(lower,upper));
}
template<> __forceinline bool BBox<Vec3fx>::empty() const {
return !all(le_mask(lower,upper));
}
#endif
/*! tests if box is finite */
__forceinline bool isvalid( const BBox<Vec3fa>& v ) {
return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)));
}
/*! tests if box is finite and non-empty*/
__forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) {
return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper));
}
/*! tests if box has finite entries */
__forceinline bool is_finite( const BBox<Vec3fa>& b) {
return is_finite(b.lower) && is_finite(b.upper);
}
/*! test if point contained in box */
__forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); }
/*! computes the center of the box */
template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; }
template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); }
/*! computes the volume of a bounding box */
__forceinline float volume ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); }
__forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); }
/*! computes the volume of a bounding box */
__forceinline float volume( const BBox<Vec3f>& b ) { return reduce_mul(b.size()); }
/*! computes the surface area of a bounding box */
template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; }
template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); }
template<typename T> __forceinline const T area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); }
__forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); }
__forceinline float area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); }
__forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); }
__forceinline float area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); }
template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); }
template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) {
return halfArea(box);
}
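/* Worked note for the area helpers above (assuming embree's usual definition
   halfArea(d) = d.x*d.y + d.x*d.z + d.y*d.z from vec3.h, which is not shown
   here): a unit cube has size (1,1,1), so halfArea == 3 and area == 6. These
   quantities typically feed the surface area heuristic during BVH builds. */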
/*! merges bounding boxes and points */
template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const T& b ) { return BBox<T>(min(a.lower, b ), max(a.upper, b )); }
template<typename T> __forceinline const BBox<T> merge( const T& a, const BBox<T>& b ) { return BBox<T>(min(a , b.lower), max(a , b.upper)); }
template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); }
/*! Merges three boxes. */
template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); }
/*! Merges four boxes. */
template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) {
return merge(merge(a,b),merge(c,d));
}
/*! Comparison Operators */
template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; }
template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; }
/*! scaling */
template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
template<typename T> __forceinline BBox<T> operator *( const T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
/*! translations */
template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); }
template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); }
template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower+b ,a.upper+b ); }
template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower-b ,a.upper-b ); }
/*! extension */
template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); }
/*! intersect bounding boxes */
template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); }
template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); }
template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); }
/*! subtract bounds from each other */
template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d)
{
c.lower = a.lower;
c.upper = min(a.upper,b.lower);
d.lower = max(a.lower,b.upper);
d.upper = a.upper;
}
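/* Worked sketch of subtract() above (illustrative only, not library code):
   with a = [0,10] and b = [2,5] per component, c becomes the part of a below b
   and d the part above it. Either result may be empty when b extends past the
   corresponding side of a. */
static __forceinline void subtract_sketch()
{
  BBox<float> a(0.0f, 10.0f), b(2.0f, 5.0f), c, d;
  subtract(a, b, c, d); // c = [0, 2], d = [5, 10]
  (void)c; (void)d;
}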
/*! tests if bounding boxes (and points) are disjoint (empty intersection) */
template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); }
template<typename T> __inline bool disjoint( const BBox<T>& a, const T& b ) { return disjoint(a,BBox<T>(b)); }
template<typename T> __inline bool disjoint( const T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); }
/*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */
template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); }
template<typename T> __inline bool conjoint( const BBox<T>& a, const T& b ) { return conjoint(a,BBox<T>(b)); }
template<typename T> __inline bool conjoint( const T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); }
/*! subset relation */
template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b )
{
for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false;
for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false;
return true;
}
template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) {
return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
}
template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) {
return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
}
/*! blending */
template<typename T>
__forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) {
return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t));
}
/*! output operator */
template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) {
return cout << "[" << box.lower << "; " << box.upper << "]";
}
/*! default template instantiations */
typedef BBox<float> BBox1f;
typedef BBox<Vec2f> BBox2f;
typedef BBox<Vec2fa> BBox2fa;
typedef BBox<Vec3f> BBox3f;
typedef BBox<Vec3fa> BBox3fa;
typedef BBox<Vec3fx> BBox3fx;
typedef BBox<Vec3ff> BBox3ff;
}
////////////////////////////////////////////////////////////////////////////////
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
#if defined (__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
#if defined (__AVX__)
#include "../simd/avx.h"
#endif
#if defined(__AVX512F__)
#include "../simd/avx512.h"
#endif
namespace embree
{
template<int N>
__forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds);
template<>
__forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds)
{
BBox<Vec3<vfloat4>> dest;
transpose((vfloat4&)bounds[0].lower,
(vfloat4&)bounds[1].lower,
(vfloat4&)bounds[2].lower,
(vfloat4&)bounds[3].lower,
dest.lower.x,
dest.lower.y,
dest.lower.z);
transpose((vfloat4&)bounds[0].upper,
(vfloat4&)bounds[1].upper,
(vfloat4&)bounds[2].upper,
(vfloat4&)bounds[3].upper,
dest.upper.x,
dest.upper.y,
dest.upper.z);
return dest;
}
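/* The transpose above converts four boxes from array-of-structures layout into
   structure-of-arrays form: dest.lower.x holds the four lower-x values in one
   vfloat4, and likewise for the other components, so all four boxes can be
   tested or merged with a handful of SIMD operations instead of per-box scalar
   code. The 8-wide AVX variant below does the same for eight boxes. */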
#if defined(__AVX__)
template<>
__forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds)
{
BBox<Vec3<vfloat8>> dest;
transpose((vfloat4&)bounds[0].lower,
(vfloat4&)bounds[1].lower,
(vfloat4&)bounds[2].lower,
(vfloat4&)bounds[3].lower,
(vfloat4&)bounds[4].lower,
(vfloat4&)bounds[5].lower,
(vfloat4&)bounds[6].lower,
(vfloat4&)bounds[7].lower,
dest.lower.x,
dest.lower.y,
dest.lower.z);
transpose((vfloat4&)bounds[0].upper,
(vfloat4&)bounds[1].upper,
(vfloat4&)bounds[2].upper,
(vfloat4&)bounds[3].upper,
(vfloat4&)bounds[4].upper,
(vfloat4&)bounds[5].upper,
(vfloat4&)bounds[6].upper,
(vfloat4&)bounds[7].upper,
dest.upper.x,
dest.upper.y,
dest.upper.z);
return dest;
}
#endif
template<int N>
__forceinline BBox3fa merge(const BBox3fa* bounds);
template<>
__forceinline BBox3fa merge<4>(const BBox3fa* bounds)
{
const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower),
min(bounds[2].lower,bounds[3].lower));
const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper),
max(bounds[2].upper,bounds[3].upper));
return BBox3fa(lower,upper);
}
#if defined(__AVX__)
template<>
__forceinline BBox3fa merge<8>(const BBox3fa* bounds)
{
const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)),
min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower)));
const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)),
max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper)));
return BBox3fa(lower,upper);
}
#endif
}

View file

@ -0,0 +1,47 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "math.h"
namespace embree
{
////////////////////////////////////////////////////////////////////////////////
/// RGB Color Class
////////////////////////////////////////////////////////////////////////////////
template<typename T> struct Col3
{
T r, g, b;
////////////////////////////////////////////////////////////////////////////////
/// Construction
////////////////////////////////////////////////////////////////////////////////
__forceinline Col3 ( ) { }
__forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; }
__forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; }
__forceinline explicit Col3 (const T& v) : r(v), g(v), b(v) {}
__forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {}
__forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {}
__forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {}
__forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {}
};
/*! output operator */
template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) {
return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
}
/*! default template instantiations */
typedef Col3<uint8_t > Col3uc;
typedef Col3<float > Col3f;
}

View file

@ -0,0 +1,47 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "math.h"
namespace embree
{
////////////////////////////////////////////////////////////////////////////////
/// RGBA Color Class
////////////////////////////////////////////////////////////////////////////////
template<typename T> struct Col4
{
T r, g, b, a;
////////////////////////////////////////////////////////////////////////////////
/// Construction
////////////////////////////////////////////////////////////////////////////////
__forceinline Col4 ( ) { }
__forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; }
__forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; }
__forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {}
__forceinline Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {}
__forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {}
__forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {}
__forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {}
};
/*! output operator */
template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) {
return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")";
}
/*! default template instantiations */
typedef Col4<uint8_t > Col4uc;
typedef Col4<float > Col4f;
}

View file

@ -0,0 +1,257 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "constants.h"
#include "col3.h"
#include "col4.h"
#include "../simd/sse.h"
namespace embree
{
////////////////////////////////////////////////////////////////////////////////
/// SSE RGBA Color Class
////////////////////////////////////////////////////////////////////////////////
struct Color4
{
union {
__m128 m128;
struct { float r,g,b,a; };
};
////////////////////////////////////////////////////////////////////////////////
/// Construction
////////////////////////////////////////////////////////////////////////////////
__forceinline Color4 () {}
__forceinline Color4 ( const __m128 a ) : m128(a) {}
__forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {}
__forceinline Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {}
__forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }
__forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); }
__forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }
__forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); }
__forceinline Color4 ( const Color4& other ) : m128(other.m128) {}
__forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
__forceinline operator const __m128&() const { return m128; }
__forceinline operator __m128&() { return m128; }
////////////////////////////////////////////////////////////////////////////////
/// Set
////////////////////////////////////////////////////////////////////////////////
__forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
__forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
__forceinline void set(Col3uc& d) const
{
vfloat4 s = clamp(vfloat4(m128))*255.0f;
d.r = (uint8_t)(s[0]);
d.g = (uint8_t)(s[1]);
d.b = (uint8_t)(s[2]);
}
__forceinline void set(Col4uc& d) const
{
vfloat4 s = clamp(vfloat4(m128))*255.0f;
d.r = (uint8_t)(s[0]);
d.g = (uint8_t)(s[1]);
d.b = (uint8_t)(s[2]);
d.a = (uint8_t)(s[3]);
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {}
__forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
__forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
__forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
};
////////////////////////////////////////////////////////////////////////////////
/// SSE RGB Color Class
////////////////////////////////////////////////////////////////////////////////
struct Color
{
union {
__m128 m128;
struct { float r,g,b; };
};
////////////////////////////////////////////////////////////////////////////////
/// Construction
////////////////////////////////////////////////////////////////////////////////
__forceinline Color () {}
__forceinline Color ( const __m128 a ) : m128(a) {}
__forceinline explicit Color (const float v) : m128(_mm_set1_ps(v)) {}
__forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {}
__forceinline Color ( const Color& other ) : m128(other.m128) {}
__forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
__forceinline Color ( const Color4& other ) : m128(other.m128) {}
__forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
__forceinline operator const __m128&() const { return m128; }
__forceinline operator __m128&() { return m128; }
////////////////////////////////////////////////////////////////////////////////
/// Set
////////////////////////////////////////////////////////////////////////////////
__forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
__forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; }
__forceinline void set(Col3uc& d) const
{
vfloat4 s = clamp(vfloat4(m128))*255.0f;
d.r = (uint8_t)(s[0]);
d.g = (uint8_t)(s[1]);
d.b = (uint8_t)(s[2]);
}
__forceinline void set(Col4uc& d) const
{
vfloat4 s = clamp(vfloat4(m128))*255.0f;
d.r = (uint8_t)(s[0]);
d.g = (uint8_t)(s[1]);
d.b = (uint8_t)(s[2]);
d.a = 255;
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {}
__forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
__forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
__forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const Color operator +( const Color& a ) { return a; }
__forceinline const Color operator -( const Color& a ) {
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return _mm_xor_ps(a.m128, mask);
}
__forceinline const Color abs ( const Color& a ) {
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return _mm_and_ps(a.m128, mask);
}
__forceinline const Color rcp ( const Color& a )
{
#if defined(__aarch64__) && defined(BUILD_IOS)
__m128 reciprocal = _mm_rcp_ps(a.m128);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
return (const Color)reciprocal;
#else
#if defined(__AVX512VL__)
const Color r = _mm_rcp14_ps(a.m128);
#else
const Color r = _mm_rcp_ps(a.m128);
#endif
return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
#endif //defined(__aarch64__) && defined(BUILD_IOS)
}
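/* On the SSE path above, the hardware reciprocal estimate r is refined with one
   Newton-Raphson step, r' = 2r - a*r*r, i.e. the (r + r) - (r*r)*a expression
   computed with intrinsics; the AArch64 path instead applies two vrecps
   refinement steps. */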
__forceinline const Color rsqrt( const Color& a )
{
#if defined(__aarch64__) && defined(BUILD_IOS)
__m128 r = _mm_rsqrt_ps(a.m128);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
return r;
#else
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif //defined(__aarch64__) && defined(BUILD_IOS)
}
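/* Likewise, the SSE path of rsqrt() refines the estimate r with one
   Newton-Raphson step for 1/sqrt(a): r' = r*(1.5 - 0.5*a*r*r), written above as
   1.5*r + (a*(-0.5))*r*(r*r). */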
__forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); }
__forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); }
__forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); }
__forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); }
__forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; }
__forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); }
__forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); }
__forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); }
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; }
__forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; }
__forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; }
__forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; }
__forceinline const Color operator*=(Color& a, const float b ) { return a = a * b; }
__forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; }
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
__forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; }
__forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; }
__forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); }
__forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
__forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
__forceinline bool operator < ( const Color& a, const Color& b ) {
if (a.r != b.r) return a.r < b.r;
if (a.g != b.g) return a.g < b.g;
if (a.b != b.b) return a.b < b.b;
return false;
}
////////////////////////////////////////////////////////////////////////////////
/// Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const Color select( bool s, const Color& t, const Color& f ) {
__m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
return blendv_ps(f, t, mask);
}
////////////////////////////////////////////////////////////////////////////////
/// Special Operators
////////////////////////////////////////////////////////////////////////////////
/*! computes luminance of a color */
__forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); }
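/* The weights above are the Rec. 709 luma coefficients, so for example
   luminance(Color(1.f,1.f,1.f)) is approximately 1.0f and pure green maps to
   roughly 0.715f (a worked example, not library code). */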
/*! output operator */
__forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) {
return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
}
}

View file

@ -0,0 +1,61 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#if defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "constants.h"
namespace embree
{
TrueTy True;
FalseTy False;
ZeroTy zero;
OneTy one;
NegInfTy neg_inf;
PosInfTy inf;
PosInfTy pos_inf;
NaNTy nan;
UlpTy ulp;
PiTy pi;
OneOverPiTy one_over_pi;
TwoPiTy two_pi;
OneOverTwoPiTy one_over_two_pi;
FourPiTy four_pi;
OneOverFourPiTy one_over_four_pi;
StepTy step;
ReverseStepTy reverse_step;
EmptyTy empty;
UndefinedTy undefined;
#if defined(__aarch64__)
const uint32x4_t movemask_mask = { 1, 2, 4, 8 };
const uint32x4_t vzero = { 0, 0, 0, 0 };
const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 };
const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF };
const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11};
const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15};
const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f };
const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f };
const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY };
const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
#endif
}

View file

@ -0,0 +1,239 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../sys/platform.h"
#include <limits>
#define _USE_MATH_DEFINES
#include <math.h> // using cmath causes issues under Windows
#include <cfloat>
#include <climits>
// Math constants may not be defined in libcxx + mingw + strict C++ standard
#if defined(__MINGW32__)
// TODO(LTE): use constexpr
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#ifndef M_1_PI
#define M_1_PI 0.31830988618379067154
#endif
#endif // __MINGW32__
namespace embree
{
static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;
static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the Newton-Raphson rcp calculation does not fail
/* we consider floating point numbers in that range as valid input numbers */
static MAYBE_UNUSED float FLT_LARGE = 1.844E18f;
struct TrueTy {
__forceinline operator bool( ) const { return true; }
};
extern MAYBE_UNUSED TrueTy True;
struct FalseTy {
__forceinline operator bool( ) const { return false; }
};
extern MAYBE_UNUSED FalseTy False;
struct ZeroTy
{
__forceinline operator double ( ) const { return 0; }
__forceinline operator float ( ) const { return 0; }
__forceinline operator long long( ) const { return 0; }
__forceinline operator unsigned long long( ) const { return 0; }
__forceinline operator long ( ) const { return 0; }
__forceinline operator unsigned long ( ) const { return 0; }
__forceinline operator int ( ) const { return 0; }
__forceinline operator unsigned int ( ) const { return 0; }
__forceinline operator short ( ) const { return 0; }
__forceinline operator unsigned short ( ) const { return 0; }
__forceinline operator int8_t ( ) const { return 0; }
__forceinline operator uint8_t ( ) const { return 0; }
};
extern MAYBE_UNUSED ZeroTy zero;
struct OneTy
{
__forceinline operator double ( ) const { return 1; }
__forceinline operator float ( ) const { return 1; }
__forceinline operator long long( ) const { return 1; }
__forceinline operator unsigned long long( ) const { return 1; }
__forceinline operator long ( ) const { return 1; }
__forceinline operator unsigned long ( ) const { return 1; }
__forceinline operator int ( ) const { return 1; }
__forceinline operator unsigned int ( ) const { return 1; }
__forceinline operator short ( ) const { return 1; }
__forceinline operator unsigned short ( ) const { return 1; }
__forceinline operator int8_t ( ) const { return 1; }
__forceinline operator uint8_t ( ) const { return 1; }
};
extern MAYBE_UNUSED OneTy one;
struct NegInfTy
{
__forceinline operator double ( ) const { return -std::numeric_limits<double>::infinity(); }
__forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); }
__forceinline operator long long( ) const { return std::numeric_limits<long long>::min(); }
__forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); }
__forceinline operator long ( ) const { return std::numeric_limits<long>::min(); }
__forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::min(); }
__forceinline operator int ( ) const { return std::numeric_limits<int>::min(); }
__forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::min(); }
__forceinline operator short ( ) const { return std::numeric_limits<short>::min(); }
__forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::min(); }
__forceinline operator int8_t ( ) const { return std::numeric_limits<int8_t>::min(); }
__forceinline operator uint8_t ( ) const { return std::numeric_limits<uint8_t>::min(); }
};
extern MAYBE_UNUSED NegInfTy neg_inf;
struct PosInfTy
{
__forceinline operator double ( ) const { return std::numeric_limits<double>::infinity(); }
__forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); }
__forceinline operator long long( ) const { return std::numeric_limits<long long>::max(); }
__forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); }
__forceinline operator long ( ) const { return std::numeric_limits<long>::max(); }
__forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::max(); }
__forceinline operator int ( ) const { return std::numeric_limits<int>::max(); }
__forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::max(); }
__forceinline operator short ( ) const { return std::numeric_limits<short>::max(); }
__forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::max(); }
__forceinline operator int8_t ( ) const { return std::numeric_limits<int8_t>::max(); }
__forceinline operator uint8_t ( ) const { return std::numeric_limits<uint8_t>::max(); }
};
extern MAYBE_UNUSED PosInfTy inf;
extern MAYBE_UNUSED PosInfTy pos_inf;
struct NaNTy
{
__forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); }
__forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
};
extern MAYBE_UNUSED NaNTy nan;
struct UlpTy
{
__forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
__forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
};
extern MAYBE_UNUSED UlpTy ulp;
struct PiTy
{
__forceinline operator double( ) const { return double(M_PI); }
__forceinline operator float ( ) const { return float(M_PI); }
};
extern MAYBE_UNUSED PiTy pi;
struct OneOverPiTy
{
__forceinline operator double( ) const { return double(M_1_PI); }
__forceinline operator float ( ) const { return float(M_1_PI); }
};
extern MAYBE_UNUSED OneOverPiTy one_over_pi;
struct TwoPiTy
{
__forceinline operator double( ) const { return double(2.0*M_PI); }
__forceinline operator float ( ) const { return float(2.0*M_PI); }
};
extern MAYBE_UNUSED TwoPiTy two_pi;
struct OneOverTwoPiTy
{
__forceinline operator double( ) const { return double(0.5*M_1_PI); }
__forceinline operator float ( ) const { return float(0.5*M_1_PI); }
};
extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
struct FourPiTy
{
__forceinline operator double( ) const { return double(4.0*M_PI); }
__forceinline operator float ( ) const { return float(4.0*M_PI); }
};
extern MAYBE_UNUSED FourPiTy four_pi;
struct OneOverFourPiTy
{
__forceinline operator double( ) const { return double(0.25*M_1_PI); }
__forceinline operator float ( ) const { return float(0.25*M_1_PI); }
};
extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
struct StepTy {
};
extern MAYBE_UNUSED StepTy step;
struct ReverseStepTy {
};
extern MAYBE_UNUSED ReverseStepTy reverse_step;
struct EmptyTy {
};
extern MAYBE_UNUSED EmptyTy empty;
struct FullTy {
};
extern MAYBE_UNUSED FullTy full;
struct UndefinedTy {
};
extern MAYBE_UNUSED UndefinedTy undefined;
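/* Minimal sketch of how these tag types are used (illustrative only, not part
   of the library): each one converts implicitly to the requested arithmetic
   type, and the math classes elsewhere in this patch (BBox, Color, Interval,
   ...) provide matching constructors such as BBox(EmptyTy) or Color(OneTy). */
static __forceinline void constant_tags_sketch()
{
  float  f = zero;     // 0.0f via ZeroTy::operator float()
  double d = pos_inf;  // +infinity via PosInfTy::operator double()
  int    i = neg_inf;  // INT_MIN via NegInfTy::operator int()
  (void)f; (void)d; (void)i;
}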
#if defined(__aarch64__)
extern const uint32x4_t movemask_mask;
extern const uint32x4_t vzero;
extern const uint32x4_t v0x80000000;
extern const uint32x4_t v0x7fffffff;
extern const uint32x4_t v000F;
extern const uint32x4_t v00F0;
extern const uint32x4_t v00FF;
extern const uint32x4_t v0F00;
extern const uint32x4_t v0F0F;
extern const uint32x4_t v0FF0;
extern const uint32x4_t v0FFF;
extern const uint32x4_t vF000;
extern const uint32x4_t vF00F;
extern const uint32x4_t vF0F0;
extern const uint32x4_t vF0FF;
extern const uint32x4_t vFF00;
extern const uint32x4_t vFF0F;
extern const uint32x4_t vFFF0;
extern const uint32x4_t vFFFF;
extern const uint8x16_t v0022;
extern const uint8x16_t v1133;
extern const uint8x16_t v0101;
extern const float32x4_t vOne;
extern const float32x4_t vmOne;
extern const float32x4_t vInf;
extern const float32x4_t vmInf;
#endif
}

View file

@ -0,0 +1,161 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "vec2.h"
#include "vec3.h"
#include "bbox.h"
namespace embree
{
template<typename V>
struct Interval
{
V lower, upper;
__forceinline Interval() {}
__forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; }
__forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; }
__forceinline Interval(const V& a) : lower(a), upper(a) {}
__forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {}
__forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {}
/*! tests if box is empty */
//__forceinline bool empty() const { return lower > upper; }
/*! computes the size of the interval */
__forceinline V size() const { return upper - lower; }
__forceinline V center() const { return 0.5f*(lower+upper); }
__forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
__forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; }
__forceinline friend Interval operator +( const Interval& a, const Interval& b ) {
return Interval(a.lower+b.lower,a.upper+b.upper);
}
__forceinline friend Interval operator -( const Interval& a, const Interval& b ) {
return Interval(a.lower-b.upper,a.upper-b.lower);
}
__forceinline friend Interval operator -( const Interval& a, const V& b ) {
return Interval(a.lower-b,a.upper-b);
}
__forceinline friend Interval operator *( const Interval& a, const Interval& b )
{
const V ll = a.lower*b.lower;
const V lu = a.lower*b.upper;
const V ul = a.upper*b.lower;
const V uu = a.upper*b.upper;
return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu));
}
__forceinline friend Interval merge( const Interval& a, const Interval& b) {
return Interval(min(a.lower,b.lower),max(a.upper,b.upper));
}
__forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) {
return merge(merge(a,b),c);
}
__forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) {
return merge(merge(a,b),merge(c,d));
}
/*! intersect bounding boxes */
__forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); }
__forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); }
__forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); }
friend embree_ostream operator<<(embree_ostream cout, const Interval& a) {
return cout << "[" << a.lower << ", " << a.upper << "]";
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
__forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {}
};
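/* Worked sketch of the interval arithmetic above (illustrative only, not
   library code): multiplying [-1,2] by [3,4] takes the min and max of the four
   endpoint products {-3,-4,6,8}, yielding [-4,8]. */
static __forceinline Interval<float> interval_mul_sketch() {
  return Interval<float>(-1.0f, 2.0f) * Interval<float>(3.0f, 4.0f); // [-4, 8]
}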
__forceinline bool isEmpty(const Interval<float>& v) {
return v.lower > v.upper;
}
__forceinline vboolx isEmpty(const Interval<vfloatx>& v) {
return v.lower > v.upper;
}
/*! subset relation */
template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) {
return (a.lower > b.lower) && (a.upper < b.upper);
}
template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
return subset(a.x,b.x) && subset(a.y,b.y);
}
template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y));
}
////////////////////////////////////////////////////////////////////////////////
/// Select
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) {
return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper));
}
template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) {
return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper));
}
__forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1)
{
float eps = 1E-4f;
bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps;
bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps;
return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1);
}
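/* numRoots() above reports whether the span between the two value intervals
   can cross zero (within eps): e.g. p0 = [-2,-1], p1 = [0.5,3] satisfies
   neg0 && pos1, so the result is 1, while p0 = [1,2], p1 = [3,4] gives 0. */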
typedef Interval<float> Interval1f;
typedef Vec2<Interval<float>> Interval2f;
typedef Vec3<Interval<float>> Interval3f;
inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; }
inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); }
#define TWO_PI (2.0*M_PI)
inline Interval1f sin(Interval1f interval)
{
if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
float sinLower = sin(interval.lower);
float sinUpper = sin(interval.upper);
if (sinLower > sinUpper) swap(sinLower, sinUpper);
if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0;
if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0;
return Interval1f(sinLower, sinUpper);
}
inline Interval1f cos(Interval1f interval)
{
if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
float cosLower = cos(interval.lower);
float cosUpper = cos(interval.upper);
if (cosLower > cosUpper) swap(cosLower, cosUpper);
if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0;
return Interval1f(cosLower, cosUpper);
}
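/* Both interval routines above first handle the conservative case (any interval
   at least pi wide covers the full range [-1,1]), then shift the interval by
   multiples of 2*pi into a canonical range, and finally widen the endpoint
   values to include the interior extrema at pi/2 and 3*pi/2 for sin, or at pi
   for cos. */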
#undef TWO_PI
}

View file

@ -0,0 +1,289 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "bbox.h"
#include "range.h"
namespace embree
{
template<typename T>
__forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt)
{
const float rcp_dt_size = float(1.0f)/dt.size();
const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size);
const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size);
return std::make_pair(g0,g1);
}
template<typename T>
struct LBBox
{
public:
__forceinline LBBox () {}
template<typename T1>
__forceinline LBBox ( const LBBox<T1>& other )
: bounds0(other.bounds0), bounds1(other.bounds1) {}
__forceinline LBBox& operator= ( const LBBox& other ) {
bounds0 = other.bounds0; bounds1 = other.bounds1; return *this;
}
__forceinline LBBox (EmptyTy)
: bounds0(EmptyTy()), bounds1(EmptyTy()) {}
__forceinline explicit LBBox ( const BBox<T>& bounds)
: bounds0(bounds), bounds1(bounds) { }
__forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1)
: bounds0(bounds0), bounds1(bounds1) { }
LBBox ( const avector<BBox<T>>& bounds )
{
assert(bounds.size());
BBox<T> b0 = bounds.front();
BBox<T> b1 = bounds.back();
for (size_t i=1; i<bounds.size()-1; i++) {
const float f = float(i)/float(bounds.size()-1);
const BBox<T> bt = lerp(b0,b1,f);
const T dlower = min(bounds[i].lower-bt.lower,T(zero));
const T dupper = max(bounds[i].upper-bt.upper,T(zero));
b0.lower += dlower; b1.lower += dlower;
b0.upper += dupper; b1.upper += dupper;
}
bounds0 = b0;
bounds1 = b1;
}
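/* The constructor above (and the time-range variants below) all follow the same
   scheme: start from the first and last box, and for every intermediate box
   grow both endpoint boxes by however much the linearly interpolated box at
   that time misses it. The resulting pair (bounds0, bounds1) then conservatively
   encloses the primitive over the whole interval when lerped. */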
/*! calculates the linear bounds of a primitive for the specified time range */
template<typename BoundsFunc>
__forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments)
{
const float lower = time_range.lower*numTimeSegments;
const float upper = time_range.upper*numTimeSegments;
const float ilowerf = floor(lower);
const float iupperf = ceil(upper);
const int ilower = (int)ilowerf;
const int iupper = (int)iupperf;
const BBox<T> blower0 = bounds(ilower);
const BBox<T> bupper1 = bounds(iupper);
if (iupper-ilower == 1) {
bounds0 = lerp(blower0, bupper1, lower-ilowerf);
bounds1 = lerp(bupper1, blower0, iupperf-upper);
return;
}
const BBox<T> blower1 = bounds(ilower+1);
const BBox<T> bupper0 = bounds(iupper-1);
BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf);
BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper);
for (int i = ilower+1; i < iupper; i++)
{
const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size();
const BBox<T> bt = lerp(b0, b1, f);
const BBox<T> bi = bounds(i);
const T dlower = min(bi.lower-bt.lower, T(zero));
const T dupper = max(bi.upper-bt.upper, T(zero));
b0.lower += dlower; b1.lower += dlower;
b0.upper += dupper; b1.upper += dupper;
}
bounds0 = b0;
bounds1 = b1;
}
/*! calculates the linear bounds of a primitive for the specified time range */
template<typename BoundsFunc>
__forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments)
{
/* normalize global time_range_in to local geom_time_range */
const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(),
(time_range_in.upper-geom_time_range.lower)/geom_time_range.size());
const float lower = time_range.lower*geom_time_segments;
const float upper = time_range.upper*geom_time_segments;
const float ilowerf = floor(lower);
const float iupperf = ceil(upper);
const float ilowerfc = max(0.0f,ilowerf);
const float iupperfc = min(iupperf,geom_time_segments);
const int ilowerc = (int)ilowerfc;
const int iupperc = (int)iupperfc;
assert(iupperc-ilowerc > 0);
/* this larger iteration range guarantees that the borders of geom_time_range get processed when geom_time_range lies (partially) inside time_range_in */
const int ilower_iter = max(-1,(int)ilowerf);
const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1);
const BBox<T> blower0 = bounds(ilowerc);
const BBox<T> bupper1 = bounds(iupperc);
if (iupper_iter-ilower_iter == 1) {
bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc));
bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper));
return;
}
const BBox<T> blower1 = bounds(ilowerc+1);
const BBox<T> bupper0 = bounds(iupperc-1);
BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc));
BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper));
for (int i = ilower_iter+1; i < iupper_iter; i++)
{
const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size();
const BBox<T> bt = lerp(b0, b1, f);
const BBox<T> bi = bounds(i);
const T dlower = min(bi.lower-bt.lower, T(zero));
const T dupper = max(bi.upper-bt.upper, T(zero));
b0.lower += dlower; b1.lower += dlower;
b0.upper += dupper; b1.upper += dupper;
}
bounds0 = b0;
bounds1 = b1;
}
/*! calculates the linear bounds of a primitive for the specified time range */
template<typename BoundsFunc>
__forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments)
{
const int ilower = time_range.begin();
const int iupper = time_range.end();
BBox<T> b0 = bounds(ilower);
BBox<T> b1 = bounds(iupper);
if (iupper-ilower == 1)
{
bounds0 = b0;
bounds1 = b1;
return;
}
for (int i = ilower+1; i<iupper; i++)
{
const float f = float(i - time_range.begin()) / float(time_range.size());
const BBox<T> bt = lerp(b0, b1, f);
const BBox<T> bi = bounds(i);
const T dlower = min(bi.lower-bt.lower, T(zero));
const T dupper = max(bi.upper-bt.upper, T(zero));
b0.lower += dlower; b1.lower += dlower;
b0.upper += dupper; b1.upper += dupper;
}
bounds0 = b0;
bounds1 = b1;
}
public:
__forceinline bool empty() const {
return bounds().empty();
}
__forceinline BBox<T> bounds () const {
return merge(bounds0,bounds1);
}
__forceinline BBox<T> interpolate( const float t ) const {
return lerp(bounds0,bounds1,t);
}
__forceinline LBBox<T> interpolate( const BBox1f& dt ) const {
return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper));
}
__forceinline void extend( const LBBox& other ) {
bounds0.extend(other.bounds0);
bounds1.extend(other.bounds1);
}
__forceinline float expectedHalfArea() const;
__forceinline float expectedHalfArea(const BBox1f& dt) const {
return interpolate(dt).expectedHalfArea();
}
__forceinline float expectedApproxHalfArea() const {
return 0.5f*(halfArea(bounds0) + halfArea(bounds1));
}
/* calculates bounds for [0,1] time range from bounds in dt time range */
__forceinline LBBox global(const BBox1f& dt) const
{
const float rcp_dt_size = 1.0f/dt.size();
const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size);
const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size);
return LBBox(b0,b1);
}
/*! Comparison Operators */
//template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
//template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
/*! output operator */
friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) {
return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }";
}
public:
BBox<T> bounds0, bounds1;
};
/*! tests if box is finite */
template<typename T>
__forceinline bool isvalid( const LBBox<T>& v ) {
return isvalid(v.bounds0) && isvalid(v.bounds1);
}
template<typename T>
__forceinline bool isvalid_non_empty( const LBBox<T>& v ) {
return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1);
}
template<typename T>
__forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1)
{
const T da = a1-a0;
const T db = b1-b0;
return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f);
}
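/* expectedArea() is the time average of the product of two linearly
   interpolated edge lengths: integrating (a0 + t*da)*(b0 + t*db) over t in
   [0,1] gives a0*b0 + (a0*db + da*b0)/2 + da*db/3, which is the expression
   returned above. The Vec3fa specialization below sums this over the three
   face orientations to obtain the expected half area of the moving box. */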
template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const
{
const Vec3fa d0 = bounds0.size();
const Vec3fa d1 = bounds1.size();
return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z),
Vec3fa(d1.x,d1.y,d1.z),
Vec3fa(d0.y,d0.z,d0.x),
Vec3fa(d1.y,d1.z,d1.x)));
}
template<typename T>
__forceinline float expectedApproxHalfArea(const LBBox<T>& box) {
return box.expectedApproxHalfArea();
}
template<typename T>
__forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) {
return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1));
}
/*! subset relation */
template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) {
return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1);
}
/*! default template instantiations */
typedef LBBox<float> LBBox1f;
typedef LBBox<Vec2f> LBBox2f;
typedef LBBox<Vec3f> LBBox3f;
typedef LBBox<Vec3fa> LBBox3fa;
typedef LBBox<Vec3fx> LBBox3fx;
}

View file

@ -0,0 +1,148 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "vec2.h"
namespace embree
{
////////////////////////////////////////////////////////////////////////////////
/// 2D Linear Transform (2x2 Matrix)
////////////////////////////////////////////////////////////////////////////////
template<typename T> struct LinearSpace2
{
typedef T Vector;
typedef typename T::Scalar Scalar;
/*! default matrix constructor */
__forceinline LinearSpace2 ( ) {}
__forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; }
__forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; }
template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {}
/*! matrix construction from column vectors */
__forceinline LinearSpace2(const Vector& vx, const Vector& vy)
: vx(vx), vy(vy) {}
/*! matrix construction from row major data */
__forceinline LinearSpace2(const Scalar& m00, const Scalar& m01,
const Scalar& m10, const Scalar& m11)
: vx(m00,m10), vy(m01,m11) {}
/*! compute the determinant of the matrix */
__forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; }
/*! compute adjoint matrix */
__forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); }
/*! compute inverse matrix */
__forceinline const LinearSpace2 inverse() const { return adjoint()/det(); }
/*! compute transposed matrix */
__forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); }
/*! returns first row of matrix */
__forceinline Vector row0() const { return Vector(vx.x,vy.x); }
/*! returns second row of matrix */
__forceinline Vector row1() const { return Vector(vx.y,vy.y); }
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {}
__forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {}
/*! return matrix for scaling */
static __forceinline LinearSpace2 scale(const Vector& s) {
return LinearSpace2(s.x, 0,
0 , s.y);
}
/*! return matrix for rotation */
static __forceinline LinearSpace2 rotate(const Scalar& r) {
Scalar s = sin(r), c = cos(r);
return LinearSpace2(c, -s,
s, c);
}
/*! return closest orthogonal matrix (i.e. a general rotation including reflection) */
LinearSpace2 orthogonal() const
{
LinearSpace2 m = *this;
// mirrored?
Scalar mirror(one);
if (m.det() < Scalar(zero)) {
m.vx = -m.vx;
mirror = -mirror;
}
// rotation
for (int i = 0; i < 99; i++) {
const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse());
const LinearSpace2 d = m_next - m;
m = m_next;
// norm^2 of difference small enough?
if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8)
break;
}
// rotation * mirror_x
return LinearSpace2(mirror*m.vx, m.vy);
}
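/* The loop above is the classic polar-decomposition iteration
   m <- 0.5*(m + transpose(inverse(m))), which converges to the orthogonal
   factor of m; the mirror flag re-applies a reflection afterwards if the
   original determinant was negative, so the result is the closest orthogonal
   matrix. */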
public:
/*! the column vectors of the matrix */
Vector vx,vy;
};
////////////////////////////////////////////////////////////////////////////////
// Unary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); }
template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); }
template<typename T> __forceinline LinearSpace2<T> rcp ( const LinearSpace2<T>& a ) { return a.inverse(); }
////////////////////////////////////////////////////////////////////////////////
// Binary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); }
template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); }
template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
template<typename T> __forceinline T operator*(const LinearSpace2<T>& a, const T & b) { return b.x*a.vx + b.y*a.vy; }
template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); }
template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); }
template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; }
template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; }
template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; }
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) {
return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}";
}
/*! Shortcuts for common linear spaces. */
typedef LinearSpace2<Vec2f> LinearSpace2f;
typedef LinearSpace2<Vec2fa> LinearSpace2fa;
}

View file

@ -0,0 +1,213 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "vec3.h"
#include "quaternion.h"
namespace embree
{
////////////////////////////////////////////////////////////////////////////////
/// 3D Linear Transform (3x3 Matrix)
////////////////////////////////////////////////////////////////////////////////
template<typename T> struct LinearSpace3
{
typedef T Vector;
typedef typename T::Scalar Scalar;
/*! default matrix constructor */
__forceinline LinearSpace3 ( ) {}
__forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; }
__forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; }
template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {}
/*! matrix construction from column vectors */
__forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz)
: vx(vx), vy(vy), vz(vz) {}
/*! construction from quaternion */
__forceinline LinearSpace3( const QuaternionT<Scalar>& q )
: vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j))
, vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i))
, vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {}
/*! matrix construction from row major data */
__forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02,
const Scalar& m10, const Scalar& m11, const Scalar& m12,
const Scalar& m20, const Scalar& m21, const Scalar& m22)
: vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {}
/*! compute the determinant of the matrix */
__forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); }
/*! compute adjoint matrix */
__forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); }
/*! compute inverse matrix */
__forceinline const LinearSpace3 inverse() const { return adjoint()/det(); }
/*! compute transposed matrix */
__forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); }
/*! returns first row of matrix */
__forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); }
/*! returns second row of matrix */
__forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); }
/*! returns third row of matrix */
__forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); }
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {}
__forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {}
/*! return matrix for scaling */
static __forceinline LinearSpace3 scale(const Vector& s) {
return LinearSpace3(s.x, 0, 0,
0 , s.y, 0,
0 , 0, s.z);
}
/*! return matrix for rotation around arbitrary axis */
static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) {
Vector u = normalize(_u);
Scalar s = sin(r), c = cos(r);
return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s,
u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s,
u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c);
}
public:
/*! the column vectors of the matrix */
Vector vx,vy,vz;
};
/*! compute transposed matrix */
template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const {
vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz));
}
template<typename T>
__forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) {
return xfm.transposed();
}
////////////////////////////////////////////////////////////////////////////////
// Unary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); }
template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); }
template<typename T> __forceinline LinearSpace3<T> rcp ( const LinearSpace3<T>& a ) { return a.inverse(); }
/* constructs a coordinate frame from a normalized normal */
template<typename T> __forceinline LinearSpace3<T> frame(const T& N)
{
const T dx0(0,N.z,-N.y);
const T dx1(-N.z,0,N.x);
const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1));
const T dy = normalize(cross(N,dx));
return LinearSpace3<T>(dx,dy,N);
}
/* constructs a coordinate frame from a normal and approximate x-direction */
template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi)
{
if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are nearly parallel
const T dx = normalize(cross(dxi,N));
const T dy = normalize(cross(N,dx));
return LinearSpace3<T>(dx,dy,N);
}
/* clamps linear space to range -1 to +1 */
template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) {
return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)),
clamp(space.vy,T(-1.0f),T(1.0f)),
clamp(space.vz,T(-1.0f),T(1.0f)));
}
////////////////////////////////////////////////////////////////////////////////
// Binary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); }
template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); }
template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); }
template<typename T> __forceinline T operator*(const LinearSpace3<T>& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); }
template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); }
template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); }
template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); }
template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; }
template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; }
template<typename T> __forceinline T xfmPoint (const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); }
template<typename T> __forceinline T xfmVector(const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); }
template<typename T> __forceinline T xfmNormal(const LinearSpace3<T>& s, const T & a) { return xfmVector(s.inverse().transposed(),a); }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; }
template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; }
////////////////////////////////////////////////////////////////////////////////
/// Select
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) {
return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz));
}
/*! blending */
template<typename T>
__forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t)
{
return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
lerp(l0.vy,l1.vy,t),
lerp(l0.vz,l1.vz,t));
}
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) {
return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}";
}
/*! Shortcuts for common linear spaces. */
typedef LinearSpace3<Vec3f> LinearSpace3f;
typedef LinearSpace3<Vec3fa> LinearSpace3fa;
typedef LinearSpace3<Vec3fx> LinearSpace3fx;
typedef LinearSpace3<Vec3ff> LinearSpace3ff;
template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>;
typedef LinearSpace3<Vec3<vfloat<4>>> LinearSpace3vf4;
typedef LinearSpace3<Vec3<vfloat<8>>> LinearSpace3vf8;
typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16;
/*! blending */
template<typename T, typename S>
__forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0,
const LinearSpace3<T>& l1,
const S& t)
{
return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
lerp(l0.vy,l1.vy,t),
lerp(l0.vz,l1.vz,t));
}
}
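A short usage sketch of the LinearSpace3 API above, using only the constructors and free functions declared in this header; the include name matches the one used by obbox.h below, and Vec3f is taken from the vec3.h include at the top of the file:
#include "linearspace3.h"

void linearspace3_example()
{
  using namespace embree;
  // 90-degree rotation around the Y axis, stored as column vectors.
  const LinearSpace3f R = LinearSpace3f::rotate(Vec3f(0.0f, 1.0f, 0.0f), 1.5707964f);
  // Transform a point and a normal with the helpers above.
  const Vec3f p = xfmPoint (R, Vec3f(1.0f, 0.0f, 0.0f));   // ~(0, 0, -1)
  const Vec3f n = xfmNormal(R, Vec3f(1.0f, 0.0f, 0.0f));   // same result for a pure rotation
  // R * R.inverse() is (numerically) the identity, LinearSpace3f(one).
  const LinearSpace3f I = R * R.inverse();
  (void)p; (void)n; (void)I;
}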

View file

@ -0,0 +1,451 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../sys/platform.h"
#include "../sys/intrinsics.h"
#include "constants.h"
#include <cmath>
#if defined(__ARM_NEON)
#include "SSE2NEON.h"
#if defined(NEON_AVX2_EMULATION)
#include "AVX2NEON.h"
#endif
#else
#include <emmintrin.h>
#include <xmmintrin.h>
#include <immintrin.h>
#endif
#if defined(__WIN32__) && !defined(__MINGW32__)
#if (_MSC_VER <= 1700)
namespace std
{
__forceinline bool isinf ( const float x ) { return _finite(x) == 0; }
__forceinline bool isnan ( const float x ) { return _isnan(x) != 0; }
__forceinline bool isfinite (const float x) { return _finite(x) != 0; }
}
#endif
#endif
namespace embree
{
__forceinline bool isvalid ( const float& v ) {
return (v > -FLT_LARGE) & (v < +FLT_LARGE);
}
__forceinline int cast_f2i(float f) {
union { float f; int i; } v; v.f = f; return v.i;
}
__forceinline float cast_i2f(int i) {
union { float f; int i; } v; v.i = i; return v.f;
}
__forceinline int toInt (const float& a) { return int(a); }
__forceinline float toFloat(const int& a) { return float(a); }
#if defined(__WIN32__) && !defined(__MINGW32__)
__forceinline bool finite ( const float x ) { return _finite(x) != 0; }
#endif
__forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; }
__forceinline float sqr ( const float x ) { return x*x; }
__forceinline float rcp ( const float x )
{
#if defined(__aarch64__)
// Move scalar to vector register and do rcp.
__m128 a;
a[0] = x;
float32x4_t reciprocal = vrecpeq_f32(a);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
return reciprocal[0];
#else
const __m128 a = _mm_set_ss(x);
#if defined(__AVX512VL__)
const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a);
#else
const __m128 r = _mm_rcp_ss(a);
#endif
#if defined(__AVX2__)
return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
#else
return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
#endif
#endif //defined(__aarch64__)
}
__forceinline float signmsk ( const float x ) {
#if defined(__aarch64__)
// FP and NEON share the same vector registers on arm64
__m128 a;
__m128i b;
a[0] = x;
b[0] = 0x80000000;
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
return a[0];
#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
#endif
}
__forceinline float xorf( const float x, const float y ) {
#if defined(__aarch64__)
// FP and NEON share the same vector registers on arm64
__m128 a;
__m128 b;
a[0] = x;
b[0] = y;
a = _mm_xor_ps(a, b);
return a[0];
#else
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
#endif
}
__forceinline float andf( const float x, const unsigned y ) {
#if defined(__aarch64__)
// FP and NEON share the same vector registers on arm64
__m128 a;
__m128i b;
a[0] = x;
b[0] = y;
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
return a[0];
#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
#endif
}
__forceinline float rsqrt( const float x )
{
#if defined(__aarch64__)
// FP and NEON share the same vector registers on arm64
__m128 a;
a[0] = x;
__m128 value = _mm_rsqrt_ps(a);
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
return value[0];
#else
const __m128 a = _mm_set_ss(x);
#if defined(__AVX512VL__)
const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
#else
const __m128 r = _mm_rsqrt_ss(a);
#endif
const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
_mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
return _mm_cvtss_f32(c);
#endif
}
#if defined(__WIN32__) && (_MSC_VER <= 1700) && !defined(__MINGW32__)
__forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
__forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
__forceinline int roundf(float f) { return (int)(f + 0.5f); }
#else
__forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); }
__forceinline double nextafter(double x, double y) { return ::nextafter(x, y); }
#endif
__forceinline float abs ( const float x ) { return ::fabsf(x); }
__forceinline float acos ( const float x ) { return ::acosf (x); }
__forceinline float asin ( const float x ) { return ::asinf (x); }
__forceinline float atan ( const float x ) { return ::atanf (x); }
__forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); }
__forceinline float cos ( const float x ) { return ::cosf (x); }
__forceinline float cosh ( const float x ) { return ::coshf (x); }
__forceinline float exp ( const float x ) { return ::expf (x); }
__forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); }
__forceinline float log ( const float x ) { return ::logf (x); }
__forceinline float log10( const float x ) { return ::log10f(x); }
__forceinline float pow ( const float x, const float y ) { return ::powf (x, y); }
__forceinline float sin ( const float x ) { return ::sinf (x); }
__forceinline float sinh ( const float x ) { return ::sinhf (x); }
__forceinline float sqrt ( const float x ) { return ::sqrtf (x); }
__forceinline float tan ( const float x ) { return ::tanf (x); }
__forceinline float tanh ( const float x ) { return ::tanhf (x); }
__forceinline float floor( const float x ) { return ::floorf (x); }
__forceinline float ceil ( const float x ) { return ::ceilf (x); }
__forceinline float frac ( const float x ) { return x-floor(x); }
__forceinline double abs ( const double x ) { return ::fabs(x); }
__forceinline double sign ( const double x ) { return x<0?-1.0:1.0; }
__forceinline double acos ( const double x ) { return ::acos (x); }
__forceinline double asin ( const double x ) { return ::asin (x); }
__forceinline double atan ( const double x ) { return ::atan (x); }
__forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
__forceinline double cos ( const double x ) { return ::cos (x); }
__forceinline double cosh ( const double x ) { return ::cosh (x); }
__forceinline double exp ( const double x ) { return ::exp (x); }
__forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
__forceinline double log ( const double x ) { return ::log (x); }
__forceinline double log10( const double x ) { return ::log10(x); }
__forceinline double pow ( const double x, const double y ) { return ::pow (x, y); }
__forceinline double rcp ( const double x ) { return 1.0/x; }
__forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); }
__forceinline double sin ( const double x ) { return ::sin (x); }
__forceinline double sinh ( const double x ) { return ::sinh (x); }
__forceinline double sqr ( const double x ) { return x*x; }
__forceinline double sqrt ( const double x ) { return ::sqrt (x); }
__forceinline double tan ( const double x ) { return ::tan (x); }
__forceinline double tanh ( const double x ) { return ::tanh (x); }
__forceinline double floor( const double x ) { return ::floor (x); }
__forceinline double ceil ( const double x ) { return ::ceil (x); }
#if defined(__aarch64__)
__forceinline float mini(float a, float b) {
// FP and NEON share the same vector registers on arm64
__m128 x;
__m128 y;
x[0] = a;
y[0] = b;
x = _mm_min_ps(x, y);
return x[0];
}
#elif defined(__SSE4_1__)
__forceinline float mini(float a, float b) {
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
const __m128i ci = _mm_min_epi32(ai,bi);
return _mm_cvtss_f32(_mm_castsi128_ps(ci));
}
#endif
#if defined(__aarch64__)
__forceinline float maxi(float a, float b) {
// FP and NEON share the same vector registers on arm64
__m128 x;
__m128 y;
x[0] = a;
y[0] = b;
x = _mm_max_ps(x, y);
return x[0];
}
#elif defined(__SSE4_1__)
__forceinline float maxi(float a, float b) {
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
const __m128i ci = _mm_max_epi32(ai,bi);
return _mm_cvtss_f32(_mm_castsi128_ps(ci));
}
#endif
template<typename T>
__forceinline T twice(const T& a) { return a+a; }
__forceinline int min(int a, int b) { return a<b ? a:b; }
__forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; }
__forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
__forceinline float min(float a, float b) { return a<b ? a:b; }
__forceinline double min(double a, double b) { return a<b ? a:b; }
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }
#endif
template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); }
template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
__forceinline int max(int a, int b) { return a<b ? b:a; }
__forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; }
__forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
__forceinline float max(float a, float b) { return a<b ? b:a; }
__forceinline double max(double a, double b) { return a<b ? b:a; }
#if defined(__X86_64__) || defined(__aarch64__)
__forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }
#endif
template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); }
template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
#if defined(__MACOSX__)
__forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; }
__forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; }
#endif
#if defined(__MACOSX__) && !defined(__INTEL_COMPILER)
__forceinline void sincosf(float x, float *sin, float *cos) {
__sincosf(x,sin,cos);
}
#endif
#if defined(__WIN32__) || defined(__FreeBSD__)
__forceinline void sincosf(float x, float *s, float *c) {
*s = sinf(x); *c = cosf(x);
}
#endif
template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); }
template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); }
template<typename T> __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); }
template<typename T> __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); }
template<typename T> __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); }
template<typename T> __forceinline T cos2sin ( const T& x ) { return sin2cos(x); }
#if defined(__AVX2__)
__forceinline float madd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
#elif defined (__aarch64__) && defined(__clang__)
#pragma clang fp contract(fast)
__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; }
__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; }
__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
#pragma clang fp contract(on)
#else
__forceinline float madd ( const float a, const float b, const float c) { return a*b+c; }
__forceinline float msub ( const float a, const float b, const float c) { return a*b-c; }
__forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;}
__forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; }
#endif
/*! random functions */
template<typename T> T random() { return T(0); }
#if defined(_WIN32)
template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); }
template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); }
#else
template<> __forceinline int random() { return int(rand()); }
template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }
#endif
template<> __forceinline float random() { return rand()/float(RAND_MAX); }
template<> __forceinline double random() { return rand()/double(RAND_MAX); }
#if _WIN32
__forceinline double drand48() {
return double(rand())/double(RAND_MAX);
}
__forceinline void srand48(long seed) {
return srand(seed);
}
#endif
/*! selects */
__forceinline bool select(bool s, bool t , bool f) { return s ? t : f; }
__forceinline int select(bool s, int t, int f) { return s ? t : f; }
__forceinline float select(bool s, float t, float f) { return s ? t : f; }
__forceinline bool all(bool s) { return s; }
__forceinline float lerp(const float v0, const float v1, const float t) {
return madd(1.0f-t,v0,t*v1);
}
template<typename T>
__forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) {
return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3)));
}
/*! exchange */
template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
template<typename T> __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) {
#if 1//!defined(__aarch64__)
return msub(a,b,c*d);
#else
return nmadd(c,d,a*b);
#endif
}
/*! bit reverse operation */
template<class T>
__forceinline T bitReverse(const T& vin)
{
T v = vin;
v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
v = ( v >> 16 ) | ( v << 16);
return v;
}
/*! bit interleave operation */
template<class T>
__forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
{
T x = xin, y = yin, z = zin;
x = (x | (x << 16)) & 0x030000FF;
x = (x | (x << 8)) & 0x0300F00F;
x = (x | (x << 4)) & 0x030C30C3;
x = (x | (x << 2)) & 0x09249249;
y = (y | (y << 16)) & 0x030000FF;
y = (y | (y << 8)) & 0x0300F00F;
y = (y | (y << 4)) & 0x030C30C3;
y = (y | (y << 2)) & 0x09249249;
z = (z | (z << 16)) & 0x030000FF;
z = (z | (z << 8)) & 0x0300F00F;
z = (z | (z << 4)) & 0x030C30C3;
z = (z | (z << 2)) & 0x09249249;
return x | (y << 1) | (z << 2);
}
#if defined(__AVX2__) && !defined(__aarch64__)
template<>
__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
{
const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ );
const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */);
const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */);
return xx | yy | zz;
}
#endif
/*! bit interleave operation for 64bit data types*/
template<class T>
__forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){
T x = xin & 0x1fffff;
T y = yin & 0x1fffff;
T z = zin & 0x1fffff;
x = (x | x << 32) & 0x1f00000000ffff;
x = (x | x << 16) & 0x1f0000ff0000ff;
x = (x | x << 8) & 0x100f00f00f00f00f;
x = (x | x << 4) & 0x10c30c30c30c30c3;
x = (x | x << 2) & 0x1249249249249249;
y = (y | y << 32) & 0x1f00000000ffff;
y = (y | y << 16) & 0x1f0000ff0000ff;
y = (y | y << 8) & 0x100f00f00f00f00f;
y = (y | y << 4) & 0x10c30c30c30c30c3;
y = (y | y << 2) & 0x1249249249249249;
z = (z | z << 32) & 0x1f00000000ffff;
z = (z | z << 16) & 0x1f0000ff0000ff;
z = (z | z << 8) & 0x100f00f00f00f00f;
z = (z | z << 4) & 0x10c30c30c30c30c3;
z = (z | z << 2) & 0x1249249249249249;
return x | (y << 1) | (z << 2);
}
}
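A small sketch exercising a few of the scalar helpers above: the fused multiply-add wrappers, lerp/clamp, and the Morton-code bit interleave (which packs three 10-bit grid coordinates into a 30-bit key):
#include "math.h"

void math_example()
{
  using namespace embree;
  const float f = madd(2.0f, 3.0f, 1.0f);      // a*b+c = 7.0f, mapped to FMA where available
  const float l = lerp(10.0f, 20.0f, 0.25f);   // madd-based blend = 12.5f
  const float c = clamp(1.5f);                 // default range is [0,1] -> 1.0f
  const unsigned int key = bitInterleave(3u, 5u, 7u);   // 30-bit Morton code
  (void)f; (void)l; (void)c; (void)key;
}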

View file

@ -0,0 +1,39 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "bbox.h"
#include "linearspace3.h"
namespace embree
{
/*! Oriented bounding box */
template<typename T>
struct OBBox
{
public:
__forceinline OBBox () {}
__forceinline OBBox (EmptyTy)
: space(one), bounds(empty) {}
__forceinline OBBox (const BBox<T>& bounds)
: space(one), bounds(bounds) {}
__forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds)
: space(space), bounds(bounds) {}
friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) {
return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}";
}
public:
LinearSpace3<T> space; //!< orthonormal transformation
BBox<T> bounds; //!< bounds in transformed space
};
typedef OBBox<Vec3f> OBBox3f;
typedef OBBox<Vec3fa> OBBox3fa;
}
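Sketch of the oriented-bounding-box wrapper above: a rotated local frame paired with bounds expressed in that frame. The BBox3f typedef and its (lower, upper) constructor are assumed to come from the bbox.h include, which is not part of this excerpt; the file name in the include is likewise assumed:
#include "obbox.h"   // file name assumed

void obbox_example()
{
  using namespace embree;
  // Local frame: a rotation around Z; the bounds live in that transformed space.
  const LinearSpace3f space = LinearSpace3f::rotate(Vec3f(0.0f, 0.0f, 1.0f), 0.3f);
  const BBox3f bounds(Vec3f(-1.0f), Vec3f(+1.0f));   // BBox(lower, upper) assumed
  const OBBox3f obb(space, bounds);
  // The bounds-only constructor keeps the identity frame (an axis-aligned box).
  const OBBox3f aabb(bounds);
  (void)obb; (void)aabb;
}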

View file

@ -0,0 +1,254 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "vec3.h"
#include "vec4.h"
#include "transcendental.h"
namespace embree
{
////////////////////////////////////////////////////////////////
// Quaternion Struct
////////////////////////////////////////////////////////////////
template<typename T>
struct QuaternionT
{
typedef Vec3<T> Vector;
////////////////////////////////////////////////////////////////////////////////
/// Construction
////////////////////////////////////////////////////////////////////////////////
__forceinline QuaternionT () { }
__forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; }
__forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; }
__forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {}
__forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {}
__forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {}
__forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {}
__forceinline QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {}
__inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz );
__inline QuaternionT( const T& yaw, const T& pitch, const T& roll );
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {}
__forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {}
/*! return quaternion for rotation around arbitrary axis */
static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) {
return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u));
}
/*! returns the rotation axis of the quaternion as a vector */
__forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); }
public:
T r, i, j, k;
};
template<typename T> __forceinline QuaternionT<T> operator *( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); }
template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); }
////////////////////////////////////////////////////////////////
// Unary Operators
////////////////////////////////////////////////////////////////
template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); }
template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); }
template<typename T> __forceinline QuaternionT<T> conj ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); }
template<typename T> __forceinline T abs ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
template<typename T> __forceinline QuaternionT<T> rcp ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
// evaluates a*q-p
template<typename T> __forceinline QuaternionT<T>
msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
{
return QuaternionT<T>(msub(a, q.r, p.r),
msub(a, q.i, p.i),
msub(a, q.j, p.j),
msub(a, q.k, p.k));
}
// evaluates a*q+p
template<typename T> __forceinline QuaternionT<T>
madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
{
return QuaternionT<T>(madd(a, q.r, p.r),
madd(a, q.i, p.i),
madd(a, q.j, p.j),
madd(a, q.k, p.k));
}
////////////////////////////////////////////////////////////////
// Binary Operators
////////////////////////////////////////////////////////////////
template<typename T> __forceinline QuaternionT<T> operator +( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r, b.i, b.j, b.k); }
template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); }
template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); }
template<typename T> __forceinline QuaternionT<T> operator -( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); }
template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); }
template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); }
template<typename T> __forceinline Vec3<T> operator *( const QuaternionT<T>& a, const Vec3<T> & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) {
return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k,
a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j,
a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i,
a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r);
}
template<typename T> __forceinline QuaternionT<T> operator /( const T & a, const QuaternionT<T>& b ) { return a*rcp(b); }
template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T & b ) { return a*rcp(b); }
template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); }
template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T & b ) { return a = a+b; }
template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; }
template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T & b ) { return a = a-b; }
template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; }
template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T & b ) { return a = a*b; }
template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; }
template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T & b ) { return a = a*rcp(b); }
template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); }
template<typename T, typename M> __forceinline QuaternionT<T>
select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p)
{
return QuaternionT<T>(select(m, q.r, p.r),
select(m, q.i, p.i),
select(m, q.j, p.j),
select(m, q.k, p.k));
}
template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; }
template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; }
////////////////////////////////////////////////////////////////////////////////
/// Orientation Functions
////////////////////////////////////////////////////////////////////////////////
template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz )
{
if ( vx.x + vy.y + vz.z >= T(zero) )
{
const T t = T(one) + (vx.x + vy.y + vz.z);
const T s = rsqrt(t)*T(0.5f);
r = t*s;
i = (vy.z - vz.y)*s;
j = (vz.x - vx.z)*s;
k = (vx.y - vy.x)*s;
}
else if ( vx.x >= max(vy.y, vz.z) )
{
const T t = (T(one) + vx.x) - (vy.y + vz.z);
const T s = rsqrt(t)*T(0.5f);
r = (vy.z - vz.y)*s;
i = t*s;
j = (vx.y + vy.x)*s;
k = (vz.x + vx.z)*s;
}
else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) )
{
const T t = (T(one) + vy.y) - (vz.z + vx.x);
const T s = rsqrt(t)*T(0.5f);
r = (vz.x - vx.z)*s;
i = (vx.y + vy.x)*s;
j = t*s;
k = (vy.z + vz.y)*s;
}
else //if ( vz.z >= max(vy.y, vx.x) )
{
const T t = (T(one) + vz.z) - (vx.x + vy.y);
const T s = rsqrt(t)*T(0.5f);
r = (vx.y - vy.x)*s;
i = (vz.x + vx.z)*s;
j = (vy.z + vz.y)*s;
k = t*s;
}
}
template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll )
{
const T cya = cos(yaw *T(0.5f));
const T cpi = cos(pitch*T(0.5f));
const T cro = cos(roll *T(0.5f));
const T sya = sin(yaw *T(0.5f));
const T spi = sin(pitch*T(0.5f));
const T sro = sin(roll *T(0.5f));
r = cro*cya*cpi + sro*sya*spi;
i = cro*cya*spi + sro*sya*cpi;
j = cro*sya*cpi - sro*cya*spi;
k = sro*cya*cpi - cro*sya*spi;
}
//////////////////////////////////////////////////////////////////////////////
/// Output Operators
//////////////////////////////////////////////////////////////////////////////
template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) {
return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }";
}
/*! default template instantiations */
typedef QuaternionT<float> Quaternion3f;
typedef QuaternionT<double> Quaternion3d;
template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>;
typedef QuaternionT<vfloat<4>> Quaternion3vf4;
typedef QuaternionT<vfloat<8>> Quaternion3vf8;
typedef QuaternionT<vfloat<16>> Quaternion3vf16;
//////////////////////////////////////////////////////////////////////////////
/// Interpolation
//////////////////////////////////////////////////////////////////////////////
template<typename T>
__forceinline QuaternionT<T> lerp(const QuaternionT<T>& q0,
const QuaternionT<T>& q1,
const T& factor)
{
QuaternionT<T> q;
q.r = lerp(q0.r, q1.r, factor);
q.i = lerp(q0.i, q1.i, factor);
q.j = lerp(q0.j, q1.j, factor);
q.k = lerp(q0.k, q1.k, factor);
return q;
}
template<typename T>
__forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0,
const QuaternionT<T>& q1_,
const T& t)
{
T cosTheta = dot(q0, q1_);
QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_);
cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta);
if (unlikely(all(cosTheta > 0.9995f))) {
return normalize(lerp(q0, q1, t));
}
const T phi = t * fastapprox::acos(cosTheta);
T sinPhi, cosPhi;
fastapprox::sincos(phi, sinPhi, cosPhi);
QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
return msub(cosPhi, q0, qperp);
}
}
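Illustration of the quaternion helpers above, instantiated for plain float so that select()/all() resolve to the scalar overloads from math.h:
#include "quaternion.h"

void quaternion_example()
{
  using namespace embree;
  // Identity and a 90-degree rotation around the Y axis.
  const Quaternion3f qa = Quaternion3f::rotate(Vec3f(0.0f, 1.0f, 0.0f), 0.0f);
  const Quaternion3f qb = Quaternion3f::rotate(Vec3f(0.0f, 1.0f, 0.0f), 1.5707964f);
  // Rotate the x axis by qb: roughly (0, 0, -1).
  const Vec3f v = xfmVector(qb, Vec3f(1.0f, 0.0f, 0.0f));
  // Spherical interpolation halfway between the two orientations.
  const Quaternion3f qh = slerp(qa, qb, 0.5f);
  (void)v; (void)qh;
}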

View file

@ -0,0 +1,137 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../sys/platform.h"
#include "../math/math.h"
namespace embree
{
template<typename Ty>
struct range
{
__forceinline range() {}
__forceinline range(const Ty& begin)
: _begin(begin), _end(begin+1) {}
__forceinline range(const Ty& begin, const Ty& end)
: _begin(begin), _end(end) {}
__forceinline range(const range& other)
: _begin(other._begin), _end(other._end) {}
template<typename T1>
__forceinline range(const range<T1>& other)
: _begin(Ty(other._begin)), _end(Ty(other._end)) {}
template<typename T1>
__forceinline range& operator =(const range<T1>& other) {
_begin = other._begin;
_end = other._end;
return *this;
}
__forceinline Ty begin() const {
return _begin;
}
__forceinline Ty end() const {
return _end;
}
__forceinline range intersect(const range& r) const {
return range (max(_begin,r._begin),min(_end,r._end));
}
__forceinline Ty size() const {
return _end - _begin;
}
__forceinline bool empty() const {
return _end <= _begin;
}
__forceinline Ty center() const {
return (_begin + _end)/2;
}
__forceinline std::pair<range,range> split() const
{
const Ty _center = center();
return std::make_pair(range(_begin,_center),range(_center,_end));
}
__forceinline void split(range& left_o, range& right_o) const
{
const Ty _center = center();
left_o = range(_begin,_center);
right_o = range(_center,_end);
}
__forceinline friend bool operator< (const range& r0, const range& r1) {
return r0.size() < r1.size();
}
friend embree_ostream operator<<(embree_ostream cout, const range& r) {
return cout << "range [" << r.begin() << ", " << r.end() << "]";
}
Ty _begin, _end;
};
template<typename Ty>
range<Ty> make_range(const Ty& begin, const Ty& end) {
return range<Ty>(begin,end);
}
template<typename Ty>
struct extended_range : public range<Ty>
{
__forceinline extended_range () {}
__forceinline extended_range (const Ty& begin)
: range<Ty>(begin), _ext_end(begin+1) {}
__forceinline extended_range (const Ty& begin, const Ty& end)
: range<Ty>(begin,end), _ext_end(end) {}
__forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end)
: range<Ty>(begin,end), _ext_end(ext_end) {}
__forceinline Ty ext_end() const {
return _ext_end;
}
__forceinline Ty ext_size() const {
return _ext_end - range<Ty>::_begin;
}
__forceinline Ty ext_range_size() const {
return _ext_end - range<Ty>::_end;
}
__forceinline bool has_ext_range() const {
assert(_ext_end >= range<Ty>::_end);
return (_ext_end - range<Ty>::_end) > 0;
}
__forceinline void set_ext_range(const size_t ext_end){
assert(ext_end >= range<Ty>::_end);
_ext_end = ext_end;
}
__forceinline void move_right(const size_t plus){
range<Ty>::_begin += plus;
range<Ty>::_end += plus;
_ext_end += plus;
}
friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) {
return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]";
}
Ty _ext_end;
};
}
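A quick sketch of the range helpers above, splitting an index range and reserving extra space with extended_range (the file name in the include is assumed):
#include "range.h"   // file name assumed

void range_example()
{
  using namespace embree;
  // Half-open range of primitive indices [0, 100).
  const range<size_t> r = make_range(size_t(0), size_t(100));
  range<size_t> left, right;
  r.split(left, right);                    // [0, 50) and [50, 100)
  // Extended range: [0, 100) with reserved space up to 128.
  extended_range<size_t> er(0, 100, 128);
  if (er.has_ext_range())
    er.set_ext_range(120);                 // shrink the reserved tail
}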

View file

@ -0,0 +1,525 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
// Transcendental functions from "ispc": https://github.com/ispc/ispc/
// Most of the transcendental implementations in ispc code come from
// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
#include "../simd/simd.h"
namespace embree
{
namespace fastapprox
{
template <typename T>
__forceinline T sin(const T &v)
{
static const float piOverTwoVec = 1.57079637050628662109375;
static const float twoOverPiVec = 0.636619746685028076171875;
auto scaled = v * twoOverPiVec;
auto kReal = floor(scaled);
auto k = toInt(kReal);
// Reduced range version of x
auto x = v - kReal * piOverTwoVec;
auto kMod4 = k & 3;
auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
auto flipSign = (kMod4 > 1);
// These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
// 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
static const float sinC2 = -0.16666667163372039794921875;
static const float sinC4 = +8.333347737789154052734375e-3;
static const float sinC6 = -1.9842604524455964565277099609375e-4;
static const float sinC8 = +2.760012648650445044040679931640625e-6;
static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
static const float cosC2 = -0.5;
static const float cosC4 = +4.166664183139801025390625e-2;
static const float cosC6 = -1.388833043165504932403564453125e-3;
static const float cosC8 = +2.47562347794882953166961669921875e-5;
static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
auto outside = select(sinUseCos, 1., x);
auto c2 = select(sinUseCos, T(cosC2), T(sinC2));
auto c4 = select(sinUseCos, T(cosC4), T(sinC4));
auto c6 = select(sinUseCos, T(cosC6), T(sinC6));
auto c8 = select(sinUseCos, T(cosC8), T(sinC8));
auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
auto x2 = x * x;
auto formula = x2 * c10 + c8;
formula = x2 * formula + c6;
formula = x2 * formula + c4;
formula = x2 * formula + c2;
formula = x2 * formula + 1.;
formula *= outside;
formula = select(flipSign, -formula, formula);
return formula;
}
template <typename T>
__forceinline T cos(const T &v)
{
static const float piOverTwoVec = 1.57079637050628662109375;
static const float twoOverPiVec = 0.636619746685028076171875;
auto scaled = v * twoOverPiVec;
auto kReal = floor(scaled);
auto k = toInt(kReal);
// Reduced range version of x
auto x = v - kReal * piOverTwoVec;
auto kMod4 = k & 3;
auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
auto flipSign = ((kMod4 == 1) | (kMod4 == 2));
const float sinC2 = -0.16666667163372039794921875;
const float sinC4 = +8.333347737789154052734375e-3;
const float sinC6 = -1.9842604524455964565277099609375e-4;
const float sinC8 = +2.760012648650445044040679931640625e-6;
const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
const float cosC2 = -0.5;
const float cosC4 = +4.166664183139801025390625e-2;
const float cosC6 = -1.388833043165504932403564453125e-3;
const float cosC8 = +2.47562347794882953166961669921875e-5;
const float cosC10 = -2.59630184018533327616751194000244140625e-7;
auto outside = select(cosUseCos, 1., x);
auto c2 = select(cosUseCos, T(cosC2), T(sinC2));
auto c4 = select(cosUseCos, T(cosC4), T(sinC4));
auto c6 = select(cosUseCos, T(cosC6), T(sinC6));
auto c8 = select(cosUseCos, T(cosC8), T(sinC8));
auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
auto x2 = x * x;
auto formula = x2 * c10 + c8;
formula = x2 * formula + c6;
formula = x2 * formula + c4;
formula = x2 * formula + c2;
formula = x2 * formula + 1.;
formula *= outside;
formula = select(flipSign, -formula, formula);
return formula;
}
template <typename T>
__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
{
const float piOverTwoVec = 1.57079637050628662109375;
const float twoOverPiVec = 0.636619746685028076171875;
auto scaled = v * twoOverPiVec;
auto kReal = floor(scaled);
auto k = toInt(kReal);
// Reduced range version of x
auto x = v - kReal * piOverTwoVec;
auto kMod4 = k & 3;
auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
auto sinFlipSign = (kMod4 > 1);
auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
const float oneVec = +1.;
const float sinC2 = -0.16666667163372039794921875;
const float sinC4 = +8.333347737789154052734375e-3;
const float sinC6 = -1.9842604524455964565277099609375e-4;
const float sinC8 = +2.760012648650445044040679931640625e-6;
const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
const float cosC2 = -0.5;
const float cosC4 = +4.166664183139801025390625e-2;
const float cosC6 = -1.388833043165504932403564453125e-3;
const float cosC8 = +2.47562347794882953166961669921875e-5;
const float cosC10 = -2.59630184018533327616751194000244140625e-7;
auto x2 = x * x;
auto sinFormula = x2 * sinC10 + sinC8;
auto cosFormula = x2 * cosC10 + cosC8;
sinFormula = x2 * sinFormula + sinC6;
cosFormula = x2 * cosFormula + cosC6;
sinFormula = x2 * sinFormula + sinC4;
cosFormula = x2 * cosFormula + cosC4;
sinFormula = x2 * sinFormula + sinC2;
cosFormula = x2 * cosFormula + cosC2;
sinFormula = x2 * sinFormula + oneVec;
cosFormula = x2 * cosFormula + oneVec;
sinFormula *= x;
sinResult = select(sinUseCos, cosFormula, sinFormula);
cosResult = select(cosUseCos, cosFormula, sinFormula);
sinResult = select(sinFlipSign, -sinResult, sinResult);
cosResult = select(cosFlipSign, -cosResult, cosResult);
}
template <typename T>
__forceinline T tan(const T &v)
{
const float piOverFourVec = 0.785398185253143310546875;
const float fourOverPiVec = 1.27323949337005615234375;
auto xLt0 = v < 0.;
auto y = select(xLt0, -v, v);
auto scaled = y * fourOverPiVec;
auto kReal = floor(scaled);
auto k = toInt(kReal);
auto x = y - kReal * piOverFourVec;
// If k & 1, x -= Pi/4
auto needOffset = (k & 1) != 0;
x = select(needOffset, x - piOverFourVec, x);
// If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
auto kMod4 = k & 3;
auto useCotan = (kMod4 == 1) | (kMod4 == 2);
const float oneVec = 1.0;
const float tanC2 = +0.33333075046539306640625;
const float tanC4 = +0.13339905440807342529296875;
const float tanC6 = +5.3348250687122344970703125e-2;
const float tanC8 = +2.46033705770969390869140625e-2;
const float tanC10 = +2.892402000725269317626953125e-3;
const float tanC12 = +9.500005282461643218994140625e-3;
const float cotC2 = -0.3333333432674407958984375;
const float cotC4 = -2.222204394638538360595703125e-2;
const float cotC6 = -2.11752182804048061370849609375e-3;
const float cotC8 = -2.0846328698098659515380859375e-4;
const float cotC10 = -2.548247357481159269809722900390625e-5;
const float cotC12 = -3.5257363606433500535786151885986328125e-7;
auto x2 = x * x;
T z;
if (any(useCotan))
{
auto cotVal = x2 * cotC12 + cotC10;
cotVal = x2 * cotVal + cotC8;
cotVal = x2 * cotVal + cotC6;
cotVal = x2 * cotVal + cotC4;
cotVal = x2 * cotVal + cotC2;
cotVal = x2 * cotVal + oneVec;
// The equation is for x * cot(x); dividing by -x gives the -cot(x) needed for the tan result.
cotVal /= -x;
z = cotVal;
}
auto useTan = !useCotan;
if (any(useTan))
{
auto tanVal = x2 * tanC12 + tanC10;
tanVal = x2 * tanVal + tanC8;
tanVal = x2 * tanVal + tanC6;
tanVal = x2 * tanVal + tanC4;
tanVal = x2 * tanVal + tanC2;
tanVal = x2 * tanVal + oneVec;
// Equation was for tan(x)/x
tanVal *= x;
z = select(useTan, tanVal, z);
}
return select(xLt0, -z, z);
}
template <typename T>
__forceinline T asin(const T &x0)
{
auto isneg = (x0 < 0.f);
auto x = abs(x0);
auto isnan = (x > 1.f);
// sollya
// fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
// [1e-20;.9999999999999999]);
// avg error: 1.1105439e-06, max error 1.3187528e-06
auto v = 1.57079517841339111328125f +
x * (-0.21450997889041900634765625f +
x * (8.78556668758392333984375e-2f +
x * (-4.489909112453460693359375e-2f +
x * (1.928029954433441162109375e-2f +
x * (-4.3095736764371395111083984375e-3f)))));
v *= -sqrt(1.f - x);
v = v + 1.57079637050628662109375f;
v = select(v < 0.f, T(0.f), v);
v = select(isneg, -v, v);
v = select(isnan, T(cast_i2f(0x7fc00000)), v);
return v;
}
template <typename T>
__forceinline T acos(const T &v)
{
return 1.57079637050628662109375f - asin(v);
}
template <typename T>
__forceinline T atan(const T &v)
{
const float piOverTwoVec = 1.57079637050628662109375;
// atan(-x) = -atan(x) (so flip from negative to positive first)
// If x > 1 -> atan(x) = Pi/2 - atan(1/x)
auto xNeg = v < 0.f;
auto xFlipped = select(xNeg, -v, v);
auto xGt1 = xFlipped > 1.;
auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
// These coefficients approximate atan(x)/x
const float atanC0 = +0.99999988079071044921875;
const float atanC2 = -0.3333191573619842529296875;
const float atanC4 = +0.199689209461212158203125;
const float atanC6 = -0.14015688002109527587890625;
const float atanC8 = +9.905083477497100830078125e-2;
const float atanC10 = -5.93664981424808502197265625e-2;
const float atanC12 = +2.417283318936824798583984375e-2;
const float atanC14 = -4.6721356920897960662841796875e-3;
auto x2 = x * x;
auto result = x2 * atanC14 + atanC12;
result = x2 * result + atanC10;
result = x2 * result + atanC8;
result = x2 * result + atanC6;
result = x2 * result + atanC4;
result = x2 * result + atanC2;
result = x2 * result + atanC0;
result *= x;
result = select(xGt1, piOverTwoVec - result, result);
result = select(xNeg, -result, result);
return result;
}
template <typename T>
__forceinline T atan2(const T &y, const T &x)
{
const float piVec = 3.1415926536;
// atan2(y, x) =
//
// atan2(y > 0, x = +-0) -> Pi/2
// atan2(y < 0, x = +-0) -> -Pi/2
// atan2(y = +-0, x < +0) -> +-Pi
// atan2(y = +-0, x >= +0) -> +-0
//
// atan2(y >= 0, x < 0) -> Pi + atan(y/x)
// atan2(y < 0, x < 0) -> -Pi + atan(y/x)
// atan2(y, x > 0) -> atan(y/x)
//
// and then a bunch of code for dealing with infinities.
auto yOverX = y * rcpSafe(x);
auto atanArg = atan(yOverX);
auto xLt0 = x < 0.f;
auto yLt0 = y < 0.f;
auto offset = select(xLt0,
select(yLt0, T(-piVec), T(piVec)), 0.f);
return offset + atanArg;
}
template <typename T>
__forceinline T exp(const T &v)
{
const float ln2Part1 = 0.6931457519;
const float ln2Part2 = 1.4286067653e-6;
const float oneOverLn2 = 1.44269502162933349609375;
auto scaled = v * oneOverLn2;
auto kReal = floor(scaled);
auto k = toInt(kReal);
// Reduced range version of x
auto x = v - kReal * ln2Part1;
x -= kReal * ln2Part2;
// These coefficients are for e^x in [0, ln(2)]
const float one = 1.;
const float c2 = 0.4999999105930328369140625;
const float c3 = 0.166668415069580078125;
const float c4 = 4.16539050638675689697265625e-2;
const float c5 = 8.378830738365650177001953125e-3;
const float c6 = 1.304379315115511417388916015625e-3;
const float c7 = 2.7555381529964506626129150390625e-4;
auto result = x * c7 + c6;
result = x * result + c5;
result = x * result + c4;
result = x * result + c3;
result = x * result + c2;
result = x * result + one;
result = x * result + one;
// Compute 2^k (should differ for float and double, but I'll avoid
// it for now and just do floats)
const int fpbias = 127;
auto biasedN = k + fpbias;
auto overflow = kReal > fpbias;
// Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
// we've got underflow. -127 * ln(2) -> -88.02. So the most
// negative float input that doesn't result in zero is like -88.
auto underflow = kReal <= -fpbias;
const int infBits = 0x7f800000;
biasedN <<= 23;
// Reinterpret this thing as float
auto twoToTheN = asFloat(biasedN);
// Handle both doubles and floats (hopefully eliding the copy for float)
auto elemtype2n = twoToTheN;
result *= elemtype2n;
result = select(overflow, cast_i2f(infBits), result);
result = select(underflow, 0., result);
return result;
}
// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
template <typename T, typename R>
__forceinline void __rangeReduceLog(const T &input,
T &reduced,
R &exponent)
{
auto intVersion = asInt(input);
// single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
// exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
// 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
// non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
// = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
//const int exponentMask(0x7F800000)
static const int nonexponentMask = 0x807FFFFF;
// We want the reduced version to have an exponent of -1 which is
// -1 + 127 after biasing or 126
static const int exponentNeg1 = (126l << 23);
// NOTE(boulos): We don't need to mask anything out since we know
// the sign bit has to be 0. If it's 1, we need to return infinity/nan
// anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
exponent = offsetExponent - 127; // get the real value
// Blend the offset_exponent with the original input (do this in
// int for now, until I decide if float can have & and &not)
auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
reduced = asFloat(blended);
}
template <typename T> struct ExponentType { };
template <int N> struct ExponentType<vfloat<N>> { typedef vint<N> Ty; };
template <> struct ExponentType<float> { typedef int Ty; };
template <typename T>
__forceinline T log(const T &v)
{
T reduced;
typename ExponentType<T>::Ty exponent;
const int nanBits = 0x7fc00000;
const int negInfBits = 0xFF800000;
const float nan = cast_i2f(nanBits);
const float negInf = cast_i2f(negInfBits);
auto useNan = v < 0.;
auto useInf = v == 0.;
auto exceptional = useNan | useInf;
const float one = 1.0;
auto patched = select(exceptional, one, v);
__rangeReduceLog(patched, reduced, exponent);
const float ln2 = 0.693147182464599609375;
auto x1 = one - reduced;
const float c1 = +0.50000095367431640625;
const float c2 = +0.33326041698455810546875;
const float c3 = +0.2519190013408660888671875;
const float c4 = +0.17541764676570892333984375;
const float c5 = +0.3424419462680816650390625;
const float c6 = -0.599632322788238525390625;
const float c7 = +1.98442304134368896484375;
const float c8 = -2.4899270534515380859375;
const float c9 = +1.7491014003753662109375;
auto result = x1 * c9 + c8;
result = x1 * result + c7;
result = x1 * result + c6;
result = x1 * result + c5;
result = x1 * result + c4;
result = x1 * result + c3;
result = x1 * result + c2;
result = x1 * result + c1;
result = x1 * result + one;
// Equation was for -(ln(red)/(1-red))
result *= -x1;
result += toFloat(exponent) * ln2;
return select(exceptional,
select(useNan, T(nan), T(negInf)),
result);
}
template <typename T>
__forceinline T pow(const T &x, const T &y)
{
auto x1 = abs(x);
auto z = exp(y * log(x1));
// Handle special cases
const float twoOver23 = 8388608.0f;
auto yInt = y == round(y);
auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
// x == 0
z = select(x == 0.0f,
select(y < 0.0f, T(inf) | signmsk(x),
select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
// x < 0
auto xNegative = x < 0.0f;
if (any(xNegative))
{
auto z1 = z | asFloat(yOddInt);
z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
z = select(xNegative, z1, z);
}
auto xFinite = isfinite(x);
auto yFinite = isfinite(y);
if (all(xFinite & yFinite))
return z;
// x finite and y infinite
z = select(andn(xFinite, yFinite),
select(x1 == 1.0f, 1.0f,
select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
// x infinite
z = select(xFinite, z,
select(y == 0.0f, 1.0f,
select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
return z;
}
template <typename T>
__forceinline T pow(const T &x, float y)
{
return pow(x, T(y));
}
} // namespace fastapprox
} // namespace embree
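A scalar sketch of the fastapprox routines above. Only the trigonometric entry points are exercised with plain float here, since exp/log/pow rely on asInt/asFloat helpers that the scalar math.h above does not provide (they are intended for the SIMD types from ../simd/simd.h):
#include "transcendental.h"

void fastapprox_example()
{
  using namespace embree;
  float s, c;
  fastapprox::sincos(0.5f, s, c);              // ~0.4794, ~0.8776
  const float a = fastapprox::acos(0.5f);      // ~1.0472 (pi/3)
  const float t = fastapprox::sin(1.0f);       // ~0.8415
  (void)s; (void)c; (void)a; (void)t;
}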

View file

@ -0,0 +1,235 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "math.h"
namespace embree
{
struct Vec2fa;
////////////////////////////////////////////////////////////////////////////////
/// Generic 2D vector Class
////////////////////////////////////////////////////////////////////////////////
template<typename T> struct Vec2
{
enum { N = 2 };
union {
struct { T x, y; };
#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
T components[N];
#endif
};
typedef T Scalar;
////////////////////////////////////////////////////////////////////////////////
/// Construction
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec2( ) {}
__forceinline explicit Vec2( const T& a ) : x(a), y(a) {}
__forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {}
__forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; }
__forceinline Vec2( const Vec2fa& other );
template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {}
template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; }
__forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; }
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec2( ZeroTy ) : x(zero), y(zero) {}
__forceinline Vec2( OneTy ) : x(one), y(one) {}
__forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {}
__forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {}
#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler
__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; }
__forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; }
#else
__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; }
__forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; }
#endif
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a ) { return Vec2<T>(+a.x, +a.y); }
template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a ) { return Vec2<T>(-a.x, -a.y); }
template<typename T> __forceinline Vec2<T> abs ( const Vec2<T>& a ) { return Vec2<T>(abs (a.x), abs (a.y)); }
template<typename T> __forceinline Vec2<T> rcp ( const Vec2<T>& a ) { return Vec2<T>(rcp (a.x), rcp (a.y)); }
template<typename T> __forceinline Vec2<T> rsqrt ( const Vec2<T>& a ) { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); }
template<typename T> __forceinline Vec2<T> sqrt ( const Vec2<T>& a ) { return Vec2<T>(sqrt (a.x), sqrt (a.y)); }
template<typename T> __forceinline Vec2<T> frac ( const Vec2<T>& a ) { return Vec2<T>(frac (a.x), frac (a.y)); }
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x + b.x, a.y + b.y); }
template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x + b , a.y + b ); }
template<typename T> __forceinline Vec2<T> operator +( const T& a, const Vec2<T>& b ) { return Vec2<T>(a + b.x, a + b.y); }
template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x - b.x, a.y - b.y); }
template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x - b , a.y - b ); }
template<typename T> __forceinline Vec2<T> operator -( const T& a, const Vec2<T>& b ) { return Vec2<T>(a - b.x, a - b.y); }
template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x * b.x, a.y * b.y); }
template<typename T> __forceinline Vec2<T> operator *( const T& a, const Vec2<T>& b ) { return Vec2<T>(a * b.x, a * b.y); }
template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x * b , a.y * b ); }
template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x / b.x, a.y / b.y); }
template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x / b , a.y / b ); }
template<typename T> __forceinline Vec2<T> operator /( const T& a, const Vec2<T>& b ) { return Vec2<T>(a / b.x, a / b.y); }
template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); }
template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); }
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec2<T> madd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); }
template<typename T> __forceinline Vec2<T> msub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); }
template<typename T> __forceinline Vec2<T> nmadd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); }
template<typename T> __forceinline Vec2<T> nmsub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); }
template<typename T> __forceinline Vec2<T> madd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a,b.x,c.x), madd(a,b.y,c.y) ); }
template<typename T> __forceinline Vec2<T> msub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a,b.x,c.x), msub(a,b.y,c.y) ); }
template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); }
template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); }
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; }
template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; }
template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const T& b ) { a.x *= b ; a.y *= b ; return a; }
template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const T& b ) { a.x /= b ; a.y /= b ; return a; }
////////////////////////////////////////////////////////////////////////////////
/// Reduction Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; }
template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; }
template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); }
template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; }
template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { return a.x != b.x || a.y != b.y; }
template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) {
if (a.x != b.x) return a.x < b.x;
if (a.y != b.y) return a.y < b.y;
return false;
}
////////////////////////////////////////////////////////////////////////////////
/// Shift Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) {
return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y));
}
////////////////////////////////////////////////////////////////////////////////
/// Euclidian Space Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
template<typename T> __forceinline Vec2<T> cross ( const Vec2<T>& a ) { return Vec2<T>(-a.y,a.x); }
template<typename T> __forceinline T length ( const Vec2<T>& a ) { return sqrt(dot(a,a)); }
template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a ) { return a*rsqrt(dot(a,a)); }
template<typename T> __forceinline T distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); }
template<typename T> __forceinline T det ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; }
template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) {
const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) );
}
////////////////////////////////////////////////////////////////////////////////
/// Select
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) {
return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
}
template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f ) {
return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y));
}
template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) {
return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
}
template<typename T>
__forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) {
return madd(Vec2<T>(T(1.0f)-t),v0,t*v1);
}
template<typename T> __forceinline int maxDim ( const Vec2<T>& a )
{
const Vec2<T> b = abs(a);
if (b.x > b.y) return 0;
else return 1;
}
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) {
return cout << "(" << a.x << ", " << a.y << ")";
}
////////////////////////////////////////////////////////////////////////////////
/// Default template instantiations
////////////////////////////////////////////////////////////////////////////////
typedef Vec2<bool > Vec2b;
typedef Vec2<int > Vec2i;
typedef Vec2<float> Vec2f;
}
#include "vec2fa.h"
#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
#if defined(__AVX__)
#include "../simd/avx.h"
#endif
#if defined(__AVX512F__)
#include "../simd/avx512.h"
#endif
namespace embree
{
template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#if defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#endif
#if defined(__AVX__)
template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#endif
#if defined(__AVX512F__)
template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#endif
}

View file

@ -0,0 +1,317 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "../sys/alloc.h"
#include "math.h"
#include "../simd/sse.h"
namespace embree
{
////////////////////////////////////////////////////////////////////////////////
/// SSE Vec2fa Type
////////////////////////////////////////////////////////////////////////////////
struct __aligned(16) Vec2fa
{
ALIGNED_STRUCT_(16);
typedef float Scalar;
enum { N = 2 };
union {
__m128 m128;
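// Only the low two lanes of the SSE register are meaningful; az/aw are unused padding.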
struct { float x,y,az,aw; };
};
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec2fa( ) {}
__forceinline Vec2fa( const __m128 a ) : m128(a) {}
__forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; }
__forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
__forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; }
__forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
__forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
__forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}
__forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
__forceinline operator const __m128&() const { return m128; }
__forceinline operator __m128&() { return m128; }
////////////////////////////////////////////////////////////////////////////////
/// Loads and Stores
////////////////////////////////////////////////////////////////////////////////
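// Both loads read a full 16-byte vector and then zero the upper two lanes, keeping only x and y.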
static __forceinline Vec2fa load( const void* const a ) {
return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
}
static __forceinline Vec2fa loadu( const void* const a ) {
return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
}
static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
_mm_storeu_ps((float*)ptr,v);
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
__forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
__forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
__forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
__forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
__forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; }
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
__forceinline Vec2fa operator -( const Vec2fa& a ) {
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return _mm_xor_ps(a.m128, mask);
}
__forceinline Vec2fa abs ( const Vec2fa& a ) {
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return _mm_and_ps(a.m128, mask);
}
__forceinline Vec2fa sign ( const Vec2fa& a ) {
return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
}
__forceinline Vec2fa rcp ( const Vec2fa& a )
{
#if defined(__aarch64__)
__m128 reciprocal = _mm_rcp_ps(a.m128);
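// Refine the hardware reciprocal estimate with two Newton-Raphson steps.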
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
return (const Vec2fa)reciprocal;
#else
#if defined(__AVX512VL__)
const Vec2fa r = _mm_rcp14_ps(a.m128);
#else
const Vec2fa r = _mm_rcp_ps(a.m128);
#endif
#if defined(__AVX2__)
const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
#else
const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
//return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
#endif
return res;
#endif //defined(__aarch64__)
}
__forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
__forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }
__forceinline Vec2fa rsqrt( const Vec2fa& a )
{
#if defined(__aarch64__)
__m128 r = _mm_rsqrt_ps(a.m128);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
return r;
#else
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
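// One Newton-Raphson step: r' = r * (1.5 - 0.5 * a * r^2).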
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif
}
__forceinline Vec2fa zero_fix(const Vec2fa& a) {
return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
}
__forceinline Vec2fa rcp_safe(const Vec2fa& a) {
return rcp(zero_fix(a));
}
__forceinline Vec2fa log ( const Vec2fa& a ) {
return Vec2fa(logf(a.x),logf(a.y));
}
__forceinline Vec2fa exp ( const Vec2fa& a ) {
return Vec2fa(expf(a.x),expf(a.y));
}
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
__forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
__forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
__forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
__forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
__forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
__forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
__forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
__forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
const vint4 ci = _mm_min_epi32(ai,bi);
return _mm_castsi128_ps(ci);
}
#endif
#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
const vint4 ci = _mm_max_epi32(ai,bi);
return _mm_castsi128_ps(ci);
}
#endif
__forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
return Vec2fa(powf(a.x,b),powf(a.y,b));
}
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__AVX2__)
__forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
__forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
__forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
__forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
#else
__forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
__forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
__forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
__forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
#endif
__forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
__forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
__forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
__forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
__forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
__forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
__forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; }
__forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
__forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; }
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
__forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
__forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
__forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
__forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
__forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
////////////////////////////////////////////////////////////////////////////////
/// Euclidian Space Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
__forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
}
#else
__forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
return reduce_add(a*b);
}
#endif
__forceinline Vec2fa cross ( const Vec2fa& a ) {
return Vec2fa(-a.y,a.x);
}
__forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); }
__forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); }
__forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); }
__forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); }
__forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); }
__forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }
////////////////////////////////////////////////////////////////////////////////
/// Select
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
__m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
return blendv_ps(f, t, mask);
}
__forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
return madd(1.0f-t,v0,t*v1);
}
__forceinline int maxDim ( const Vec2fa& a )
{
const Vec2fa b = abs(a);
if (b.x > b.y) return 0;
else return 1;
}
////////////////////////////////////////////////////////////////////////////////
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__)
__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
#elif defined (__SSE4_1__)
//__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
#else
//__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
__forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
__forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
#endif
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
return cout << "(" << a.x << ", " << a.y << ")";
}
typedef Vec2fa Vec2fa_t;
}

View file

@ -0,0 +1,349 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "math.h"
namespace embree
{
struct Vec3fa;
////////////////////////////////////////////////////////////////////////////////
/// Generic 3D vector Class
////////////////////////////////////////////////////////////////////////////////
template<typename T> struct Vec3
{
enum { N = 3 };
union {
struct {
T x, y, z;
};
#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
T components[N];
#endif
};
typedef T Scalar;
////////////////////////////////////////////////////////////////////////////////
/// Construction
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec3( ) {}
__forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {}
__forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {}
__forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; }
__forceinline Vec3( const Vec3fa& other );
template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {}
template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; }
__forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; }
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline Vec3( ZeroTy ) : x(zero), y(zero), z(zero) {}
__forceinline Vec3( OneTy ) : x(one), y(one), z(one) {}
__forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {}
__forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {}
#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
__forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; }
__forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; }
#else
__forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; }
__forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; }
#endif
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); }
template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); }
template<typename T> __forceinline Vec3<T> abs ( const Vec3<T>& a ) { return Vec3<T>(abs (a.x), abs (a.y), abs (a.z)); }
template<typename T> __forceinline Vec3<T> rcp ( const Vec3<T>& a ) { return Vec3<T>(rcp (a.x), rcp (a.y), rcp (a.z)); }
template<typename T> __forceinline Vec3<T> rsqrt ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); }
template<typename T> __forceinline Vec3<T> sqrt ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); }
template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a )
{
return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x),
select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y),
select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z));
}
template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); }
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); }
template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); }
template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); }
template<typename T> __forceinline Vec3<T> operator *( const T& a, const Vec3<T>& b ) { return Vec3<T>(a * b.x, a * b.y, a * b.z); }
template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x * b , a.y * b , a.z * b ); }
template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x / b , a.y / b , a.z / b ); }
template<typename T> __forceinline Vec3<T> operator /( const T& a, const Vec3<T>& b ) { return Vec3<T>(a / b.x, a / b.y, a / b.z); }
template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); }
template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); }
template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); }
template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); }
template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); }
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec3<T> madd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); }
template<typename T> __forceinline Vec3<T> msub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); }
template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));}
template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); }
template<typename T> __forceinline Vec3<T> madd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); }
template<typename T> __forceinline Vec3<T> msub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); }
template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));}
template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); }
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; }
template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; return a; }
template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; }
////////////////////////////////////////////////////////////////////////////////
/// Reduction Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; }
template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; }
template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); }
template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; }
template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; }
template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) {
if (a.x != b.x) return a.x < b.x;
if (a.y != b.y) return a.y < b.y;
if (a.z != b.z) return a.z < b.z;
return false;
}
////////////////////////////////////////////////////////////////////////////////
/// Shift Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) {
return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z));
}
////////////////////////////////////////////////////////////////////////////////
/// Select
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) {
return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
}
template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) {
return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z));
}
template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) {
return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
}
template<typename T>
__forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) {
return madd(Vec3<T>(T(1.0f)-t),v0,t*v1);
}
template<typename T> __forceinline int maxDim ( const Vec3<T>& a )
{
const Vec3<T> b = abs(a);
if (b.x > b.y) {
if (b.x > b.z) return 0; else return 2;
} else {
if (b.y > b.z) return 1; else return 2;
}
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); }
template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); }
template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); }
template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); }
template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); }
template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
////////////////////////////////////////////////////////////////////////////////
/// Euclidian Space Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); }
template<typename T> __forceinline T dot ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); }
template<typename T> __forceinline T length ( const Vec3<T>& a ) { return sqrt(sqr(a)); }
template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) { return rsqrt(sqr(a)); }
template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); }
template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); }
template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
{
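// Per component, use whichever of cross(a,b) or cross(b,c) has the smaller subtracted term,
// which limits catastrophic cancellation in the computed triangle normal.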
const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x;
const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z));
const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z));
const auto sx = abs(ab_x) < abs(bc_x);
const auto sy = abs(ab_y) < abs(bc_y);
const auto sz = abs(ab_z) < abs(bc_z);
return Vec3<T>(select(sx,cross_ab.x,cross_bc.x),
select(sy,cross_ab.y,cross_bc.y),
select(sz,cross_ab.z,cross_bc.z));
}
template<typename T> __forceinline T sum ( const Vec3<T>& a ) { return a.x+a.y+a.z; }
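// halfArea: half of the surface area of an axis-aligned box with edge lengths d, i.e. d.x*d.y + d.y*d.z + d.z*d.x.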
template<typename T> __forceinline T halfArea ( const Vec3<T>& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
template<typename T> __forceinline T area ( const Vec3<T>& d ) { return 2.0f*halfArea(d); }
template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) {
const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) );
}
template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1)
{
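// Squared distance from P to the infinite line through Q0 and Q1: |(P-Q0) x (Q1-Q0)|^2 / |Q1-Q0|^2.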
const Vec3<T> N = cross(P-Q0,Q1-Q0);
const Vec3<T> D = Q1-Q0;
return dot(N,N)*rcp(dot(D,D));
}
template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0)
{
const Vec3<T> N = cross(PmQ0,Q1mQ0);
const Vec3<T> D = Q1mQ0;
return dot(N,N)*rcp(dot(D,D));
}
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) {
return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
}
typedef Vec3<bool > Vec3b;
typedef Vec3<int > Vec3i;
typedef Vec3<float> Vec3f;
}
#include "vec3ba.h"
#include "vec3ia.h"
#include "vec3fa.h"
////////////////////////////////////////////////////////////////////////////////
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
#if defined(__AVX__)
#include "../simd/avx.h"
#endif
#if defined(__AVX512F__)
#include "../simd/avx512.h"
#endif
namespace embree
{
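// broadcast: replicate lane k of each component of a SIMD Vec3<In> across all lanes of a Vec3<Out>.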
template<typename Out, typename In>
__forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) {
return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k]));
}
template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; }
#if defined(__AVX__)
template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
x = a.x; y = a.y; z = a.z;
}
#elif defined(__SSE__) || defined(__ARM_NEON)
template<>
__forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
}
#endif
#if defined(__SSE__) || defined(__ARM_NEON)
__forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat4>& a, const size_t k) {
return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
}
template<>
__forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
}
template<int i0, int i1, int i2, int i3>
__forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) {
return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
}
#endif
#if defined(__AVX__)
template<>
__forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) {
x = a.x; y = a.y; z = a.z;
}
__forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat8>& a, const size_t k) {
return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
}
__forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat4>& a, const size_t k) {
return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
}
__forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat8>& a, const size_t k) {
return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
}
template<>
__forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
}
template<>
__forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) {
return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
}
template<int i0, int i1, int i2, int i3>
__forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) {
return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
}
#endif
#if defined(__AVX512F__)
template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {}
#endif
}

Some files were not shown because too many files have changed in this diff.