From 77a045e902db9f5f0ea7961e827c337541a6436c Mon Sep 17 00:00:00 2001
From: reduz <reduzio@gmail.com>
Date: Sat, 12 Dec 2020 09:06:59 -0300
Subject: [PATCH] Rework Mesh handling on scene importing.

-Reworked how meshes are treated by importer by using EditorSceneImporterMesh and EditorSceneImporterMeshNode. Instead of Mesh and MeshInstance, this allows more efficient processing of meshes before they are actually registered in the RenderingServer.
-Integrated MeshOptimizer
-Reworked internals of SurfaceTool to use arrays, making it more performant and easy to run optimizatons on.
---
 editor/editor_node.cpp                        |    3 +
 editor/import/editor_import_collada.cpp       |   39 +-
 editor/import/editor_scene_importer_gltf.cpp  |   37 +-
 editor/import/editor_scene_importer_gltf.h    |    8 +-
 editor/import/resource_importer_obj.cpp       |   26 +-
 editor/import/resource_importer_scene.cpp     |  328 ++++
 editor/import/resource_importer_scene.h       |   87 +
 modules/meshoptimizer/SCsub                   |   34 +
 modules/meshoptimizer/config.py               |    7 +
 modules/meshoptimizer/register_types.cpp      |   43 +
 modules/meshoptimizer/register_types.h        |   37 +
 scene/resources/mesh.cpp                      |   10 +-
 scene/resources/mesh.h                        |   13 +-
 scene/resources/surface_tool.cpp              |  355 ++--
 scene/resources/surface_tool.h                |   28 +-
 thirdparty/README.md                          |   10 +
 thirdparty/meshoptimizer/LICENSE.md           |   21 +
 thirdparty/meshoptimizer/allocator.cpp        |    8 +
 thirdparty/meshoptimizer/clusterizer.cpp      |  351 ++++
 thirdparty/meshoptimizer/indexcodec.cpp       |  752 ++++++++
 thirdparty/meshoptimizer/indexgenerator.cpp   |  347 ++++
 thirdparty/meshoptimizer/meshoptimizer.h      |  948 ++++++++++
 thirdparty/meshoptimizer/overdrawanalyzer.cpp |  230 +++
 .../meshoptimizer/overdrawoptimizer.cpp       |  333 ++++
 thirdparty/meshoptimizer/simplifier.cpp       | 1529 +++++++++++++++++
 thirdparty/meshoptimizer/spatialorder.cpp     |  194 +++
 thirdparty/meshoptimizer/stripifier.cpp       |  295 ++++
 thirdparty/meshoptimizer/vcacheanalyzer.cpp   |   73 +
 thirdparty/meshoptimizer/vcacheoptimizer.cpp  |  473 +++++
 thirdparty/meshoptimizer/vertexcodec.cpp      | 1265 ++++++++++++++
 thirdparty/meshoptimizer/vertexfilter.cpp     |  825 +++++++++
 thirdparty/meshoptimizer/vfetchanalyzer.cpp   |   58 +
 thirdparty/meshoptimizer/vfetchoptimizer.cpp  |   74 +
 33 files changed, 8600 insertions(+), 241 deletions(-)
 create mode 100644 modules/meshoptimizer/SCsub
 create mode 100644 modules/meshoptimizer/config.py
 create mode 100644 modules/meshoptimizer/register_types.cpp
 create mode 100644 modules/meshoptimizer/register_types.h
 create mode 100644 thirdparty/meshoptimizer/LICENSE.md
 create mode 100644 thirdparty/meshoptimizer/allocator.cpp
 create mode 100644 thirdparty/meshoptimizer/clusterizer.cpp
 create mode 100644 thirdparty/meshoptimizer/indexcodec.cpp
 create mode 100644 thirdparty/meshoptimizer/indexgenerator.cpp
 create mode 100644 thirdparty/meshoptimizer/meshoptimizer.h
 create mode 100644 thirdparty/meshoptimizer/overdrawanalyzer.cpp
 create mode 100644 thirdparty/meshoptimizer/overdrawoptimizer.cpp
 create mode 100644 thirdparty/meshoptimizer/simplifier.cpp
 create mode 100644 thirdparty/meshoptimizer/spatialorder.cpp
 create mode 100644 thirdparty/meshoptimizer/stripifier.cpp
 create mode 100644 thirdparty/meshoptimizer/vcacheanalyzer.cpp
 create mode 100644 thirdparty/meshoptimizer/vcacheoptimizer.cpp
 create mode 100644 thirdparty/meshoptimizer/vertexcodec.cpp
 create mode 100644 thirdparty/meshoptimizer/vertexfilter.cpp
 create mode 100644 thirdparty/meshoptimizer/vfetchanalyzer.cpp
 create mode 100644 thirdparty/meshoptimizer/vfetchoptimizer.cpp

diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp
index 66c54c4267..8cdb865937 100644
--- a/editor/editor_node.cpp
+++ b/editor/editor_node.cpp
@@ -3654,6 +3654,9 @@ void EditorNode::register_editor_types() {
 	ClassDB::register_class<ScriptCreateDialog>();
 	ClassDB::register_class<EditorFeatureProfile>();
 	ClassDB::register_class<EditorSpinSlider>();
+	ClassDB::register_class<EditorSceneImporterMesh>();
+	ClassDB::register_class<EditorSceneImporterMeshNode>();
+
 	ClassDB::register_virtual_class<FileSystemDock>();
 
 	// FIXME: Is this stuff obsolete, or should it be ported to new APIs?
diff --git a/editor/import/editor_import_collada.cpp b/editor/import/editor_import_collada.cpp
index 270bdc3821..4e93fe6f12 100644
--- a/editor/import/editor_import_collada.cpp
+++ b/editor/import/editor_import_collada.cpp
@@ -67,7 +67,7 @@ struct ColladaImport {
 
 	Map<String, NodeMap> node_map; //map from collada node to engine node
 	Map<String, String> node_name_map; //map from collada node to engine node
-	Map<String, Ref<ArrayMesh>> mesh_cache;
+	Map<String, Ref<EditorSceneImporterMesh>> mesh_cache;
 	Map<String, Ref<Curve3D>> curve_cache;
 	Map<String, Ref<Material>> material_cache;
 	Map<Collada::Node *, Skeleton3D *> skeleton_map;
@@ -83,7 +83,7 @@ struct ColladaImport {
 	Error _create_scene(Collada::Node *p_node, Node3D *p_parent);
 	Error _create_resources(Collada::Node *p_node, bool p_use_compression);
 	Error _create_material(const String &p_target);
-	Error _create_mesh_surfaces(bool p_optimize, Ref<ArrayMesh> &p_mesh, const Map<String, Collada::NodeGeometry::Material> &p_material_map, const Collada::MeshData &meshdata, const Transform &p_local_xform, const Vector<int> &bone_remap, const Collada::SkinControllerData *p_skin_controller, const Collada::MorphControllerData *p_morph_data, Vector<Ref<ArrayMesh>> p_morph_meshes = Vector<Ref<ArrayMesh>>(), bool p_use_compression = false, bool p_use_mesh_material = false);
+	Error _create_mesh_surfaces(bool p_optimize, Ref<EditorSceneImporterMesh> &p_mesh, const Map<String, Collada::NodeGeometry::Material> &p_material_map, const Collada::MeshData &meshdata, const Transform &p_local_xform, const Vector<int> &bone_remap, const Collada::SkinControllerData *p_skin_controller, const Collada::MorphControllerData *p_morph_data, Vector<Ref<EditorSceneImporterMesh>> p_morph_meshes = Vector<Ref<EditorSceneImporterMesh>>(), bool p_use_compression = false, bool p_use_mesh_material = false);
 	Error load(const String &p_path, int p_flags, bool p_force_make_tangents = false, bool p_use_compression = false);
 	void _fix_param_animation_tracks();
 	void create_animation(int p_clip, bool p_make_tracks_in_all_bones, bool p_import_value_tracks);
@@ -278,8 +278,8 @@ Error ColladaImport::_create_scene(Collada::Node *p_node, Node3D *p_parent) {
 				node = memnew(Path3D);
 			} else {
 				//mesh since nothing else
-				node = memnew(MeshInstance3D);
-				//Object::cast_to<MeshInstance3D>(node)->set_flag(GeometryInstance3D::FLAG_USE_BAKED_LIGHT, true);
+				node = memnew(EditorSceneImporterMeshNode);
+				//Object::cast_to<EditorSceneImporterMeshNode>(node)->set_flag(GeometryInstance3D::FLAG_USE_BAKED_LIGHT, true);
 			}
 		} break;
 		case Collada::Node::TYPE_SKELETON: {
@@ -440,7 +440,7 @@ Error ColladaImport::_create_material(const String &p_target) {
 	return OK;
 }
 
-Error ColladaImport::_create_mesh_surfaces(bool p_optimize, Ref<ArrayMesh> &p_mesh, const Map<String, Collada::NodeGeometry::Material> &p_material_map, const Collada::MeshData &meshdata, const Transform &p_local_xform, const Vector<int> &bone_remap, const Collada::SkinControllerData *p_skin_controller, const Collada::MorphControllerData *p_morph_data, Vector<Ref<ArrayMesh>> p_morph_meshes, bool p_use_compression, bool p_use_mesh_material) {
+Error ColladaImport::_create_mesh_surfaces(bool p_optimize, Ref<EditorSceneImporterMesh> &p_mesh, const Map<String, Collada::NodeGeometry::Material> &p_material_map, const Collada::MeshData &meshdata, const Transform &p_local_xform, const Vector<int> &bone_remap, const Collada::SkinControllerData *p_skin_controller, const Collada::MorphControllerData *p_morph_data, Vector<Ref<EditorSceneImporterMesh>> p_morph_meshes, bool p_use_compression, bool p_use_mesh_material) {
 	bool local_xform_mirror = p_local_xform.basis.determinant() < 0;
 
 	if (p_morph_data) {
@@ -457,9 +457,9 @@ Error ColladaImport::_create_mesh_surfaces(bool p_optimize, Ref<ArrayMesh> &p_me
 			p_mesh->add_blend_shape(name);
 		}
 		if (p_morph_data->mode == "RELATIVE") {
-			p_mesh->set_blend_shape_mode(ArrayMesh::BLEND_SHAPE_MODE_RELATIVE);
+			p_mesh->set_blend_shape_mode(Mesh::BLEND_SHAPE_MODE_RELATIVE);
 		} else if (p_morph_data->mode == "NORMALIZED") {
-			p_mesh->set_blend_shape_mode(ArrayMesh::BLEND_SHAPE_MODE_NORMALIZED);
+			p_mesh->set_blend_shape_mode(Mesh::BLEND_SHAPE_MODE_NORMALIZED);
 		}
 	}
 
@@ -897,7 +897,7 @@ Error ColladaImport::_create_mesh_surfaces(bool p_optimize, Ref<ArrayMesh> &p_me
 			////////////////////////////
 
 			for (int mi = 0; mi < p_morph_meshes.size(); mi++) {
-				Array a = p_morph_meshes[mi]->surface_get_arrays(surface);
+				Array a = p_morph_meshes[mi]->get_surface_arrays(surface);
 				//add valid weight and bone arrays if they exist, TODO check if they are unique to shape (generally not)
 
 				if (has_weights) {
@@ -910,14 +910,15 @@ Error ColladaImport::_create_mesh_surfaces(bool p_optimize, Ref<ArrayMesh> &p_me
 				mr.push_back(a);
 			}
 
-			p_mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, d, mr, Dictionary(), 0);
-
+			String surface_name;
+			Ref<Material> mat;
 			if (material.is_valid()) {
 				if (p_use_mesh_material) {
-					p_mesh->surface_set_material(surface, material);
+					mat = material;
 				}
-				p_mesh->surface_set_name(surface, material->get_name());
+				surface_name = material->get_name();
 			}
+			p_mesh->add_surface(Mesh::PRIMITIVE_TRIANGLES, d, mr, Dictionary(), mat, surface_name);
 		}
 
 		/*****************/
@@ -1002,10 +1003,10 @@ Error ColladaImport::_create_resources(Collada::Node *p_node, bool p_use_compres
 			}
 		}
 
-		if (Object::cast_to<MeshInstance3D>(node)) {
+		if (Object::cast_to<EditorSceneImporterMeshNode>(node)) {
 			Collada::NodeGeometry *ng2 = static_cast<Collada::NodeGeometry *>(p_node);
 
-			MeshInstance3D *mi = Object::cast_to<MeshInstance3D>(node);
+			EditorSceneImporterMeshNode *mi = Object::cast_to<EditorSceneImporterMeshNode>(node);
 
 			ERR_FAIL_COND_V(!mi, ERR_BUG);
 
@@ -1014,7 +1015,7 @@ Error ColladaImport::_create_resources(Collada::Node *p_node, bool p_use_compres
 			String meshid;
 			Transform apply_xform;
 			Vector<int> bone_remap;
-			Vector<Ref<ArrayMesh>> morphs;
+			Vector<Ref<EditorSceneImporterMesh>> morphs;
 
 			if (ng2->controller) {
 				String ngsource = ng2->source;
@@ -1083,10 +1084,10 @@ Error ColladaImport::_create_resources(Collada::Node *p_node, bool p_use_compres
 							for (int i = 0; i < names.size(); i++) {
 								String meshid2 = names[i];
 								if (collada.state.mesh_data_map.has(meshid2)) {
-									Ref<ArrayMesh> mesh = Ref<ArrayMesh>(memnew(ArrayMesh));
+									Ref<EditorSceneImporterMesh> mesh = Ref<EditorSceneImporterMesh>(memnew(EditorSceneImporterMesh));
 									const Collada::MeshData &meshdata = collada.state.mesh_data_map[meshid2];
 									mesh->set_name(meshdata.name);
-									Error err = _create_mesh_surfaces(false, mesh, ng2->material_map, meshdata, apply_xform, bone_remap, skin, nullptr, Vector<Ref<ArrayMesh>>(), false);
+									Error err = _create_mesh_surfaces(false, mesh, ng2->material_map, meshdata, apply_xform, bone_remap, skin, nullptr, Vector<Ref<EditorSceneImporterMesh>>(), false);
 									ERR_FAIL_COND_V(err, err);
 
 									morphs.push_back(mesh);
@@ -1109,7 +1110,7 @@ Error ColladaImport::_create_resources(Collada::Node *p_node, bool p_use_compres
 				meshid = ng2->source;
 			}
 
-			Ref<ArrayMesh> mesh;
+			Ref<EditorSceneImporterMesh> mesh;
 			if (mesh_cache.has(meshid)) {
 				mesh = mesh_cache[meshid];
 			} else {
@@ -1117,7 +1118,7 @@ Error ColladaImport::_create_resources(Collada::Node *p_node, bool p_use_compres
 					//bleh, must ignore invalid
 
 					ERR_FAIL_COND_V(!collada.state.mesh_data_map.has(meshid), ERR_INVALID_DATA);
-					mesh = Ref<ArrayMesh>(memnew(ArrayMesh));
+					mesh = Ref<EditorSceneImporterMesh>(memnew(EditorSceneImporterMesh));
 					const Collada::MeshData &meshdata = collada.state.mesh_data_map[meshid];
 					mesh->set_name(meshdata.name);
 					Error err = _create_mesh_surfaces(morphs.size() == 0, mesh, ng2->material_map, meshdata, apply_xform, bone_remap, skin, morph, morphs, p_use_compression, use_mesh_builtin_materials);
diff --git a/editor/import/editor_scene_importer_gltf.cpp b/editor/import/editor_scene_importer_gltf.cpp
index ac76f67ef9..1059692ca0 100644
--- a/editor/import/editor_scene_importer_gltf.cpp
+++ b/editor/import/editor_scene_importer_gltf.cpp
@@ -970,8 +970,6 @@ Error EditorSceneImporterGLTF::_parse_meshes(GLTFState &state) {
 		return OK;
 	}
 
-	uint32_t mesh_flags = 0;
-
 	Array meshes = state.json["meshes"];
 	for (GLTFMeshIndex i = 0; i < meshes.size(); i++) {
 		print_verbose("glTF: Parsing mesh: " + itos(i));
@@ -979,6 +977,7 @@ Error EditorSceneImporterGLTF::_parse_meshes(GLTFState &state) {
 
 		GLTFMesh mesh;
 		mesh.mesh.instance();
+		bool has_vertex_color = false;
 
 		ERR_FAIL_COND_V(!d.has("primitives"), ERR_PARSE_ERROR);
 
@@ -1034,6 +1033,7 @@ Error EditorSceneImporterGLTF::_parse_meshes(GLTFState &state) {
 			}
 			if (a.has("COLOR_0")) {
 				array[Mesh::ARRAY_COLOR] = _decode_accessor_as_color(state, a["COLOR_0"], true);
+				has_vertex_color = true;
 			}
 			if (a.has("JOINTS_0")) {
 				array[Mesh::ARRAY_BONES] = _decode_accessor_as_ints(state, a["JOINTS_0"], true);
@@ -1112,7 +1112,7 @@ Error EditorSceneImporterGLTF::_parse_meshes(GLTFState &state) {
 
 				//ideally BLEND_SHAPE_MODE_RELATIVE since gltf2 stores in displacement
 				//but it could require a larger refactor?
-				mesh.mesh->set_blend_shape_mode(ArrayMesh::BLEND_SHAPE_MODE_NORMALIZED);
+				mesh.mesh->set_blend_shape_mode(Mesh::BLEND_SHAPE_MODE_NORMALIZED);
 
 				if (j == 0) {
 					const Array &target_names = extras.has("targetNames") ? (Array)extras["targetNames"] : Array();
@@ -1226,21 +1226,25 @@ Error EditorSceneImporterGLTF::_parse_meshes(GLTFState &state) {
 			}
 
 			//just add it
-			mesh.mesh->add_surface_from_arrays(primitive, array, morphs, Dictionary(), mesh_flags);
 
+			Ref<Material> mat;
 			if (p.has("material")) {
 				const int material = p["material"];
 				ERR_FAIL_INDEX_V(material, state.materials.size(), ERR_FILE_CORRUPT);
-				const Ref<Material> &mat = state.materials[material];
+				Ref<StandardMaterial3D> mat3d = state.materials[material];
+				if (has_vertex_color) {
+					mat3d->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
+				}
+				mat = mat3d;
 
-				mesh.mesh->surface_set_material(mesh.mesh->get_surface_count() - 1, mat);
-			} else {
-				Ref<StandardMaterial3D> mat;
-				mat.instance();
-				mat->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
-
-				mesh.mesh->surface_set_material(mesh.mesh->get_surface_count() - 1, mat);
+			} else if (has_vertex_color) {
+				Ref<StandardMaterial3D> mat3d;
+				mat3d.instance();
+				mat3d->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
+				mat = mat3d;
 			}
+
+			mesh.mesh->add_surface(primitive, array, morphs, Dictionary(), mat);
 		}
 
 		mesh.blend_weights.resize(mesh.mesh->get_blend_shape_count());
@@ -1440,7 +1444,8 @@ Error EditorSceneImporterGLTF::_parse_materials(GLTFState &state) {
 		if (d.has("name")) {
 			material->set_name(d["name"]);
 		}
-		material->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
+		//don't do this here only if vertex color exists
+		//material->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
 
 		if (d.has("pbrMetallicRoughness")) {
 			const Dictionary &mr = d["pbrMetallicRoughness"];
@@ -2586,12 +2591,12 @@ BoneAttachment3D *EditorSceneImporterGLTF::_generate_bone_attachment(GLTFState &
 	return bone_attachment;
 }
 
-MeshInstance3D *EditorSceneImporterGLTF::_generate_mesh_instance(GLTFState &state, Node *scene_parent, const GLTFNodeIndex node_index) {
+EditorSceneImporterMeshNode *EditorSceneImporterGLTF::_generate_mesh_instance(GLTFState &state, Node *scene_parent, const GLTFNodeIndex node_index) {
 	const GLTFNode *gltf_node = state.nodes[node_index];
 
 	ERR_FAIL_INDEX_V(gltf_node->mesh, state.meshes.size(), nullptr);
 
-	MeshInstance3D *mi = memnew(MeshInstance3D);
+	EditorSceneImporterMeshNode *mi = memnew(EditorSceneImporterMeshNode);
 	print_verbose("glTF: Creating mesh for: " + gltf_node->name);
 
 	GLTFMesh &mesh = state.meshes.write[gltf_node->mesh];
@@ -3058,7 +3063,7 @@ void EditorSceneImporterGLTF::_process_mesh_instances(GLTFState &state, Node3D *
 			const GLTFSkinIndex skin_i = node->skin;
 
 			Map<GLTFNodeIndex, Node *>::Element *mi_element = state.scene_nodes.find(node_i);
-			MeshInstance3D *mi = Object::cast_to<MeshInstance3D>(mi_element->get());
+			EditorSceneImporterMeshNode *mi = Object::cast_to<EditorSceneImporterMeshNode>(mi_element->get());
 			ERR_FAIL_COND(mi == nullptr);
 
 			const GLTFSkeletonIndex skel_i = state.skins[node->skin].skeleton;
diff --git a/editor/import/editor_scene_importer_gltf.h b/editor/import/editor_scene_importer_gltf.h
index 5ea8bf17b8..6390f46524 100644
--- a/editor/import/editor_scene_importer_gltf.h
+++ b/editor/import/editor_scene_importer_gltf.h
@@ -38,7 +38,7 @@
 
 class AnimationPlayer;
 class BoneAttachment3D;
-class MeshInstance3D;
+class EditorSceneImporterMeshNode;
 
 class EditorSceneImporterGLTF : public EditorSceneImporter {
 	GDCLASS(EditorSceneImporterGLTF, EditorSceneImporter);
@@ -199,7 +199,7 @@ class EditorSceneImporterGLTF : public EditorSceneImporter {
 	};
 
 	struct GLTFMesh {
-		Ref<ArrayMesh> mesh;
+		Ref<EditorSceneImporterMesh> mesh;
 		Vector<float> blend_weights;
 	};
 
@@ -262,7 +262,7 @@ class EditorSceneImporterGLTF : public EditorSceneImporter {
 		Vector<GLTFAccessor> accessors;
 
 		Vector<GLTFMesh> meshes; //meshes are loaded directly, no reason not to.
-		Vector<Ref<Material>> materials;
+		Vector<Ref<StandardMaterial3D>> materials;
 
 		String scene_name;
 		Vector<int> root_nodes;
@@ -355,7 +355,7 @@ class EditorSceneImporterGLTF : public EditorSceneImporter {
 	Error _parse_animations(GLTFState &state);
 
 	BoneAttachment3D *_generate_bone_attachment(GLTFState &state, Skeleton3D *skeleton, const GLTFNodeIndex node_index);
-	MeshInstance3D *_generate_mesh_instance(GLTFState &state, Node *scene_parent, const GLTFNodeIndex node_index);
+	EditorSceneImporterMeshNode *_generate_mesh_instance(GLTFState &state, Node *scene_parent, const GLTFNodeIndex node_index);
 	Camera3D *_generate_camera(GLTFState &state, Node *scene_parent, const GLTFNodeIndex node_index);
 	Light3D *_generate_light(GLTFState &state, Node *scene_parent, const GLTFNodeIndex node_index);
 	Node3D *_generate_spatial(GLTFState &state, Node *scene_parent, const GLTFNodeIndex node_index);
diff --git a/editor/import/resource_importer_obj.cpp b/editor/import/resource_importer_obj.cpp
index d4560a2984..30c7b2920a 100644
--- a/editor/import/resource_importer_obj.cpp
+++ b/editor/import/resource_importer_obj.cpp
@@ -225,6 +225,8 @@ static Error _parse_obj(const String &p_path, List<Ref<Mesh>> &r_meshes, bool p_
 	String current_material_library;
 	String current_material;
 	String current_group;
+	uint32_t smooth_group = 0;
+	bool smoothing = true;
 
 	while (true) {
 		String l = f->get_line().strip_edges();
@@ -315,6 +317,10 @@ static Error _parse_obj(const String &p_path, List<Ref<Mesh>> &r_meshes, bool p_
 					Vector3 vertex = vertices[vtx];
 					//if (weld_vertices)
 					//	vertex.snap(Vector3(weld_tolerance, weld_tolerance, weld_tolerance));
+					if (!smoothing) {
+						smooth_group++;
+					}
+					surf_tool->set_smooth_group(smooth_group);
 					surf_tool->add_vertex(vertex);
 				}
 
@@ -322,10 +328,15 @@ static Error _parse_obj(const String &p_path, List<Ref<Mesh>> &r_meshes, bool p_
 			}
 		} else if (l.begins_with("s ")) { //smoothing
 			String what = l.substr(2, l.length()).strip_edges();
+			bool do_smooth;
 			if (what == "off") {
-				surf_tool->add_smooth_group(false);
+				do_smooth = false;
 			} else {
-				surf_tool->add_smooth_group(true);
+				do_smooth = true;
+			}
+			if (do_smooth != smoothing) {
+				smooth_group++;
+				smoothing = do_smooth;
 			}
 		} else if (/*l.begins_with("g ") ||*/ l.begins_with("usemtl ") || (l.begins_with("o ") || f->eof_reached())) { //commit group to mesh
 			//groups are too annoying
@@ -426,8 +437,15 @@ Node *EditorOBJImporter::import_scene(const String &p_path, uint32_t p_flags, in
 	Node3D *scene = memnew(Node3D);
 
 	for (List<Ref<Mesh>>::Element *E = meshes.front(); E; E = E->next()) {
-		MeshInstance3D *mi = memnew(MeshInstance3D);
-		mi->set_mesh(E->get());
+		Ref<EditorSceneImporterMesh> mesh;
+		mesh.instance();
+		Ref<Mesh> m = E->get();
+		for (int i = 0; i < m->get_surface_count(); i++) {
+			mesh->add_surface(m->surface_get_primitive_type(i), m->surface_get_arrays(i), Array(), Dictionary(), m->surface_get_material(i));
+		}
+
+		EditorSceneImporterMeshNode *mi = memnew(EditorSceneImporterMeshNode);
+		mi->set_mesh(mesh);
 		mi->set_name(E->get()->get_name());
 		scene->add_child(mi);
 		mi->set_owner(scene);
diff --git a/editor/import/resource_importer_scene.cpp b/editor/import/resource_importer_scene.cpp
index fc4f673ec4..5abae339df 100644
--- a/editor/import/resource_importer_scene.cpp
+++ b/editor/import/resource_importer_scene.cpp
@@ -119,6 +119,304 @@ void EditorSceneImporter::_bind_methods() {
 	BIND_CONSTANT(IMPORT_USE_COMPRESSION);
 }
 
+////////////////////////////////////////////////
+
+void EditorSceneImporterMesh::add_blend_shape(const String &p_name) {
+	ERR_FAIL_COND(surfaces.size() > 0);
+	blend_shapes.push_back(p_name);
+}
+
+int EditorSceneImporterMesh::get_blend_shape_count() const {
+	return blend_shapes.size();
+}
+
+String EditorSceneImporterMesh::get_blend_shape_name(int p_blend_shape) const {
+	ERR_FAIL_INDEX_V(p_blend_shape, blend_shapes.size(), String());
+	return blend_shapes[p_blend_shape];
+}
+
+void EditorSceneImporterMesh::set_blend_shape_mode(Mesh::BlendShapeMode p_blend_shape_mode) {
+	blend_shape_mode = p_blend_shape_mode;
+}
+Mesh::BlendShapeMode EditorSceneImporterMesh::get_blend_shape_mode() const {
+	return blend_shape_mode;
+}
+
+void EditorSceneImporterMesh::add_surface(Mesh::PrimitiveType p_primitive, const Array &p_arrays, const Array &p_blend_shapes, const Dictionary &p_lods, const Ref<Material> &p_material, const String &p_name) {
+	ERR_FAIL_COND(p_blend_shapes.size() != blend_shapes.size());
+	ERR_FAIL_COND(p_arrays.size() != Mesh::ARRAY_MAX);
+	Surface s;
+	s.primitive = p_primitive;
+	s.arrays = p_arrays;
+	s.name = p_name;
+
+	for (int i = 0; i < blend_shapes.size(); i++) {
+		Array bsdata = p_blend_shapes[i];
+		ERR_FAIL_COND(bsdata.size() != Mesh::ARRAY_MAX);
+		Surface::BlendShape bs;
+		bs.arrays = bsdata;
+		s.blend_shape_data.push_back(bs);
+	}
+
+	List<Variant> lods;
+	p_lods.get_key_list(&lods);
+	for (List<Variant>::Element *E = lods.front(); E; E = E->next()) {
+		ERR_CONTINUE(!E->get().is_num());
+		Surface::LOD lod;
+		lod.distance = E->get();
+		lod.indices = p_lods[E->get()];
+		ERR_CONTINUE(lod.indices.size() == 0);
+		s.lods.push_back(lod);
+	}
+
+	s.material = p_material;
+
+	surfaces.push_back(s);
+	mesh.unref();
+}
+int EditorSceneImporterMesh::get_surface_count() const {
+	return surfaces.size();
+}
+
+Mesh::PrimitiveType EditorSceneImporterMesh::get_surface_primitive_type(int p_surface) {
+	ERR_FAIL_INDEX_V(p_surface, surfaces.size(), Mesh::PRIMITIVE_MAX);
+	return surfaces[p_surface].primitive;
+}
+Array EditorSceneImporterMesh::get_surface_arrays(int p_surface) const {
+	ERR_FAIL_INDEX_V(p_surface, surfaces.size(), Array());
+	return surfaces[p_surface].arrays;
+}
+String EditorSceneImporterMesh::get_surface_name(int p_surface) const {
+	ERR_FAIL_INDEX_V(p_surface, surfaces.size(), String());
+	return surfaces[p_surface].name;
+}
+Array EditorSceneImporterMesh::get_surface_blend_shape_arrays(int p_surface, int p_blend_shape) const {
+	ERR_FAIL_INDEX_V(p_surface, surfaces.size(), Array());
+	ERR_FAIL_INDEX_V(p_blend_shape, surfaces[p_surface].blend_shape_data.size(), Array());
+	return surfaces[p_surface].blend_shape_data[p_blend_shape].arrays;
+}
+int EditorSceneImporterMesh::get_surface_lod_count(int p_surface) const {
+	ERR_FAIL_INDEX_V(p_surface, surfaces.size(), 0);
+	return surfaces[p_surface].lods.size();
+}
+Vector<int> EditorSceneImporterMesh::get_surface_lod_indices(int p_surface, int p_lod) const {
+	ERR_FAIL_INDEX_V(p_surface, surfaces.size(), Vector<int>());
+	ERR_FAIL_INDEX_V(p_lod, surfaces[p_surface].lods.size(), Vector<int>());
+
+	return surfaces[p_surface].lods[p_lod].indices;
+}
+
+float EditorSceneImporterMesh::get_surface_lod_size(int p_surface, int p_lod) const {
+	ERR_FAIL_INDEX_V(p_surface, surfaces.size(), 0);
+	ERR_FAIL_INDEX_V(p_lod, surfaces[p_surface].lods.size(), 0);
+	return surfaces[p_surface].lods[p_lod].distance;
+}
+
+Ref<Material> EditorSceneImporterMesh::get_surface_material(int p_surface) const {
+	ERR_FAIL_INDEX_V(p_surface, surfaces.size(), Ref<Material>());
+	return surfaces[p_surface].material;
+}
+
+bool EditorSceneImporterMesh::has_mesh() const {
+	return mesh.is_valid();
+}
+
+Ref<ArrayMesh> EditorSceneImporterMesh::get_mesh() {
+	ERR_FAIL_COND_V(surfaces.size() == 0, Ref<ArrayMesh>());
+
+	if (mesh.is_null()) {
+		mesh.instance();
+		for (int i = 0; i < blend_shapes.size(); i++) {
+			mesh->add_blend_shape(blend_shapes[i]);
+		}
+		mesh->set_blend_shape_mode(blend_shape_mode);
+		for (int i = 0; i < surfaces.size(); i++) {
+			Array bs_data;
+			if (surfaces[i].blend_shape_data.size()) {
+				for (int j = 0; j < surfaces[i].blend_shape_data.size(); j++) {
+					bs_data.push_back(surfaces[i].blend_shape_data[j].arrays);
+				}
+			}
+			Dictionary lods;
+			if (surfaces[i].lods.size()) {
+				for (int j = 0; j < surfaces[i].lods.size(); j++) {
+					lods[surfaces[i].lods[j].distance] = surfaces[i].lods[j].indices;
+				}
+			}
+
+			mesh->add_surface_from_arrays(surfaces[i].primitive, surfaces[i].arrays, bs_data, lods);
+			if (surfaces[i].material.is_valid()) {
+				mesh->surface_set_material(mesh->get_surface_count() - 1, surfaces[i].material);
+			}
+			if (surfaces[i].name != String()) {
+				mesh->surface_set_name(mesh->get_surface_count() - 1, surfaces[i].name);
+			}
+		}
+	}
+
+	return mesh;
+}
+
+void EditorSceneImporterMesh::clear() {
+	surfaces.clear();
+	blend_shapes.clear();
+	mesh.unref();
+}
+
+void EditorSceneImporterMesh::_set_data(const Dictionary &p_data) {
+	clear();
+	if (p_data.has("blend_shape_names")) {
+		blend_shapes = p_data["blend_shape_names"];
+	}
+	if (p_data.has("surfaces")) {
+		Array surface_arr = p_data["surfaces"];
+		for (int i = 0; i < surface_arr.size(); i++) {
+			Dictionary s = surface_arr[i];
+			ERR_CONTINUE(!s.has("primitive"));
+			ERR_CONTINUE(!s.has("arrays"));
+			Mesh::PrimitiveType prim = Mesh::PrimitiveType(int(s["primitive"]));
+			ERR_CONTINUE(prim >= Mesh::PRIMITIVE_MAX);
+			Array arr = s["arrays"];
+			Dictionary lods;
+			String name;
+			if (s.has("name")) {
+				name = s["name"];
+			}
+			if (s.has("lods")) {
+				lods = s["lods"];
+			}
+			Array blend_shapes;
+			if (s.has("blend_shapes")) {
+				blend_shapes = s["blend_shapes"];
+			}
+			Ref<Material> material;
+			if (s.has("material")) {
+				material = s["material"];
+			}
+			add_surface(prim, arr, blend_shapes, lods, material, name);
+		}
+	}
+}
+Dictionary EditorSceneImporterMesh::_get_data() const {
+	Dictionary data;
+	if (blend_shapes.size()) {
+		data["blend_shape_names"] = blend_shapes;
+	}
+	Array surface_arr;
+	for (int i = 0; i < surfaces.size(); i++) {
+		Dictionary d;
+		d["primitive"] = surfaces[i].primitive;
+		d["arrays"] = surfaces[i].arrays;
+		if (surfaces[i].blend_shape_data.size()) {
+			Array bs_data;
+			for (int j = 0; j < surfaces[i].blend_shape_data.size(); j++) {
+				bs_data.push_back(surfaces[i].blend_shape_data[j].arrays);
+			}
+			d["blend_shapes"] = bs_data;
+		}
+		if (surfaces[i].lods.size()) {
+			Dictionary lods;
+			for (int j = 0; j < surfaces[i].lods.size(); j++) {
+				lods[surfaces[i].lods[j].distance] = surfaces[i].lods[j].indices;
+			}
+			d["lods"] = lods;
+		}
+
+		if (surfaces[i].material.is_valid()) {
+			d["material"] = surfaces[i].material;
+		}
+
+		if (surfaces[i].name != String()) {
+			d["name"] = surfaces[i].name;
+		}
+
+		surface_arr.push_back(d);
+	}
+	data["surfaces"] = surface_arr;
+	return data;
+}
+
+void EditorSceneImporterMesh::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("add_blend_shape", "name"), &EditorSceneImporterMesh::add_blend_shape);
+	ClassDB::bind_method(D_METHOD("get_blend_shape_count"), &EditorSceneImporterMesh::get_blend_shape_count);
+	ClassDB::bind_method(D_METHOD("get_blend_shape_name", "blend_shape_idx"), &EditorSceneImporterMesh::get_blend_shape_name);
+
+	ClassDB::bind_method(D_METHOD("set_blend_shape_mode", "mode"), &EditorSceneImporterMesh::set_blend_shape_mode);
+	ClassDB::bind_method(D_METHOD("get_blend_shape_mode"), &EditorSceneImporterMesh::get_blend_shape_mode);
+
+	ClassDB::bind_method(D_METHOD("add_surface", "primitive", "arrays", "blend_shapes", "lods", "material"), &EditorSceneImporterMesh::add_surface, DEFVAL(Array()), DEFVAL(Dictionary()), DEFVAL(Ref<Material>()), DEFVAL(String()));
+
+	ClassDB::bind_method(D_METHOD("get_surface_count"), &EditorSceneImporterMesh::get_surface_count);
+	ClassDB::bind_method(D_METHOD("get_surface_primitive_type", "surface_idx"), &EditorSceneImporterMesh::get_surface_primitive_type);
+	ClassDB::bind_method(D_METHOD("get_surface_name", "surface_idx"), &EditorSceneImporterMesh::get_surface_name);
+	ClassDB::bind_method(D_METHOD("get_surface_arrays", "surface_idx"), &EditorSceneImporterMesh::get_surface_arrays);
+	ClassDB::bind_method(D_METHOD("get_surface_blend_shape_arrays", "surface_idx", "blend_shape_idx"), &EditorSceneImporterMesh::get_surface_blend_shape_arrays);
+	ClassDB::bind_method(D_METHOD("get_surface_lod_count", "surface_idx"), &EditorSceneImporterMesh::get_surface_lod_count);
+	ClassDB::bind_method(D_METHOD("get_surface_lod_size", "surface_idx", "lod_idx"), &EditorSceneImporterMesh::get_surface_lod_size);
+	ClassDB::bind_method(D_METHOD("get_surface_lod_indices", "surface_idx", "lod_idx"), &EditorSceneImporterMesh::get_surface_lod_indices);
+	ClassDB::bind_method(D_METHOD("get_surface_material", "surface_idx"), &EditorSceneImporterMesh::get_surface_material);
+
+	ClassDB::bind_method(D_METHOD("get_mesh"), &EditorSceneImporterMesh::get_mesh);
+	ClassDB::bind_method(D_METHOD("clear"), &EditorSceneImporterMesh::clear);
+
+	ClassDB::bind_method(D_METHOD("_set_data", "data"), &EditorSceneImporterMesh::_set_data);
+	ClassDB::bind_method(D_METHOD("_get_data"), &EditorSceneImporterMesh::_get_data);
+
+	ADD_PROPERTY(PropertyInfo(Variant::DICTIONARY, "_data", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "_set_data", "_get_data");
+}
+
+void EditorSceneImporterMeshNode::set_mesh(const Ref<EditorSceneImporterMesh> &p_mesh) {
+	mesh = p_mesh;
+}
+Ref<EditorSceneImporterMesh> EditorSceneImporterMeshNode::get_mesh() const {
+	return mesh;
+}
+
+void EditorSceneImporterMeshNode::set_skin(const Ref<Skin> &p_skin) {
+	skin = p_skin;
+}
+Ref<Skin> EditorSceneImporterMeshNode::get_skin() const {
+	return skin;
+}
+
+void EditorSceneImporterMeshNode::set_surface_material(int p_idx, const Ref<Material> &p_material) {
+	ERR_FAIL_COND(p_idx < 0);
+	if (p_idx >= surface_materials.size()) {
+		surface_materials.resize(p_idx + 1);
+	}
+
+	surface_materials.write[p_idx] = p_material;
+}
+Ref<Material> EditorSceneImporterMeshNode::get_surface_material(int p_idx) const {
+	ERR_FAIL_COND_V(p_idx < 0, Ref<Material>());
+	if (p_idx >= surface_materials.size()) {
+		return Ref<Material>();
+	}
+	return surface_materials[p_idx];
+}
+
+void EditorSceneImporterMeshNode::set_skeleton_path(const NodePath &p_path) {
+	skeleton_path = p_path;
+}
+NodePath EditorSceneImporterMeshNode::get_skeleton_path() const {
+	return skeleton_path;
+}
+
+void EditorSceneImporterMeshNode::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("set_mesh", "mesh"), &EditorSceneImporterMeshNode::set_mesh);
+	ClassDB::bind_method(D_METHOD("get_mesh"), &EditorSceneImporterMeshNode::get_mesh);
+
+	ClassDB::bind_method(D_METHOD("set_skin", "skin"), &EditorSceneImporterMeshNode::set_skin);
+	ClassDB::bind_method(D_METHOD("get_skin"), &EditorSceneImporterMeshNode::get_skin);
+
+	ClassDB::bind_method(D_METHOD("set_skeleton_path", "skeleton_path"), &EditorSceneImporterMeshNode::set_skeleton_path);
+	ClassDB::bind_method(D_METHOD("get_skeleton_path"), &EditorSceneImporterMeshNode::get_skeleton_path);
+
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "mesh", PROPERTY_HINT_RESOURCE_TYPE, "EditorSceneImporterMesh"), "set_mesh", "get_mesh");
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "skin", PROPERTY_HINT_RESOURCE_TYPE, "Skin"), "set_skin", "get_skin");
+	ADD_PROPERTY(PropertyInfo(Variant::NODE_PATH, "skeleton_path", PROPERTY_HINT_NODE_PATH_VALID_TYPES, "Skeleton"), "set_skeleton_path", "get_skeleton_path");
+}
+
 /////////////////////////////////
 void EditorScenePostImport::_bind_methods() {
 	BIND_VMETHOD(MethodInfo(Variant::OBJECT, "post_import", PropertyInfo(Variant::OBJECT, "scene")));
@@ -1219,6 +1517,34 @@ Ref<Animation> ResourceImporterScene::import_animation_from_other_importer(Edito
 	return importer->import_animation(p_path, p_flags, p_bake_fps);
 }
 
+void ResourceImporterScene::_generate_meshes(Node *p_node) {
+	EditorSceneImporterMeshNode *src_mesh = Object::cast_to<EditorSceneImporterMeshNode>(p_node);
+	if (src_mesh != nullptr) {
+		//is mesh
+		MeshInstance3D *mesh_node = memnew(MeshInstance3D);
+		mesh_node->set_transform(src_mesh->get_transform());
+		mesh_node->set_skin(src_mesh->get_skin());
+		mesh_node->set_skeleton_path(src_mesh->get_skeleton_path());
+
+		Ref<ArrayMesh> mesh;
+		if (!src_mesh->get_mesh()->has_mesh()) {
+			//do mesh processing
+		}
+		mesh = src_mesh->get_mesh()->get_mesh();
+		mesh_node->set_mesh(mesh);
+		for (int i = 0; i < mesh->get_surface_count(); i++) {
+			mesh_node->set_surface_material(i, src_mesh->get_surface_material(i));
+		}
+
+		p_node->replace_by(mesh_node);
+		memdelete(p_node);
+		p_node = mesh_node;
+	}
+
+	for (int i = 0; i < p_node->get_child_count(); i++) {
+		_generate_meshes(p_node->get_child(i));
+	}
+}
 Error ResourceImporterScene::import(const String &p_source_file, const String &p_save_path, const Map<StringName, Variant> &p_options, List<String> *r_platform_variants, List<String> *r_gen_files, Variant *r_metadata) {
 	const String &src_path = p_source_file;
 
@@ -1315,6 +1641,8 @@ Error ResourceImporterScene::import(const String &p_source_file, const String &p
 		scene->set_name(p_save_path.get_file().get_basename());
 	}
 
+	_generate_meshes(scene);
+
 	err = OK;
 
 	String animation_filter = String(p_options["animation/filter_script"]).strip_edges();
diff --git a/editor/import/resource_importer_scene.h b/editor/import/resource_importer_scene.h
index cd61ec01f2..758390b367 100644
--- a/editor/import/resource_importer_scene.h
+++ b/editor/import/resource_importer_scene.h
@@ -32,9 +32,11 @@
 #define RESOURCEIMPORTERSCENE_H
 
 #include "core/io/resource_importer.h"
+#include "scene/3d/node_3d.h"
 #include "scene/resources/animation.h"
 #include "scene/resources/mesh.h"
 #include "scene/resources/shape_3d.h"
+#include "scene/resources/skin.h"
 
 class Material;
 
@@ -88,6 +90,90 @@ public:
 	EditorScenePostImport();
 };
 
+// The following classes are used by importers instead of ArrayMesh and MeshInstance3D
+// so the data is not reginstered (hence, quality loss), importing happens faster and
+// its easier to modify before saving
+
+class EditorSceneImporterMesh : public Resource {
+	GDCLASS(EditorSceneImporterMesh, Resource)
+
+	struct Surface {
+		Mesh::PrimitiveType primitive;
+		Array arrays;
+		struct BlendShape {
+			Array arrays;
+		};
+		Vector<BlendShape> blend_shape_data;
+		struct LOD {
+			Vector<int> indices;
+			float distance;
+		};
+		Vector<LOD> lods;
+		Ref<Material> material;
+		String name;
+	};
+	Vector<Surface> surfaces;
+	Vector<String> blend_shapes;
+	Mesh::BlendShapeMode blend_shape_mode = Mesh::BLEND_SHAPE_MODE_NORMALIZED;
+
+	Ref<ArrayMesh> mesh;
+
+protected:
+	void _set_data(const Dictionary &p_data);
+	Dictionary _get_data() const;
+
+	static void _bind_methods();
+
+public:
+	void add_blend_shape(const String &p_name);
+	int get_blend_shape_count() const;
+	String get_blend_shape_name(int p_blend_shape) const;
+
+	void add_surface(Mesh::PrimitiveType p_primitive, const Array &p_arrays, const Array &p_blend_shapes = Array(), const Dictionary &p_lods = Dictionary(), const Ref<Material> &p_material = Ref<Material>(), const String &p_name = String());
+	int get_surface_count() const;
+
+	void set_blend_shape_mode(Mesh::BlendShapeMode p_blend_shape_mode);
+	Mesh::BlendShapeMode get_blend_shape_mode() const;
+
+	Mesh::PrimitiveType get_surface_primitive_type(int p_surface);
+	String get_surface_name(int p_surface) const;
+	Array get_surface_arrays(int p_surface) const;
+	Array get_surface_blend_shape_arrays(int p_surface, int p_blend_shape) const;
+	int get_surface_lod_count(int p_surface) const;
+	Vector<int> get_surface_lod_indices(int p_surface, int p_lod) const;
+	float get_surface_lod_size(int p_surface, int p_lod) const;
+	Ref<Material> get_surface_material(int p_surface) const;
+
+	bool has_mesh() const;
+	Ref<ArrayMesh> get_mesh();
+	void clear();
+};
+
+class EditorSceneImporterMeshNode : public Node3D {
+	GDCLASS(EditorSceneImporterMeshNode, Node3D)
+
+	Ref<EditorSceneImporterMesh> mesh;
+	Ref<Skin> skin;
+	NodePath skeleton_path;
+	Vector<Ref<Material>> surface_materials;
+
+protected:
+	static void _bind_methods();
+
+public:
+	void set_mesh(const Ref<EditorSceneImporterMesh> &p_mesh);
+	Ref<EditorSceneImporterMesh> get_mesh() const;
+
+	void set_skin(const Ref<Skin> &p_skin);
+	Ref<Skin> get_skin() const;
+
+	void set_surface_material(int p_idx, const Ref<Material> &p_material);
+	Ref<Material> get_surface_material(int p_idx) const;
+
+	void set_skeleton_path(const NodePath &p_path);
+	NodePath get_skeleton_path() const;
+};
+
 class ResourceImporterScene : public ResourceImporter {
 	GDCLASS(ResourceImporterScene, ResourceImporter);
 
@@ -119,6 +205,7 @@ class ResourceImporterScene : public ResourceImporter {
 	};
 
 	void _replace_owner(Node *p_node, Node *p_scene, Node *p_new_owner);
+	void _generate_meshes(Node *p_node);
 
 public:
 	static ResourceImporterScene *get_singleton() { return singleton; }
diff --git a/modules/meshoptimizer/SCsub b/modules/meshoptimizer/SCsub
new file mode 100644
index 0000000000..3b1a5f917e
--- /dev/null
+++ b/modules/meshoptimizer/SCsub
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+Import("env")
+Import("env_modules")
+
+env_meshoptimizer = env_modules.Clone()
+
+# Thirdparty source files
+thirdparty_dir = "#thirdparty/meshoptimizer/"
+thirdparty_sources = [
+    "allocator.cpp",
+    "clusterizer.cpp",
+    "indexcodec.cpp",
+    "indexgenerator.cpp",
+    "overdrawanalyzer.cpp",
+    "overdrawoptimizer.cpp",
+    "simplifier.cpp",
+    "spatialorder.cpp",
+    "stripifier.cpp",
+    "vcacheanalyzer.cpp",
+    "vcacheoptimizer.cpp",
+    "vertexcodec.cpp",
+    "vertexfilter.cpp",
+    "vfetchanalyzer.cpp",
+    "vfetchoptimizer.cpp",
+]
+thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
+
+
+env_thirdparty = env_meshoptimizer.Clone()
+env_thirdparty.disable_warnings()
+env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources)
+
+env_modules.add_source_files(env.modules_sources, ["register_types.cpp"])
diff --git a/modules/meshoptimizer/config.py b/modules/meshoptimizer/config.py
new file mode 100644
index 0000000000..82e4e43397
--- /dev/null
+++ b/modules/meshoptimizer/config.py
@@ -0,0 +1,7 @@
+def can_build(env, platform):
+    # Having this on release by default, it's small and a lot of users like to do procedural stuff
+    return True
+
+
+def configure(env):
+    pass
diff --git a/modules/meshoptimizer/register_types.cpp b/modules/meshoptimizer/register_types.cpp
new file mode 100644
index 0000000000..26c8c6ab72
--- /dev/null
+++ b/modules/meshoptimizer/register_types.cpp
@@ -0,0 +1,43 @@
+/*************************************************************************/
+/*  register_types.cpp                                                   */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2020 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2020 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "register_types.h"
+#include "scene/resources/surface_tool.h"
+#include "thirdparty/meshoptimizer/meshoptimizer.h"
+
+void register_meshoptimizer_types() {
+	SurfaceTool::optimize_vertex_cache_func = meshopt_optimizeVertexCache;
+	SurfaceTool::simplify_func = meshopt_simplify;
+}
+
+void unregister_meshoptimizer_types() {
+	SurfaceTool::optimize_vertex_cache_func = nullptr;
+	SurfaceTool::simplify_func = nullptr;
+}
diff --git a/modules/meshoptimizer/register_types.h b/modules/meshoptimizer/register_types.h
new file mode 100644
index 0000000000..42b3a76a85
--- /dev/null
+++ b/modules/meshoptimizer/register_types.h
@@ -0,0 +1,37 @@
+/*************************************************************************/
+/*  register_types.h                                                     */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2020 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2020 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef MESHOPTIMIZER_REGISTER_TYPES_H
+#define MESHOPTIMIZER_REGISTER_TYPES_H
+
+void register_meshoptimizer_types();
+void unregister_meshoptimizer_types();
+
+#endif // PVR_REGISTER_TYPES_H
diff --git a/scene/resources/mesh.cpp b/scene/resources/mesh.cpp
index c6815c8ecc..287ec55560 100644
--- a/scene/resources/mesh.cpp
+++ b/scene/resources/mesh.cpp
@@ -533,6 +533,9 @@ void Mesh::_bind_methods() {
 	BIND_ENUM_CONSTANT(ARRAY_FLAG_USE_2D_VERTICES);
 	BIND_ENUM_CONSTANT(ARRAY_FLAG_USE_DYNAMIC_UPDATE);
 	BIND_ENUM_CONSTANT(ARRAY_FLAG_USE_8_BONE_WEIGHTS);
+
+	BIND_ENUM_CONSTANT(BLEND_SHAPE_MODE_NORMALIZED);
+	BIND_ENUM_CONSTANT(BLEND_SHAPE_MODE_RELATIVE);
 }
 
 void Mesh::clear_cache() const {
@@ -1384,7 +1387,7 @@ bool (*array_mesh_lightmap_unwrap_callback)(float p_texel_size, const float *p_v
 
 struct ArrayMeshLightmapSurface {
 	Ref<Material> material;
-	Vector<SurfaceTool::Vertex> vertices;
+	LocalVector<SurfaceTool::Vertex> vertices;
 	Mesh::PrimitiveType primitive;
 	uint32_t format;
 };
@@ -1425,7 +1428,7 @@ Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cach
 
 		Array arrays = surface_get_arrays(i);
 		s.material = surface_get_material(i);
-		s.vertices = SurfaceTool::create_vertex_array_from_triangle_arrays(arrays);
+		SurfaceTool::create_vertex_array_from_triangle_arrays(arrays, s.vertices);
 
 		Vector<Vector3> rvertices = arrays[Mesh::ARRAY_VERTEX];
 		int vc = rvertices.size();
@@ -1612,9 +1615,6 @@ void ArrayMesh::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "_surfaces", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR | PROPERTY_USAGE_INTERNAL), "_set_surfaces", "_get_surfaces");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "blend_shape_mode", PROPERTY_HINT_ENUM, "Normalized,Relative"), "set_blend_shape_mode", "get_blend_shape_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::AABB, "custom_aabb", PROPERTY_HINT_NONE, ""), "set_custom_aabb", "get_custom_aabb");
-
-	BIND_ENUM_CONSTANT(BLEND_SHAPE_MODE_NORMALIZED);
-	BIND_ENUM_CONSTANT(BLEND_SHAPE_MODE_RELATIVE);
 }
 
 void ArrayMesh::reload_from_file() {
diff --git a/scene/resources/mesh.h b/scene/resources/mesh.h
index ae2139a0cf..586bb2f802 100644
--- a/scene/resources/mesh.h
+++ b/scene/resources/mesh.h
@@ -53,7 +53,10 @@ public:
 		NO_INDEX_ARRAY = RenderingServer::NO_INDEX_ARRAY,
 		ARRAY_WEIGHTS_SIZE = RenderingServer::ARRAY_WEIGHTS_SIZE
 	};
-
+	enum BlendShapeMode {
+		BLEND_SHAPE_MODE_NORMALIZED = RS::BLEND_SHAPE_MODE_NORMALIZED,
+		BLEND_SHAPE_MODE_RELATIVE = RS::BLEND_SHAPE_MODE_RELATIVE,
+	};
 	enum ArrayType {
 		ARRAY_VERTEX = RenderingServer::ARRAY_VERTEX,
 		ARRAY_NORMAL = RenderingServer::ARRAY_NORMAL,
@@ -171,12 +174,6 @@ class ArrayMesh : public Mesh {
 	Array _get_surfaces() const;
 	void _set_surfaces(const Array &p_data);
 
-public:
-	enum BlendShapeMode {
-		BLEND_SHAPE_MODE_NORMALIZED = RS::BLEND_SHAPE_MODE_NORMALIZED,
-		BLEND_SHAPE_MODE_RELATIVE = RS::BLEND_SHAPE_MODE_RELATIVE,
-	};
-
 private:
 	struct Surface {
 		uint32_t format;
@@ -268,6 +265,6 @@ VARIANT_ENUM_CAST(Mesh::ArrayType);
 VARIANT_ENUM_CAST(Mesh::ArrayFormat);
 VARIANT_ENUM_CAST(Mesh::ArrayCustomFormat);
 VARIANT_ENUM_CAST(Mesh::PrimitiveType);
-VARIANT_ENUM_CAST(ArrayMesh::BlendShapeMode);
+VARIANT_ENUM_CAST(Mesh::BlendShapeMode);
 
 #endif
diff --git a/scene/resources/surface_tool.cpp b/scene/resources/surface_tool.cpp
index 7899627048..129f8a48ce 100644
--- a/scene/resources/surface_tool.cpp
+++ b/scene/resources/surface_tool.cpp
@@ -33,6 +33,9 @@
 #define _VERTEX_SNAP 0.0001
 #define EQ_VERTEX_DIST 0.00001
 
+SurfaceTool::OptimizeVertexCacheFunc SurfaceTool::optimize_vertex_cache_func = nullptr;
+SurfaceTool::SimplifyFunc SurfaceTool::simplify_func = nullptr;
+
 bool SurfaceTool::Vertex::operator==(const Vertex &p_vertex) const {
 	if (vertex != p_vertex.vertex) {
 		return false;
@@ -80,6 +83,10 @@ bool SurfaceTool::Vertex::operator==(const Vertex &p_vertex) const {
 		}
 	}
 
+	if (smooth_group != p_vertex.smooth_group) {
+		return false;
+	}
+
 	return true;
 }
 
@@ -94,6 +101,7 @@ uint32_t SurfaceTool::VertexHasher::hash(const Vertex &p_vtx) {
 	h = hash_djb2_buffer((const uint8_t *)p_vtx.bones.ptr(), p_vtx.bones.size() * sizeof(int), h);
 	h = hash_djb2_buffer((const uint8_t *)p_vtx.weights.ptr(), p_vtx.weights.size() * sizeof(float), h);
 	h = hash_djb2_buffer((const uint8_t *)&p_vtx.custom[0], sizeof(Color) * RS::ARRAY_CUSTOM_COUNT, h);
+	h = hash_djb2_one_32(p_vtx.smooth_group, h);
 	return h;
 }
 
@@ -118,6 +126,8 @@ void SurfaceTool::add_vertex(const Vector3 &p_vertex) {
 	vtx.bones = last_bones;
 	vtx.tangent = last_tangent.normal;
 	vtx.binormal = last_normal.cross(last_tangent.normal).normalized() * last_tangent.d;
+	vtx.smooth_group = last_smooth_group;
+
 	for (int i = 0; i < RS::ARRAY_CUSTOM_COUNT; i++) {
 		vtx.custom[i] = last_custom[i];
 	}
@@ -252,13 +262,8 @@ void SurfaceTool::set_weights(const Vector<float> &p_weights) {
 	last_weights = p_weights;
 }
 
-void SurfaceTool::add_smooth_group(bool p_smooth) {
-	ERR_FAIL_COND(!begun);
-	if (index_array.size()) {
-		smooth_groups[index_array.size()] = p_smooth;
-	} else {
-		smooth_groups[vertex_array.size()] = p_smooth;
-	}
+void SurfaceTool::set_smooth_group(uint32_t p_group) {
+	last_smooth_group = p_group;
 }
 
 void SurfaceTool::add_triangle_fan(const Vector<Vector3> &p_vertices, const Vector<Vector2> &p_uvs, const Vector<Color> &p_colors, const Vector<Vector2> &p_uv2s, const Vector<Vector3> &p_normals, const Vector<Plane> &p_tangents) {
@@ -315,9 +320,8 @@ Array SurfaceTool::commit_to_arrays() {
 				array.resize(varr_len);
 				Vector3 *w = array.ptrw();
 
-				int idx = 0;
-				for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-					const Vertex &v = E->get();
+				for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+					const Vertex &v = vertex_array[idx];
 
 					switch (i) {
 						case Mesh::ARRAY_VERTEX: {
@@ -339,9 +343,8 @@ Array SurfaceTool::commit_to_arrays() {
 				array.resize(varr_len);
 				Vector2 *w = array.ptrw();
 
-				int idx = 0;
-				for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-					const Vertex &v = E->get();
+				for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+					const Vertex &v = vertex_array[idx];
 
 					switch (i) {
 						case Mesh::ARRAY_TEX_UV: {
@@ -360,9 +363,8 @@ Array SurfaceTool::commit_to_arrays() {
 				array.resize(varr_len * 4);
 				float *w = array.ptrw();
 
-				int idx = 0;
-				for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx += 4) {
-					const Vertex &v = E->get();
+				for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+					const Vertex &v = vertex_array[idx];
 
 					w[idx + 0] = v.tangent.x;
 					w[idx + 1] = v.tangent.y;
@@ -381,9 +383,9 @@ Array SurfaceTool::commit_to_arrays() {
 				array.resize(varr_len);
 				Color *w = array.ptrw();
 
-				int idx = 0;
-				for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-					const Vertex &v = E->get();
+				for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+					const Vertex &v = vertex_array[idx];
+
 					w[idx] = v.color;
 				}
 
@@ -400,9 +402,9 @@ Array SurfaceTool::commit_to_arrays() {
 						array.resize(varr_len * 4);
 						uint8_t *w = array.ptrw();
 
-						int idx = 0;
-						for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-							const Vertex &v = E->get();
+						for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+							const Vertex &v = vertex_array[idx];
+
 							const Color &c = v.custom[idx];
 							w[idx * 4 + 0] = CLAMP(int32_t(c.r * 255.0), 0, 255);
 							w[idx * 4 + 1] = CLAMP(int32_t(c.g * 255.0), 0, 255);
@@ -417,9 +419,9 @@ Array SurfaceTool::commit_to_arrays() {
 						array.resize(varr_len * 4);
 						uint8_t *w = array.ptrw();
 
-						int idx = 0;
-						for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-							const Vertex &v = E->get();
+						for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+							const Vertex &v = vertex_array[idx];
+
 							const Color &c = v.custom[idx];
 							w[idx * 4 + 0] = uint8_t(int8_t(CLAMP(int32_t(c.r * 127.0), -128, 127)));
 							w[idx * 4 + 1] = uint8_t(int8_t(CLAMP(int32_t(c.g * 127.0), -128, 127)));
@@ -434,9 +436,9 @@ Array SurfaceTool::commit_to_arrays() {
 						array.resize(varr_len * 4);
 						uint16_t *w = (uint16_t *)array.ptrw();
 
-						int idx = 0;
-						for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-							const Vertex &v = E->get();
+						for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+							const Vertex &v = vertex_array[idx];
+
 							const Color &c = v.custom[idx];
 							w[idx * 2 + 0] = Math::make_half_float(c.r);
 							w[idx * 2 + 1] = Math::make_half_float(c.g);
@@ -449,9 +451,9 @@ Array SurfaceTool::commit_to_arrays() {
 						array.resize(varr_len * 8);
 						uint16_t *w = (uint16_t *)array.ptrw();
 
-						int idx = 0;
-						for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-							const Vertex &v = E->get();
+						for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+							const Vertex &v = vertex_array[idx];
+
 							const Color &c = v.custom[idx];
 							w[idx * 4 + 0] = Math::make_half_float(c.r);
 							w[idx * 4 + 1] = Math::make_half_float(c.g);
@@ -466,9 +468,9 @@ Array SurfaceTool::commit_to_arrays() {
 						array.resize(varr_len);
 						float *w = (float *)array.ptrw();
 
-						int idx = 0;
-						for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-							const Vertex &v = E->get();
+						for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+							const Vertex &v = vertex_array[idx];
+
 							const Color &c = v.custom[idx];
 							w[idx] = c.r;
 						}
@@ -480,9 +482,9 @@ Array SurfaceTool::commit_to_arrays() {
 						array.resize(varr_len * 2);
 						float *w = (float *)array.ptrw();
 
-						int idx = 0;
-						for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-							const Vertex &v = E->get();
+						for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+							const Vertex &v = vertex_array[idx];
+
 							const Color &c = v.custom[idx];
 							w[idx * 2 + 0] = c.r;
 							w[idx * 2 + 1] = c.g;
@@ -495,9 +497,9 @@ Array SurfaceTool::commit_to_arrays() {
 						array.resize(varr_len * 3);
 						float *w = (float *)array.ptrw();
 
-						int idx = 0;
-						for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-							const Vertex &v = E->get();
+						for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+							const Vertex &v = vertex_array[idx];
+
 							const Color &c = v.custom[idx];
 							w[idx * 3 + 0] = c.r;
 							w[idx * 3 + 1] = c.g;
@@ -511,9 +513,9 @@ Array SurfaceTool::commit_to_arrays() {
 						array.resize(varr_len * 4);
 						float *w = (float *)array.ptrw();
 
-						int idx = 0;
-						for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx++) {
-							const Vertex &v = E->get();
+						for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+							const Vertex &v = vertex_array[idx];
+
 							const Color &c = v.custom[idx];
 							w[idx * 4 + 0] = c.r;
 							w[idx * 4 + 1] = c.g;
@@ -533,9 +535,8 @@ Array SurfaceTool::commit_to_arrays() {
 				array.resize(varr_len * count);
 				int *w = array.ptrw();
 
-				int idx = 0;
-				for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx += count) {
-					const Vertex &v = E->get();
+				for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+					const Vertex &v = vertex_array[idx];
 
 					ERR_CONTINUE(v.bones.size() != count);
 
@@ -554,9 +555,9 @@ Array SurfaceTool::commit_to_arrays() {
 				array.resize(varr_len * count);
 				float *w = array.ptrw();
 
-				int idx = 0;
-				for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next(), idx += count) {
-					const Vertex &v = E->get();
+				for (uint32_t idx = 0; idx < vertex_array.size(); idx++) {
+					const Vertex &v = vertex_array[idx];
+
 					ERR_CONTINUE(v.weights.size() != count);
 
 					for (int j = 0; j < count; j++) {
@@ -574,9 +575,8 @@ Array SurfaceTool::commit_to_arrays() {
 				array.resize(index_array.size());
 				int *w = array.ptrw();
 
-				int idx = 0;
-				for (List<int>::Element *E = index_array.front(); E; E = E->next(), idx++) {
-					w[idx] = E->get();
+				for (uint32_t idx = 0; idx < index_array.size(); idx++) {
+					w[idx] = index_array[idx];
 				}
 
 				a[i] = array;
@@ -623,15 +623,16 @@ void SurfaceTool::index() {
 	}
 
 	HashMap<Vertex, int, VertexHasher> indices;
-	List<Vertex> new_vertices;
+	LocalVector<Vertex> old_vertex_array = vertex_array;
+	vertex_array.clear();
 
-	for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next()) {
-		int *idxptr = indices.getptr(E->get());
+	for (uint32_t i = 0; i < old_vertex_array.size(); i++) {
+		int *idxptr = indices.getptr(old_vertex_array[i]);
 		int idx;
 		if (!idxptr) {
 			idx = indices.size();
-			new_vertices.push_back(E->get());
-			indices[E->get()] = idx;
+			vertex_array.push_back(old_vertex_array[i]);
+			indices[old_vertex_array[i]] = idx;
 		} else {
 			idx = *idxptr;
 		}
@@ -639,9 +640,6 @@ void SurfaceTool::index() {
 		index_array.push_back(idx);
 	}
 
-	vertex_array.clear();
-	vertex_array = new_vertices;
-
 	format |= Mesh::ARRAY_FORMAT_INDEX;
 }
 
@@ -649,29 +647,26 @@ void SurfaceTool::deindex() {
 	if (index_array.size() == 0) {
 		return; //nothing to deindex
 	}
-	Vector<Vertex> varr;
-	varr.resize(vertex_array.size());
-	int idx = 0;
-	for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next()) {
-		varr.write[idx++] = E->get();
-	}
+
+	LocalVector<Vertex> old_vertex_array = vertex_array;
 	vertex_array.clear();
-	for (List<int>::Element *E = index_array.front(); E; E = E->next()) {
-		ERR_FAIL_INDEX(E->get(), varr.size());
-		vertex_array.push_back(varr[E->get()]);
+	for (uint32_t i = 0; i < index_array.size(); i++) {
+		uint32_t index = index_array[i];
+		ERR_FAIL_COND(index >= old_vertex_array.size());
+		vertex_array.push_back(old_vertex_array[index]);
 	}
 	format &= ~Mesh::ARRAY_FORMAT_INDEX;
 	index_array.clear();
 }
 
-void SurfaceTool::_create_list(const Ref<Mesh> &p_existing, int p_surface, List<Vertex> *r_vertex, List<int> *r_index, uint32_t &lformat) {
+void SurfaceTool::_create_list(const Ref<Mesh> &p_existing, int p_surface, LocalVector<Vertex> *r_vertex, LocalVector<int> *r_index, uint32_t &lformat) {
 	Array arr = p_existing->surface_get_arrays(p_surface);
 	ERR_FAIL_COND(arr.size() != RS::ARRAY_MAX);
 	_create_list_from_arrays(arr, r_vertex, r_index, lformat);
 }
 
-Vector<SurfaceTool::Vertex> SurfaceTool::create_vertex_array_from_triangle_arrays(const Array &p_arrays, uint32_t *r_format) {
-	Vector<SurfaceTool::Vertex> ret;
+void SurfaceTool::create_vertex_array_from_triangle_arrays(const Array &p_arrays, LocalVector<SurfaceTool::Vertex> &ret, uint32_t *r_format) {
+	ret.clear();
 
 	Vector<Vector3> varr = p_arrays[RS::ARRAY_VERTEX];
 	Vector<Vector3> narr = p_arrays[RS::ARRAY_NORMAL];
@@ -688,7 +683,7 @@ Vector<SurfaceTool::Vertex> SurfaceTool::create_vertex_array_from_triangle_array
 		if (r_format) {
 			*r_format = 0;
 		}
-		return ret;
+		return;
 	}
 
 	int lformat = 0;
@@ -799,19 +794,14 @@ Vector<SurfaceTool::Vertex> SurfaceTool::create_vertex_array_from_triangle_array
 	if (r_format) {
 		*r_format = lformat;
 	}
-
-	return ret;
 }
 
-void SurfaceTool::_create_list_from_arrays(Array arr, List<Vertex> *r_vertex, List<int> *r_index, uint32_t &lformat) {
-	Vector<Vertex> arrays = create_vertex_array_from_triangle_arrays(arr, &lformat);
-	ERR_FAIL_COND(arrays.size() == 0);
-
-	for (int i = 0; i < arrays.size(); i++) {
-		r_vertex->push_back(arrays[i]);
-	}
+void SurfaceTool::_create_list_from_arrays(Array arr, LocalVector<Vertex> *r_vertex, LocalVector<int> *r_index, uint32_t &lformat) {
+	create_vertex_array_from_triangle_arrays(arr, *r_vertex, &lformat);
+	ERR_FAIL_COND(r_vertex->size() == 0);
 
 	//indices
+	r_index->clear();
 
 	Vector<int> idx = arr[RS::ARRAY_INDEX];
 	int is = idx.size();
@@ -864,14 +854,14 @@ void SurfaceTool::append_from(const Ref<Mesh> &p_existing, int p_surface, const
 	}
 
 	uint32_t nformat;
-	List<Vertex> nvertices;
-	List<int> nindices;
+	LocalVector<Vertex> nvertices;
+	LocalVector<int> nindices;
 	_create_list(p_existing, p_surface, &nvertices, &nindices, nformat);
 	format |= nformat;
 	int vfrom = vertex_array.size();
 
-	for (List<Vertex>::Element *E = nvertices.front(); E; E = E->next()) {
-		Vertex v = E->get();
+	for (uint32_t vi = 0; vi < nvertices.size(); vi++) {
+		Vertex v = nvertices[vi];
 		v.vertex = p_xform.xform(v.vertex);
 		if (nformat & RS::ARRAY_FORMAT_NORMAL) {
 			v.normal = p_xform.basis.xform(v.normal);
@@ -884,8 +874,8 @@ void SurfaceTool::append_from(const Ref<Mesh> &p_existing, int p_surface, const
 		vertex_array.push_back(v);
 	}
 
-	for (List<int>::Element *E = nindices.front(); E; E = E->next()) {
-		int dst_index = E->get() + vfrom;
+	for (uint32_t i = 0; i < nindices.size(); i++) {
+		int dst_index = nindices[i] + vfrom;
 		index_array.push_back(dst_index);
 	}
 	if (index_array.size() % 3) {
@@ -896,18 +886,18 @@ void SurfaceTool::append_from(const Ref<Mesh> &p_existing, int p_surface, const
 //mikktspace callbacks
 namespace {
 struct TangentGenerationContextUserData {
-	Vector<List<SurfaceTool::Vertex>::Element *> vertices;
-	Vector<List<int>::Element *> indices;
+	LocalVector<SurfaceTool::Vertex> *vertices;
+	LocalVector<int> *indices;
 };
 } // namespace
 
 int SurfaceTool::mikktGetNumFaces(const SMikkTSpaceContext *pContext) {
 	TangentGenerationContextUserData &triangle_data = *reinterpret_cast<TangentGenerationContextUserData *>(pContext->m_pUserData);
 
-	if (triangle_data.indices.size() > 0) {
-		return triangle_data.indices.size() / 3;
+	if (triangle_data.indices->size() > 0) {
+		return triangle_data.indices->size() / 3;
 	} else {
-		return triangle_data.vertices.size() / 3;
+		return triangle_data.vertices->size() / 3;
 	}
 }
 
@@ -918,13 +908,13 @@ int SurfaceTool::mikktGetNumVerticesOfFace(const SMikkTSpaceContext *pContext, c
 void SurfaceTool::mikktGetPosition(const SMikkTSpaceContext *pContext, float fvPosOut[], const int iFace, const int iVert) {
 	TangentGenerationContextUserData &triangle_data = *reinterpret_cast<TangentGenerationContextUserData *>(pContext->m_pUserData);
 	Vector3 v;
-	if (triangle_data.indices.size() > 0) {
-		int index = triangle_data.indices[iFace * 3 + iVert]->get();
-		if (index < triangle_data.vertices.size()) {
-			v = triangle_data.vertices[index]->get().vertex;
+	if (triangle_data.indices->size() > 0) {
+		uint32_t index = triangle_data.indices->operator[](iFace * 3 + iVert);
+		if (index < triangle_data.vertices->size()) {
+			v = triangle_data.vertices->operator[](index).vertex;
 		}
 	} else {
-		v = triangle_data.vertices[iFace * 3 + iVert]->get().vertex;
+		v = triangle_data.vertices->operator[](iFace * 3 + iVert).vertex;
 	}
 
 	fvPosOut[0] = v.x;
@@ -935,13 +925,13 @@ void SurfaceTool::mikktGetPosition(const SMikkTSpaceContext *pContext, float fvP
 void SurfaceTool::mikktGetNormal(const SMikkTSpaceContext *pContext, float fvNormOut[], const int iFace, const int iVert) {
 	TangentGenerationContextUserData &triangle_data = *reinterpret_cast<TangentGenerationContextUserData *>(pContext->m_pUserData);
 	Vector3 v;
-	if (triangle_data.indices.size() > 0) {
-		int index = triangle_data.indices[iFace * 3 + iVert]->get();
-		if (index < triangle_data.vertices.size()) {
-			v = triangle_data.vertices[index]->get().normal;
+	if (triangle_data.indices->size() > 0) {
+		uint32_t index = triangle_data.indices->operator[](iFace * 3 + iVert);
+		if (index < triangle_data.vertices->size()) {
+			v = triangle_data.vertices->operator[](index).normal;
 		}
 	} else {
-		v = triangle_data.vertices[iFace * 3 + iVert]->get().normal;
+		v = triangle_data.vertices->operator[](iFace * 3 + iVert).normal;
 	}
 
 	fvNormOut[0] = v.x;
@@ -952,13 +942,13 @@ void SurfaceTool::mikktGetNormal(const SMikkTSpaceContext *pContext, float fvNor
 void SurfaceTool::mikktGetTexCoord(const SMikkTSpaceContext *pContext, float fvTexcOut[], const int iFace, const int iVert) {
 	TangentGenerationContextUserData &triangle_data = *reinterpret_cast<TangentGenerationContextUserData *>(pContext->m_pUserData);
 	Vector2 v;
-	if (triangle_data.indices.size() > 0) {
-		int index = triangle_data.indices[iFace * 3 + iVert]->get();
-		if (index < triangle_data.vertices.size()) {
-			v = triangle_data.vertices[index]->get().uv;
+	if (triangle_data.indices->size() > 0) {
+		uint32_t index = triangle_data.indices->operator[](iFace * 3 + iVert);
+		if (index < triangle_data.vertices->size()) {
+			v = triangle_data.vertices->operator[](index).uv;
 		}
 	} else {
-		v = triangle_data.vertices[iFace * 3 + iVert]->get().uv;
+		v = triangle_data.vertices->operator[](iFace * 3 + iVert).uv;
 	}
 
 	fvTexcOut[0] = v.x;
@@ -969,13 +959,13 @@ void SurfaceTool::mikktSetTSpaceDefault(const SMikkTSpaceContext *pContext, cons
 		const tbool bIsOrientationPreserving, const int iFace, const int iVert) {
 	TangentGenerationContextUserData &triangle_data = *reinterpret_cast<TangentGenerationContextUserData *>(pContext->m_pUserData);
 	Vertex *vtx = nullptr;
-	if (triangle_data.indices.size() > 0) {
-		int index = triangle_data.indices[iFace * 3 + iVert]->get();
-		if (index < triangle_data.vertices.size()) {
-			vtx = &triangle_data.vertices[index]->get();
+	if (triangle_data.indices->size() > 0) {
+		uint32_t index = triangle_data.indices->operator[](iFace * 3 + iVert);
+		if (index < triangle_data.vertices->size()) {
+			vtx = &triangle_data.vertices->operator[](index);
 		}
 	} else {
-		vtx = &triangle_data.vertices[iFace * 3 + iVert]->get();
+		vtx = &triangle_data.vertices->operator[](iFace * 3 + iVert);
 	}
 
 	if (vtx != nullptr) {
@@ -1001,18 +991,12 @@ void SurfaceTool::generate_tangents() {
 	msc.m_pInterface = &mkif;
 
 	TangentGenerationContextUserData triangle_data;
-	triangle_data.vertices.resize(vertex_array.size());
-	int idx = 0;
-	for (List<Vertex>::Element *E = vertex_array.front(); E; E = E->next()) {
-		triangle_data.vertices.write[idx++] = E;
-		E->get().binormal = Vector3();
-		E->get().tangent = Vector3();
-	}
-	triangle_data.indices.resize(index_array.size());
-	idx = 0;
-	for (List<int>::Element *E = index_array.front(); E; E = E->next()) {
-		triangle_data.indices.write[idx++] = E;
+	triangle_data.vertices = &vertex_array;
+	for (uint32_t i = 0; i < vertex_array.size(); i++) {
+		vertex_array[i].binormal = Vector3();
+		vertex_array[i].tangent = Vector3();
 	}
+	triangle_data.indices = &index_array;
 	msc.m_pUserData = &triangle_data;
 
 	bool res = genTangSpaceDefault(&msc);
@@ -1028,66 +1012,36 @@ void SurfaceTool::generate_normals(bool p_flip) {
 
 	deindex();
 
+	ERR_FAIL_COND((vertex_array.size() % 3) != 0);
+
 	HashMap<Vertex, Vector3, VertexHasher> vertex_hash;
 
-	int count = 0;
-	bool smooth = false;
-	if (smooth_groups.has(0)) {
-		smooth = smooth_groups[0];
-	}
-
-	List<Vertex>::Element *B = vertex_array.front();
-	for (List<Vertex>::Element *E = B; E;) {
-		List<Vertex>::Element *v[3];
-		v[0] = E;
-		v[1] = v[0]->next();
-		ERR_FAIL_COND(!v[1]);
-		v[2] = v[1]->next();
-		ERR_FAIL_COND(!v[2]);
-		E = v[2]->next();
+	for (uint32_t vi = 0; vi < vertex_array.size(); vi += 3) {
+		Vertex *v = &vertex_array[vi];
 
 		Vector3 normal;
 		if (!p_flip) {
-			normal = Plane(v[0]->get().vertex, v[1]->get().vertex, v[2]->get().vertex).normal;
+			normal = Plane(v[0].vertex, v[1].vertex, v[2].vertex).normal;
 		} else {
-			normal = Plane(v[2]->get().vertex, v[1]->get().vertex, v[0]->get().vertex).normal;
+			normal = Plane(v[2].vertex, v[1].vertex, v[0].vertex).normal;
 		}
 
-		if (smooth) {
-			for (int i = 0; i < 3; i++) {
-				Vector3 *lv = vertex_hash.getptr(v[i]->get());
-				if (!lv) {
-					vertex_hash.set(v[i]->get(), normal);
-				} else {
-					(*lv) += normal;
-				}
-			}
-		} else {
-			for (int i = 0; i < 3; i++) {
-				v[i]->get().normal = normal;
-			}
-		}
-		count += 3;
-
-		if (smooth_groups.has(count) || !E) {
-			if (vertex_hash.size()) {
-				while (B != E) {
-					Vector3 *lv = vertex_hash.getptr(B->get());
-					if (lv) {
-						B->get().normal = lv->normalized();
-					}
-
-					B = B->next();
-				}
-
+		for (int i = 0; i < 3; i++) {
+			Vector3 *lv = vertex_hash.getptr(v[i]);
+			if (!lv) {
+				vertex_hash.set(v[i], normal);
 			} else {
-				B = E;
+				(*lv) += normal;
 			}
+		}
+	}
 
-			vertex_hash.clear();
-			if (E) {
-				smooth = smooth_groups[count];
-			}
+	for (uint32_t vi = 0; vi < vertex_array.size(); vi++) {
+		Vector3 *lv = vertex_hash.getptr(vertex_array[vi]);
+		if (!lv) {
+			vertex_array[vi].normal = Vector3();
+		} else {
+			vertex_array[vi].normal = lv->normalized();
 		}
 	}
 
@@ -1095,7 +1049,6 @@ void SurfaceTool::generate_normals(bool p_flip) {
 
 	if (was_indexed) {
 		index();
-		smooth_groups.clear();
 	}
 }
 
@@ -1111,8 +1064,8 @@ void SurfaceTool::clear() {
 	last_weights.clear();
 	index_array.clear();
 	vertex_array.clear();
-	smooth_groups.clear();
 	material.unref();
+	last_smooth_group = 0;
 	for (int i = 0; i < RS::ARRAY_CUSTOM_COUNT; i++) {
 		last_custom_format[i] = CUSTOM_MAX;
 	}
@@ -1136,6 +1089,51 @@ SurfaceTool::CustomFormat SurfaceTool::get_custom_format(int p_index) const {
 	ERR_FAIL_INDEX_V(p_index, RS::ARRAY_CUSTOM_COUNT, CUSTOM_MAX);
 	return last_custom_format[p_index];
 }
+void SurfaceTool::optimize_indices_for_cache() {
+	ERR_FAIL_COND(optimize_vertex_cache_func == nullptr);
+	ERR_FAIL_COND(index_array.size() == 0);
+
+	LocalVector old_index_array = index_array;
+	zeromem(index_array.ptr(), index_array.size() * sizeof(int));
+	optimize_vertex_cache_func((unsigned int *)index_array.ptr(), (unsigned int *)old_index_array.ptr(), old_index_array.size(), vertex_array.size());
+}
+
+float SurfaceTool::get_max_axis_length() const {
+	ERR_FAIL_COND_V(vertex_array.size() == 0, 0);
+
+	AABB aabb;
+	for (uint32_t i = 0; i < vertex_array.size(); i++) {
+		if (i == 0) {
+			aabb.position = vertex_array[i].vertex;
+		} else {
+			aabb.expand_to(vertex_array[i].vertex);
+		}
+	}
+
+	return aabb.get_longest_axis_size();
+}
+Vector<int> SurfaceTool::generate_lod(float p_threshold, int p_target_index_count) {
+	Vector<int> lod;
+
+	ERR_FAIL_COND_V(simplify_func == nullptr, lod);
+	ERR_FAIL_COND_V(vertex_array.size() == 0, lod);
+	ERR_FAIL_COND_V(index_array.size() == 0, lod);
+
+	lod.resize(index_array.size());
+	LocalVector<float> vertices; //uses floats
+	vertices.resize(vertex_array.size() * 3);
+	for (uint32_t i = 0; i < vertex_array.size(); i++) {
+		vertices[i * 3 + 0] = vertex_array[i].vertex.x;
+		vertices[i * 3 + 1] = vertex_array[i].vertex.y;
+		vertices[i * 3 + 2] = vertex_array[i].vertex.z;
+	}
+
+	uint32_t index_count = simplify_func((unsigned int *)lod.ptrw(), (unsigned int *)index_array.ptr(), index_array.size(), vertices.ptr(), vertex_array.size(), sizeof(float) * 3, p_target_index_count, p_threshold);
+	ERR_FAIL_COND_V(index_count == 0, lod);
+	lod.resize(index_count);
+
+	return lod;
+}
 
 void SurfaceTool::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_skin_weight_count", "count"), &SurfaceTool::set_skin_weight_count);
@@ -1155,7 +1153,7 @@ void SurfaceTool::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_bones", "bones"), &SurfaceTool::set_bones);
 	ClassDB::bind_method(D_METHOD("set_weights", "weights"), &SurfaceTool::set_weights);
 	ClassDB::bind_method(D_METHOD("set_custom", "index", "custom"), &SurfaceTool::set_custom);
-	ClassDB::bind_method(D_METHOD("add_smooth_group", "smooth"), &SurfaceTool::add_smooth_group);
+	ClassDB::bind_method(D_METHOD("set_smooth_group", "index"), &SurfaceTool::set_smooth_group);
 
 	ClassDB::bind_method(D_METHOD("add_triangle_fan", "vertices", "uvs", "colors", "uv2s", "normals", "tangents"), &SurfaceTool::add_triangle_fan, DEFVAL(Vector<Vector2>()), DEFVAL(Vector<Color>()), DEFVAL(Vector<Vector2>()), DEFVAL(Vector<Vector3>()), DEFVAL(Vector<Plane>()));
 
@@ -1166,6 +1164,11 @@ void SurfaceTool::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("generate_normals", "flip"), &SurfaceTool::generate_normals, DEFVAL(false));
 	ClassDB::bind_method(D_METHOD("generate_tangents"), &SurfaceTool::generate_tangents);
 
+	ClassDB::bind_method(D_METHOD("optimize_indices_for_cache"), &SurfaceTool::optimize_indices_for_cache);
+
+	ClassDB::bind_method(D_METHOD("get_max_axis_length"), &SurfaceTool::get_max_axis_length);
+	ClassDB::bind_method(D_METHOD("generate_lod", "nd_threshold", "target_index_count"), &SurfaceTool::generate_lod, DEFVAL(3));
+
 	ClassDB::bind_method(D_METHOD("set_material", "material"), &SurfaceTool::set_material);
 
 	ClassDB::bind_method(D_METHOD("clear"), &SurfaceTool::clear);
diff --git a/scene/resources/surface_tool.h b/scene/resources/surface_tool.h
index 4a5c7d990c..e80a5339a9 100644
--- a/scene/resources/surface_tool.h
+++ b/scene/resources/surface_tool.h
@@ -31,8 +31,8 @@
 #ifndef SURFACE_TOOL_H
 #define SURFACE_TOOL_H
 
+#include "core/templates/local_vector.h"
 #include "scene/resources/mesh.h"
-
 #include "thirdparty/misc/mikktspace.h"
 
 class SurfaceTool : public Reference {
@@ -50,6 +50,7 @@ public:
 		Vector<int> bones;
 		Vector<float> weights;
 		Color custom[RS::ARRAY_CUSTOM_COUNT];
+		uint32_t smooth_group = 0;
 
 		bool operator==(const Vertex &p_vertex) const;
 
@@ -73,6 +74,11 @@ public:
 		SKIN_8_WEIGHTS
 	};
 
+	typedef void (*OptimizeVertexCacheFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, size_t vertex_count);
+	static OptimizeVertexCacheFunc optimize_vertex_cache_func;
+	typedef size_t (*SimplifyFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+	static SimplifyFunc simplify_func;
+
 private:
 	struct VertexHasher {
 		static _FORCE_INLINE_ uint32_t hash(const Vertex &p_vtx);
@@ -92,9 +98,8 @@ private:
 	uint32_t format;
 	Ref<Material> material;
 	//arrays
-	List<Vertex> vertex_array;
-	List<int> index_array;
-	Map<int, bool> smooth_groups;
+	LocalVector<Vertex> vertex_array;
+	LocalVector<int> index_array;
 
 	//memory
 	Color last_color;
@@ -104,6 +109,7 @@ private:
 	Vector<int> last_bones;
 	Vector<float> last_weights;
 	Plane last_tangent;
+	uint32_t last_smooth_group = 0;
 
 	SkinWeightCount skin_weights;
 
@@ -111,8 +117,8 @@ private:
 
 	CustomFormat last_custom_format[RS::ARRAY_CUSTOM_COUNT];
 
-	void _create_list_from_arrays(Array arr, List<Vertex> *r_vertex, List<int> *r_index, uint32_t &lformat);
-	void _create_list(const Ref<Mesh> &p_existing, int p_surface, List<Vertex> *r_vertex, List<int> *r_index, uint32_t &lformat);
+	void _create_list_from_arrays(Array arr, LocalVector<Vertex> *r_vertex, LocalVector<int> *r_index, uint32_t &lformat);
+	void _create_list(const Ref<Mesh> &p_existing, int p_surface, LocalVector<Vertex> *r_vertex, LocalVector<int> *r_index, uint32_t &lformat);
 
 	//mikktspace callbacks
 	static int mikktGetNumFaces(const SMikkTSpaceContext *pContext);
@@ -143,10 +149,10 @@ public:
 	void set_custom(int p_index, const Color &p_custom);
 	void set_bones(const Vector<int> &p_bones);
 	void set_weights(const Vector<float> &p_weights);
+	void set_smooth_group(uint32_t p_group);
 
 	void add_vertex(const Vector3 &p_vertex);
 
-	void add_smooth_group(bool p_smooth);
 	void add_triangle_fan(const Vector<Vector3> &p_vertices, const Vector<Vector2> &p_uvs = Vector<Vector2>(), const Vector<Color> &p_colors = Vector<Color>(), const Vector<Vector2> &p_uv2s = Vector<Vector2>(), const Vector<Vector3> &p_normals = Vector<Vector3>(), const Vector<Plane> &p_tangents = Vector<Plane>());
 
 	void add_index(int p_index);
@@ -156,14 +162,18 @@ public:
 	void generate_normals(bool p_flip = false);
 	void generate_tangents();
 
+	void optimize_indices_for_cache();
+	float get_max_axis_length() const;
+	Vector<int> generate_lod(float p_threshold, int p_target_index_count = 3);
+
 	void set_material(const Ref<Material> &p_material);
 
 	void clear();
 
-	List<Vertex> &get_vertex_array() { return vertex_array; }
+	LocalVector<Vertex> &get_vertex_array() { return vertex_array; }
 
 	void create_from_triangle_arrays(const Array &p_arrays);
-	static Vector<Vertex> create_vertex_array_from_triangle_arrays(const Array &p_arrays, uint32_t *r_format = nullptr);
+	static void create_vertex_array_from_triangle_arrays(const Array &p_arrays, LocalVector<Vertex> &ret, uint32_t *r_format = nullptr);
 	Array commit_to_arrays();
 	void create_from(const Ref<Mesh> &p_existing, int p_surface);
 	void create_from_blend_shape(const Ref<Mesh> &p_existing, int p_surface, const String &p_blend_shape_name);
diff --git a/thirdparty/README.md b/thirdparty/README.md
index b2707e7f7c..3962a83597 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -357,6 +357,16 @@ File extracted from upstream release tarball:
 - Added 2 files `godot_core_mbedtls_platform.{c,h}` providing configuration
   for light bundling with core.
 
+## meshoptimizer
+
+- Upstream: https://github.com/zeux/meshoptimizer
+- Version: 0.15(2020)
+- License: MIT
+
+File extracted from upstream release tarball:
+
+- Files in src/ go to thirdparty/meshoptimizer
+
 
 ## miniupnpc
 
diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md
new file mode 100644
index 0000000000..4fcd766d22
--- /dev/null
+++ b/thirdparty/meshoptimizer/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016-2020 Arseny Kapoulkine
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/thirdparty/meshoptimizer/allocator.cpp b/thirdparty/meshoptimizer/allocator.cpp
new file mode 100644
index 0000000000..da7cc540b2
--- /dev/null
+++ b/thirdparty/meshoptimizer/allocator.cpp
@@ -0,0 +1,8 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*))
+{
+	meshopt_Allocator::Storage::allocate = allocate;
+	meshopt_Allocator::Storage::deallocate = deallocate;
+}
diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp
new file mode 100644
index 0000000000..f7d88c5136
--- /dev/null
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@@ -0,0 +1,351 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
+// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
+// Jack Ritter. An Efficient Bounding Sphere. 1990
+namespace meshopt
+{
+
+static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+{
+	assert(count > 0);
+
+	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
+	size_t pmin[3] = {0, 0, 0};
+	size_t pmax[3] = {0, 0, 0};
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* p = points[i];
+
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
+			pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+		}
+	}
+
+	// find the pair of points with largest distance
+	float paxisd2 = 0;
+	int paxis = 0;
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		const float* p1 = points[pmin[axis]];
+		const float* p2 = points[pmax[axis]];
+
+		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+
+		if (d2 > paxisd2)
+		{
+			paxisd2 = d2;
+			paxis = axis;
+		}
+	}
+
+	// use the longest segment as the initial sphere diameter
+	const float* p1 = points[pmin[paxis]];
+	const float* p2 = points[pmax[paxis]];
+
+	float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
+	float radius = sqrtf(paxisd2) / 2;
+
+	// iteratively adjust the sphere up until all points fit
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* p = points[i];
+		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+
+		if (d2 > radius * radius)
+		{
+			float d = sqrtf(d2);
+			assert(d > 0);
+
+			float k = 0.5f + (radius / d) / 2;
+
+			center[0] = center[0] * k + p[0] * (1 - k);
+			center[1] = center[1] * k + p[1] * (1 - k);
+			center[2] = center[2] * k + p[2] * (1 - k);
+			radius = (radius + d) / 2;
+		}
+	}
+
+	result[0] = center[0];
+	result[1] = center[1];
+	result[2] = center[2];
+	result[3] = radius;
+}
+
+} // namespace meshopt
+
+size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
+{
+	assert(index_count % 3 == 0);
+	assert(max_vertices >= 3);
+	assert(max_triangles >= 1);
+
+	// meshlet construction is limited by max vertices and max triangles per meshlet
+	// the worst case is that the input is an unindexed stream since this equally stresses both limits
+	// note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
+	size_t max_vertices_conservative = max_vertices - 2;
+	size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;
+	size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;
+
+	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
+}
+
+size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	assert(index_count % 3 == 0);
+	assert(max_vertices >= 3);
+	assert(max_triangles >= 1);
+
+	meshopt_Allocator allocator;
+
+	meshopt_Meshlet meshlet;
+	memset(&meshlet, 0, sizeof(meshlet));
+
+	assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0]));
+	assert(max_triangles <= sizeof(meshlet.indices) / 3);
+
+	// index of the vertex in the meshlet, 0xff if the vertex isn't used
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, -1, vertex_count);
+
+	size_t offset = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		unsigned char& av = used[a];
+		unsigned char& bv = used[b];
+		unsigned char& cv = used[c];
+
+		unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+
+		if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+		{
+			destination[offset++] = meshlet;
+
+			for (size_t j = 0; j < meshlet.vertex_count; ++j)
+				used[meshlet.vertices[j]] = 0xff;
+
+			memset(&meshlet, 0, sizeof(meshlet));
+		}
+
+		if (av == 0xff)
+		{
+			av = meshlet.vertex_count;
+			meshlet.vertices[meshlet.vertex_count++] = a;
+		}
+
+		if (bv == 0xff)
+		{
+			bv = meshlet.vertex_count;
+			meshlet.vertices[meshlet.vertex_count++] = b;
+		}
+
+		if (cv == 0xff)
+		{
+			cv = meshlet.vertex_count;
+			meshlet.vertices[meshlet.vertex_count++] = c;
+		}
+
+		meshlet.indices[meshlet.triangle_count][0] = av;
+		meshlet.indices[meshlet.triangle_count][1] = bv;
+		meshlet.indices[meshlet.triangle_count][2] = cv;
+		meshlet.triangle_count++;
+	}
+
+	if (meshlet.triangle_count)
+		destination[offset++] = meshlet;
+
+	assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+
+	return offset;
+}
+
+meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(index_count / 3 <= 256);
+
+	(void)vertex_count;
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	// compute triangle normals and gather triangle corners
+	float normals[256][3];
+	float corners[256][3][3];
+	size_t triangles = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* p0 = vertex_positions + vertex_stride_float * a;
+		const float* p1 = vertex_positions + vertex_stride_float * b;
+		const float* p2 = vertex_positions + vertex_stride_float * c;
+
+		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+		float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
+
+		// no need to include degenerate triangles - they will be invisible anyway
+		if (area == 0.f)
+			continue;
+
+		// record triangle normals & corners for future use; normal and corner 0 define a plane equation
+		normals[triangles][0] = normalx / area;
+		normals[triangles][1] = normaly / area;
+		normals[triangles][2] = normalz / area;
+		memcpy(corners[triangles][0], p0, 3 * sizeof(float));
+		memcpy(corners[triangles][1], p1, 3 * sizeof(float));
+		memcpy(corners[triangles][2], p2, 3 * sizeof(float));
+		triangles++;
+	}
+
+	meshopt_Bounds bounds = {};
+
+	// degenerate cluster, no valid triangles => trivial reject (cone data is 0)
+	if (triangles == 0)
+		return bounds;
+
+	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
+	float psphere[4] = {};
+	computeBoundingSphere(psphere, corners[0], triangles * 3);
+
+	float center[3] = {psphere[0], psphere[1], psphere[2]};
+
+	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
+	float nsphere[4] = {};
+	computeBoundingSphere(nsphere, normals, triangles);
+
+	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
+	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
+	float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;
+
+	axis[0] *= invaxislength;
+	axis[1] *= invaxislength;
+	axis[2] *= invaxislength;
+
+	// compute a tight cone around all normals, mindp = cos(angle/2)
+	float mindp = 1.f;
+
+	for (size_t i = 0; i < triangles; ++i)
+	{
+		float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];
+
+		mindp = (dp < mindp) ? dp : mindp;
+	}
+
+	// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
+	bounds.center[0] = center[0];
+	bounds.center[1] = center[1];
+	bounds.center[2] = center[2];
+	bounds.radius = psphere[3];
+
+	// degenerate cluster, normal cone is larger than a hemisphere => trivial accept
+	// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
+	// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
+	if (mindp <= 0.1f)
+	{
+		bounds.cone_cutoff = 1;
+		bounds.cone_cutoff_s8 = 127;
+		return bounds;
+	}
+
+	float maxt = 0;
+
+	// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
+	for (size_t i = 0; i < triangles; ++i)
+	{
+		// dot(center-t*axis-corner, trinormal) = 0
+		// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
+		float cx = center[0] - corners[i][0][0];
+		float cy = center[1] - corners[i][0][1];
+		float cz = center[2] - corners[i][0][2];
+
+		float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
+		float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];
+
+		// dn should be larger than mindp cutoff above
+		assert(dn > 0.f);
+		float t = dc / dn;
+
+		maxt = (t > maxt) ? t : maxt;
+	}
+
+	// cone apex should be in the negative half-space of all cluster triangles by construction
+	bounds.cone_apex[0] = center[0] - axis[0] * maxt;
+	bounds.cone_apex[1] = center[1] - axis[1] * maxt;
+	bounds.cone_apex[2] = center[2] - axis[2] * maxt;
+
+	// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
+	bounds.cone_axis[0] = axis[0];
+	bounds.cone_axis[1] = axis[1];
+	bounds.cone_axis[2] = axis[2];
+
+	// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
+	// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
+	bounds.cone_cutoff = sqrtf(1 - mindp * mindp);
+
+	// quantize axis & cutoff to 8-bit SNORM format
+	bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
+	bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
+	bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));
+
+	// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
+	float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
+	float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
+	float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);
+
+	// note that we need to round this up instead of rounding to nearest, hence +1
+	int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);
+
+	bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);
+
+	return bounds;
+}
+
+meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])];
+
+	for (size_t i = 0; i < meshlet->triangle_count; ++i)
+	{
+		unsigned int a = meshlet->vertices[meshlet->indices[i][0]];
+		unsigned int b = meshlet->vertices[meshlet->indices[i][1]];
+		unsigned int c = meshlet->vertices[meshlet->indices[i][2]];
+
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		indices[i * 3 + 0] = a;
+		indices[i * 3 + 1] = b;
+		indices[i * 3 + 2] = c;
+	}
+
+	return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
+}
diff --git a/thirdparty/meshoptimizer/indexcodec.cpp b/thirdparty/meshoptimizer/indexcodec.cpp
new file mode 100644
index 0000000000..eeb541e5be
--- /dev/null
+++ b/thirdparty/meshoptimizer/indexcodec.cpp
@@ -0,0 +1,752 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+// This work is based on:
+// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
+// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
+namespace meshopt
+{
+
+const unsigned char kIndexHeader = 0xe0;
+const unsigned char kSequenceHeader = 0xd0;
+
+static int gEncodeIndexVersion = 0;
+
+typedef unsigned int VertexFifo[16];
+typedef unsigned int EdgeFifo[16][2];
+
+static const unsigned int kTriangleIndexOrder[3][3] = {
+    {0, 1, 2},
+    {1, 2, 0},
+    {2, 0, 1},
+};
+
+static const unsigned char kCodeAuxEncodingTable[16] = {
+    0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69,
+    0, 0, // last two entries aren't used for encoding
+};
+
+static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next)
+{
+	(void)a;
+
+	return (b == next) ? 1 : (c == next) ? 2 : 0;
+}
+
+static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
+{
+	for (int i = 0; i < 16; ++i)
+	{
+		size_t index = (offset - 1 - i) & 15;
+
+		unsigned int e0 = fifo[index][0];
+		unsigned int e1 = fifo[index][1];
+
+		if (e0 == a && e1 == b)
+			return (i << 2) | 0;
+		if (e0 == b && e1 == c)
+			return (i << 2) | 1;
+		if (e0 == c && e1 == a)
+			return (i << 2) | 2;
+	}
+
+	return -1;
+}
+
+static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset)
+{
+	fifo[offset][0] = a;
+	fifo[offset][1] = b;
+	offset = (offset + 1) & 15;
+}
+
+static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset)
+{
+	for (int i = 0; i < 16; ++i)
+	{
+		size_t index = (offset - 1 - i) & 15;
+
+		if (fifo[index] == v)
+			return i;
+	}
+
+	return -1;
+}
+
+static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1)
+{
+	fifo[offset] = v;
+	offset = (offset + cond) & 15;
+}
+
+static void encodeVByte(unsigned char*& data, unsigned int v)
+{
+	// encode 32-bit value in up to 5 7-bit groups
+	do
+	{
+		*data++ = (v & 127) | (v > 127 ? 128 : 0);
+		v >>= 7;
+	} while (v);
+}
+
+static unsigned int decodeVByte(const unsigned char*& data)
+{
+	unsigned char lead = *data++;
+
+	// fast path: single byte
+	if (lead < 128)
+		return lead;
+
+	// slow path: up to 4 extra bytes
+	// note that this loop always terminates, which is important for malformed data
+	unsigned int result = lead & 127;
+	unsigned int shift = 7;
+
+	for (int i = 0; i < 4; ++i)
+	{
+		unsigned char group = *data++;
+		result |= (group & 127) << shift;
+		shift += 7;
+
+		if (group < 128)
+			break;
+	}
+
+	return result;
+}
+
+static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last)
+{
+	unsigned int d = index - last;
+	unsigned int v = (d << 1) ^ (int(d) >> 31);
+
+	encodeVByte(data, v);
+}
+
+static unsigned int decodeIndex(const unsigned char*& data, unsigned int last)
+{
+	unsigned int v = decodeVByte(data);
+	unsigned int d = (v >> 1) ^ -int(v & 1);
+
+	return last + d;
+}
+
+static int getCodeAuxIndex(unsigned char v, const unsigned char* table)
+{
+	for (int i = 0; i < 16; ++i)
+		if (table[i] == v)
+			return i;
+
+	return -1;
+}
+
+static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c)
+{
+	if (index_size == 2)
+	{
+		static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a);
+		static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
+		static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
+	}
+	else
+	{
+		static_cast<unsigned int*>(destination)[offset + 0] = a;
+		static_cast<unsigned int*>(destination)[offset + 1] = b;
+		static_cast<unsigned int*>(destination)[offset + 2] = c;
+	}
+}
+
+#if TRACE
+static size_t sortTop16(unsigned char dest[16], size_t stats[256])
+{
+	size_t destsize = 0;
+
+	for (size_t i = 0; i < 256; ++i)
+	{
+		size_t j = 0;
+		for (; j < destsize; ++j)
+		{
+			if (stats[i] >= stats[dest[j]])
+			{
+				if (destsize < 16)
+					destsize++;
+
+				memmove(&dest[j + 1], &dest[j], destsize - 1 - j);
+				dest[j] = (unsigned char)i;
+				break;
+			}
+		}
+
+		if (j == destsize && destsize < 16)
+		{
+			dest[destsize] = (unsigned char)i;
+			destsize++;
+		}
+	}
+
+	return destsize;
+}
+#endif
+
+} // namespace meshopt
+
+size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+#if TRACE
+	size_t codestats[256] = {};
+	size_t codeauxstats[256] = {};
+#endif
+
+	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+	if (buffer_size < 1 + index_count / 3 + 16)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kIndexHeader | version);
+
+	EdgeFifo edgefifo;
+	memset(edgefifo, -1, sizeof(edgefifo));
+
+	VertexFifo vertexfifo;
+	memset(vertexfifo, -1, sizeof(vertexfifo));
+
+	size_t edgefifooffset = 0;
+	size_t vertexfifooffset = 0;
+
+	unsigned int next = 0;
+	unsigned int last = 0;
+
+	unsigned char* code = buffer + 1;
+	unsigned char* data = code + index_count / 3;
+	unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+	int fecmax = version >= 1 ? 13 : 15;
+
+	// use static encoding table; it's possible to pack the result and then build an optimal table and repack
+	// for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
+	const unsigned char* codeaux_table = kCodeAuxEncodingTable;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		// make sure we have enough space to write a triangle
+		// each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index
+		// after this we can be sure we can write without extra bounds checks
+		if (data > data_safe_end)
+			return 0;
+
+		int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset);
+
+		if (fer >= 0 && (fer >> 2) < 15)
+		{
+			const unsigned int* order = kTriangleIndexOrder[fer & 3];
+
+			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+			// encode edge index and vertex fifo index, next or free index
+			int fe = fer >> 2;
+			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next) ? (next++, 0) : 15;
+
+			if (fec == 15 && version >= 1)
+			{
+				// encode last-1 and last+1 to optimize strip-like sequences
+				if (c + 1 == last)
+					fec = 13, last = c;
+				if (c == last + 1)
+					fec = 14, last = c;
+			}
+
+			*code++ = (unsigned char)((fe << 4) | fec);
+
+#if TRACE
+			codestats[code[-1]]++;
+#endif
+
+			// note that we need to update the last index since free indices are delta-encoded
+			if (fec == 15)
+				encodeIndex(data, c, last), last = c;
+
+			// we only need to push third vertex since first two are likely already in the vertex fifo
+			if (fec == 0 || fec >= fecmax)
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// we only need to push two new edges to edge fifo since the third one is already there
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+		}
+		else
+		{
+			int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next);
+			const unsigned int* order = kTriangleIndexOrder[rotation];
+
+			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+			// if a/b/c are 0/1/2, we emit a reset code
+			bool reset = false;
+
+			if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1)
+			{
+				reset = true;
+				next = 0;
+
+				// reset vertex fifo to make sure we don't accidentally reference vertices from that in the future
+				// this makes sure next continues to get incremented instead of being stuck
+				memset(vertexfifo, -1, sizeof(vertexfifo));
+			}
+
+			int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
+			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+			int fea = (a == next) ? (next++, 0) : 15;
+			int feb = (fb >= 0 && fb < 14) ? (fb + 1) : (b == next) ? (next++, 0) : 15;
+			int fec = (fc >= 0 && fc < 14) ? (fc + 1) : (c == next) ? (next++, 0) : 15;
+
+			// we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise
+			unsigned char codeaux = (unsigned char)((feb << 4) | fec);
+			int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);
+
+			// <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
+			if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset)
+			{
+				*code++ = (unsigned char)((15 << 4) | codeauxindex);
+			}
+			else
+			{
+				*code++ = (unsigned char)((15 << 4) | 14 | fea);
+				*data++ = codeaux;
+			}
+
+#if TRACE
+			codestats[code[-1]]++;
+			codeauxstats[codeaux]++;
+#endif
+
+			// note that we need to update the last index since free indices are delta-encoded
+			if (fea == 15)
+				encodeIndex(data, a, last), last = a;
+
+			if (feb == 15)
+				encodeIndex(data, b, last), last = b;
+
+			if (fec == 15)
+				encodeIndex(data, c, last), last = c;
+
+			// only push vertices that weren't already in fifo
+			if (fea == 0 || fea == 15)
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+
+			if (feb == 0 || feb == 15)
+				pushVertexFifo(vertexfifo, b, vertexfifooffset);
+
+			if (fec == 0 || fec == 15)
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// all three edges aren't in the fifo; pushing all of them is important so that we can match them for later triangles
+			pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+		}
+	}
+
+	// make sure we have enough space to write codeaux table
+	if (data > data_safe_end)
+		return 0;
+
+	// add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding
+	// we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data
+	// this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input
+	for (size_t i = 0; i < 16; ++i)
+	{
+		// decoder assumes that table entries never refer to separately encoded indices
+		assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf);
+
+		*data++ = codeaux_table[i];
+	}
+
+	// since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference
+	assert(codeaux_table[0] == 0);
+
+	assert(data >= buffer + index_count / 3 + 16);
+	assert(data <= buffer + buffer_size);
+
+#if TRACE
+	unsigned char codetop[16], codeauxtop[16];
+	size_t codetopsize = sortTop16(codetop, codestats);
+	size_t codeauxtopsize = sortTop16(codeauxtop, codeauxstats);
+
+	size_t sumcode = 0, sumcodeaux = 0;
+	for (size_t i = 0; i < 256; ++i)
+		sumcode += codestats[i], sumcodeaux += codeauxstats[i];
+
+	size_t acccode = 0, acccodeaux = 0;
+
+	printf("code\t\t\t\t\tcodeaux\n");
+
+	for (size_t i = 0; i < codetopsize && i < codeauxtopsize; ++i)
+	{
+		acccode += codestats[codetop[i]];
+		acccodeaux += codeauxstats[codeauxtop[i]];
+
+		printf("%2d: %02x = %d (%.1f%% ..%.1f%%)\t\t%2d: %02x = %d (%.1f%% ..%.1f%%)\n",
+		       int(i), codetop[i], int(codestats[codetop[i]]), double(codestats[codetop[i]]) / double(sumcode) * 100, double(acccode) / double(sumcode) * 100,
+		       int(i), codeauxtop[i], int(codeauxstats[codeauxtop[i]]), double(codeauxstats[codeauxtop[i]]) / double(sumcodeaux) * 100, double(acccodeaux) / double(sumcodeaux) * 100);
+	}
+#endif
+
+	return data - buffer;
+}
+
+size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
+{
+	assert(index_count % 3 == 0);
+
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
+		vertex_bits++;
+
+	// worst-case encoding is 2 header bytes + 3 varint-7 encoded index deltas
+	unsigned int vertex_groups = (vertex_bits + 1 + 6) / 7;
+
+	return 1 + (index_count / 3) * (2 + 3 * vertex_groups) + 16;
+}
+
+void meshopt_encodeIndexVersion(int version)
+{
+	assert(unsigned(version) <= 1);
+
+	meshopt::gEncodeIndexVersion = version;
+}
+
+int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(index_size == 2 || index_size == 4);
+
+	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+	if (buffer_size < 1 + index_count / 3 + 16)
+		return -2;
+
+	if ((buffer[0] & 0xf0) != kIndexHeader)
+		return -1;
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
+		return -1;
+
+	EdgeFifo edgefifo;
+	memset(edgefifo, -1, sizeof(edgefifo));
+
+	VertexFifo vertexfifo;
+	memset(vertexfifo, -1, sizeof(vertexfifo));
+
+	size_t edgefifooffset = 0;
+	size_t vertexfifooffset = 0;
+
+	unsigned int next = 0;
+	unsigned int last = 0;
+
+	int fecmax = version >= 1 ? 13 : 15;
+
+	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
+	const unsigned char* code = buffer + 1;
+	const unsigned char* data = code + index_count / 3;
+	const unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+	const unsigned char* codeaux_table = data_safe_end;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		// make sure we have enough data to read for a triangle
+		// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
+		// after this we can be sure we can read without extra bounds checks
+		if (data > data_safe_end)
+			return -2;
+
+		unsigned char codetri = *code++;
+
+		if (codetri < 0xf0)
+		{
+			int fe = codetri >> 4;
+
+			// fifo reads are wrapped around 16 entry buffer
+			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
+			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+
+			int fec = codetri & 15;
+
+			// note: this is the most common path in the entire decoder
+			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
+			if (fec < fecmax)
+			{
+				// fifo reads are wrapped around 16 entry buffer
+				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
+				unsigned int c = (fec == 0) ? next : cf;
+
+				int fec0 = fec == 0;
+				next += fec0;
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+			else
+			{
+				unsigned int c = 0;
+
+				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
+				// note that we need to update the last index since free indices are delta-encoded
+				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+		}
+		else
+		{
+			// fast path: read codeaux from the table
+			if (codetri < 0xfe)
+			{
+				unsigned char codeaux = codeaux_table[codetri & 15];
+
+				// note: table can't contain feb/fec=15
+				int feb = codeaux >> 4;
+				int fec = codeaux & 15;
+
+				// fifo reads are wrapped around 16 entry buffer
+				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+				unsigned int a = next++;
+
+				unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
+				unsigned int b = (feb == 0) ? next : bf;
+
+				int feb0 = feb == 0;
+				next += feb0;
+
+				unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
+				unsigned int c = (fec == 0) ? next : cf;
+
+				int fec0 = fec == 0;
+				next += fec0;
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+				pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+			else
+			{
+				// slow path: read a full byte for codeaux instead of using a table lookup
+				unsigned char codeaux = *data++;
+
+				int fea = codetri == 0xfe ? 0 : 15;
+				int feb = codeaux >> 4;
+				int fec = codeaux & 15;
+
+				// reset: codeaux is 0 but encoded as not-a-table
+				if (codeaux == 0)
+					next = 0;
+
+				// fifo reads are wrapped around 16 entry buffer
+				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+				unsigned int a = (fea == 0) ? next++ : 0;
+				unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
+				unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];
+
+				// note that we need to update the last index since free indices are delta-encoded
+				if (fea == 15)
+					last = a = decodeIndex(data, last);
+
+				if (feb == 15)
+					last = b = decodeIndex(data, last);
+
+				if (fec == 15)
+					last = c = decodeIndex(data, last);
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+				pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));
+
+				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and codeaux table
+	if (data != data_safe_end)
+		return -3;
+
+	return 0;
+}
+
+size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+	using namespace meshopt;
+
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kSequenceHeader | version);
+
+	unsigned int last[2] = {};
+	unsigned int current = 0;
+
+	unsigned char* data = buffer + 1;
+	unsigned char* data_safe_end = buffer + buffer_size - 4;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to write
+		// each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can write without extra bounds checks
+		if (data >= data_safe_end)
+			return 0;
+
+		unsigned int index = indices[i];
+
+		// this is a heuristic that switches between baselines when the delta grows too large
+		// we want the encoded delta to fit into one byte (7 bits), but 2 bits are used for sign and baseline index
+		// for now we immediately switch the baseline when delta grows too large - this can be adjusted arbitrarily
+		int cd = int(index - last[current]);
+		current ^= ((cd < 0 ? -cd : cd) >= 30);
+
+		// encode delta from the last index
+		unsigned int d = index - last[current];
+		unsigned int v = (d << 1) ^ (int(d) >> 31);
+
+		// note: low bit encodes the index of the last baseline which will be used for reconstruction
+		encodeVByte(data, (v << 1) | current);
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+	}
+
+	// make sure we have enough space to write tail
+	if (data > data_safe_end)
+		return 0;
+
+	for (int k = 0; k < 4; ++k)
+		*data++ = 0;
+
+	return data - buffer;
+}
+
+size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
+{
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
+		vertex_bits++;
+
+	// worst-case encoding is 1 varint-7 encoded index delta for a K bit value and an extra bit
+	unsigned int vertex_groups = (vertex_bits + 1 + 1 + 6) / 7;
+
+	return 1 + index_count * vertex_groups + 4;
+}
+
+int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return -2;
+
+	if ((buffer[0] & 0xf0) != kSequenceHeader)
+		return -1;
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
+		return -1;
+
+	const unsigned char* data = buffer + 1;
+	const unsigned char* data_safe_end = buffer + buffer_size - 4;
+
+	unsigned int last[2] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to read
+		// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can read without extra bounds checks
+		if (data >= data_safe_end)
+			return -2;
+
+		unsigned int v = decodeVByte(data);
+
+		// decode the index of the last baseline
+		unsigned int current = v & 1;
+		v >>= 1;
+
+		// reconstruct index as a delta
+		unsigned int d = (v >> 1) ^ -int(v & 1);
+		unsigned int index = last[current] + d;
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+
+		if (index_size == 2)
+		{
+			static_cast<unsigned short*>(destination)[i] = (unsigned short)(index);
+		}
+		else
+		{
+			static_cast<unsigned int*>(destination)[i] = index;
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and tail
+	if (data != data_safe_end)
+		return -3;
+
+	return 0;
+}
diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp
new file mode 100644
index 0000000000..aa4a30efa4
--- /dev/null
+++ b/thirdparty/meshoptimizer/indexgenerator.cpp
@@ -0,0 +1,347 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+namespace meshopt
+{
+
+static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
+{
+	// MurmurHash2
+	const unsigned int m = 0x5bd1e995;
+	const int r = 24;
+
+	while (len >= 4)
+	{
+		unsigned int k = *reinterpret_cast<const unsigned int*>(key);
+
+		k *= m;
+		k ^= k >> r;
+		k *= m;
+
+		h *= m;
+		h ^= k;
+
+		key += 4;
+		len -= 4;
+	}
+
+	return h;
+}
+
+struct VertexHasher
+{
+	const unsigned char* vertices;
+	size_t vertex_size;
+	size_t vertex_stride;
+
+	size_t hash(unsigned int index) const
+	{
+		return hashUpdate4(0, vertices + index * vertex_stride, vertex_size);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return memcmp(vertices + lhs * vertex_stride, vertices + rhs * vertex_stride, vertex_size) == 0;
+	}
+};
+
+struct VertexStreamHasher
+{
+	const meshopt_Stream* streams;
+	size_t stream_count;
+
+	size_t hash(unsigned int index) const
+	{
+		unsigned int h = 0;
+
+		for (size_t i = 0; i < stream_count; ++i)
+		{
+			const meshopt_Stream& s = streams[i];
+			const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+			h = hashUpdate4(h, data + index * s.stride, s.size);
+		}
+
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		for (size_t i = 0; i < stream_count; ++i)
+		{
+			const meshopt_Stream& s = streams[i];
+			const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+			if (memcmp(data + lhs * s.stride, data + rhs * s.stride, s.size) != 0)
+				return false;
+		}
+
+		return true;
+	}
+};
+
+static size_t hashBuckets(size_t count)
+{
+	size_t buckets = 1;
+	while (buckets < count)
+		buckets *= 2;
+
+	return buckets;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{
+	assert(buckets > 0);
+	assert((buckets & (buckets - 1)) == 0);
+
+	size_t hashmod = buckets - 1;
+	size_t bucket = hash.hash(key) & hashmod;
+
+	for (size_t probe = 0; probe <= hashmod; ++probe)
+	{
+		T& item = table[bucket];
+
+		if (item == empty)
+			return &item;
+
+		if (hash.equal(item, key))
+			return &item;
+
+		// hash collision, quadratic probing
+		bucket = (bucket + probe + 1) & hashmod;
+	}
+
+	assert(false && "Hash table is full"); // unreachable
+	return 0;
+}
+
+} // namespace meshopt
+
+size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(indices || index_count == vertex_count);
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i);
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+			{
+				*entry = index;
+
+				destination[index] = next_vertex++;
+			}
+			else
+			{
+				assert(destination[*entry] != ~0u);
+
+				destination[index] = destination[*entry];
+			}
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{
+	using namespace meshopt;
+
+	assert(indices || index_count == vertex_count);
+	assert(index_count % 3 == 0);
+	assert(stream_count > 0 && stream_count <= 16);
+
+	for (size_t i = 0; i < stream_count; ++i)
+	{
+		assert(streams[i].size > 0 && streams[i].size <= 256);
+		assert(streams[i].size <= streams[i].stride);
+	}
+
+	meshopt_Allocator allocator;
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+	VertexStreamHasher hasher = {streams, stream_count};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i);
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+			{
+				*entry = index;
+
+				destination[index] = next_vertex++;
+			}
+			else
+			{
+				assert(destination[*entry] != ~0u);
+
+				destination[index] = destination[*entry];
+			}
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	// support in-place remap
+	if (destination == vertices)
+	{
+		unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+		memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+		vertices = vertices_copy;
+	}
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] != ~0u)
+		{
+			assert(remap[i] < vertex_count);
+
+			memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
+		}
+	}
+}
+
+void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{
+	assert(index_count % 3 == 0);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i);
+		assert(remap[index] != ~0u);
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+	using namespace meshopt;
+
+	assert(indices);
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size <= vertex_stride);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index;
+
+			remap[index] = *entry;
+		}
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{
+	using namespace meshopt;
+
+	assert(indices);
+	assert(index_count % 3 == 0);
+	assert(stream_count > 0 && stream_count <= 16);
+
+	for (size_t i = 0; i < stream_count; ++i)
+	{
+		assert(streams[i].size > 0 && streams[i].size <= 256);
+		assert(streams[i].size <= streams[i].stride);
+	}
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+	VertexStreamHasher hasher = {streams, stream_count};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index;
+
+			remap[index] = *entry;
+		}
+
+		destination[i] = remap[index];
+	}
+}
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
new file mode 100644
index 0000000000..a442d103c8
--- /dev/null
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -0,0 +1,948 @@
+/**
+ * meshoptimizer - version 0.15
+ *
+ * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
+ *
+ * This library is distributed under the MIT License. See notice at the end of this file.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stddef.h>
+
+/* Version macro; major * 1000 + minor * 10 + patch */
+#define MESHOPTIMIZER_VERSION 150 /* 0.15 */
+
+/* If no API is defined, assume default */
+#ifndef MESHOPTIMIZER_API
+#define MESHOPTIMIZER_API
+#endif
+
+/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+
+/* C interface */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Vertex attribute stream, similar to glVertexPointer
+ * Each element takes size bytes, with stride controlling the spacing between successive elements.
+ */
+struct meshopt_Stream
+{
+	const void* data;
+	size_t size;
+	size_t stride;
+};
+
+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Generates a vertex remap table from multiple vertex streams and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting vertex buffer (unique_vertex_count elements, returned by meshopt_generateVertexRemap)
+ * vertex_count should be the initial vertex count and not the value returned by meshopt_generateVertexRemap
+ */
+MESHOPTIMIZER_API void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap);
+
+/**
+ * Generate index buffer from the source index buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Vertex transform cache optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for strip-like caches
+ * Produces inferior results to meshopt_optimizeVertexCache from the GPU vertex cache perspective
+ * However, the resulting index order is more optimal if the goal is to reduce the triangle strip length or improve compression efficiency
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for FIFO caches
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
+ * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * cache_size should be less than the actual GPU cache size to avoid cache thrashing
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+
+/**
+ * Overdraw optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
+ * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
+ */
+MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+
+/**
+ * Vertex fetch cache optimizer
+ * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count elements)
+ * indices is used both as an input and as an output index buffer
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Vertex fetch cache optimizer
+ * Generates vertex remap to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * The resulting remap table should be used to reorder vertex/index buffers using meshopt_remapVertexBuffer/meshopt_remapIndexBuffer
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Index buffer encoder
+ * Encodes index data into an array of bytes that is generally much smaller (<1.5 bytes/triangle) and compresses better (<1 bytes/triangle) compared to original.
+ * Input index buffer must represent a triangle list.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * For maximum efficiency the index buffer being encoded has to be optimized for vertex cache and vertex fetch first.
+ *
+ * buffer must contain enough space for the encoded index buffer (use meshopt_encodeIndexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Experimental: Set index encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeIndexVersion(int version);
+
+/**
+ * Index buffer decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Experimental: Index sequence encoder
+ * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
+ * Input index sequence can represent arbitrary topology; for triangle lists meshopt_encodeIndexBuffer is likely to be better.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ *
+ * buffer must contain enough space for the encoded index sequence (use meshopt_encodeIndexSequenceBound to compute worst case size)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Index sequence decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexSequence
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index sequence (index_count elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer encoder
+ * Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
+ * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
+
+/**
+ * Experimental: Set vertex encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeVertexVersion(int version);
+
+/**
+ * Vertex buffer decoder
+ * Decodes vertex data from an array of bytes generated by meshopt_encodeVertexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer filters
+ * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
+ * count must be aligned by 4 and stride is fixed for each function to facilitate SIMD implementation.
+ *
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ *
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * Each component is stored as an 16-bit integer; stride must be equal to 8.
+ *
+ * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
+ * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Experimental: Mesh simplifier
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+
+/**
+ * Experimental: Mesh simplifier (sloppy)
+ * Reduces the number of triangles in the mesh, sacrificing mesh apperance for simplification performance
+ * The algorithm doesn't preserve mesh topology but is always able to reach target triangle count.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+
+/**
+ * Experimental: Point cloud simplifier
+ * Reduces the number of points in the cloud to reach the given target
+ * Returns the number of points after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
+
+/**
+ * Mesh stripifier
+ * Converts a previously vertex cache optimized triangle list to triangle strip, stitching strips using restart index or degenerate triangles
+ * Returns the number of indices in the resulting strip, with destination containing new index data
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * Using restart indices can result in ~10% smaller index buffers, but on some GPUs restart indices may result in decreased performance.
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_stripifyBound
+ * restart_index should be 0xffff or 0xffffffff depending on index size, or 0 to use degenerate triangles
+ */
+MESHOPTIMIZER_API size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_stripifyBound(size_t index_count);
+
+/**
+ * Mesh unstripifier
+ * Converts a triangle strip to a triangle list
+ * Returns the number of indices in the resulting list, with destination containing new index data
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_unstripifyBound
+ */
+MESHOPTIMIZER_API size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_unstripifyBound(size_t index_count);
+
+struct meshopt_VertexCacheStatistics
+{
+	unsigned int vertices_transformed;
+	unsigned int warps_executed;
+	float acmr; /* transformed vertices / triangle count; best case 0.5, worst case 3.0, optimum depends on topology */
+	float atvr; /* transformed vertices / vertex count; best case 1.0, worst case 6.0, optimum is 1.0 (each vertex is transformed once) */
+};
+
+/**
+ * Vertex transform cache analyzer
+ * Returns cache hit statistics using a simplified FIFO model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+
+struct meshopt_OverdrawStatistics
+{
+	unsigned int pixels_covered;
+	unsigned int pixels_shaded;
+	float overdraw; /* shaded pixels / covered pixels; best case 1.0 */
+};
+
+/**
+ * Overdraw analyzer
+ * Returns overdraw statistics using a software rasterizer
+ * Results may not match actual GPU performance
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+struct meshopt_VertexFetchStatistics
+{
+	unsigned int bytes_fetched;
+	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
+struct meshopt_Meshlet
+{
+	unsigned int vertices[64];
+	unsigned char indices[126][3];
+	unsigned char triangle_count;
+	unsigned char vertex_count;
+};
+
+/**
+ * Experimental: Meshlet builder
+ * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
+ * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ *
+ * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
+ * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
+
+struct meshopt_Bounds
+{
+	/* bounding sphere, useful for frustum and occlusion culling */
+	float center[3];
+	float radius;
+
+	/* normal cone, useful for backface culling */
+	float cone_apex[3];
+	float cone_axis[3];
+	float cone_cutoff; /* = cos(angle/2) */
+
+	/* normal cone axis and cutoff, stored in 8-bit SNORM format; decode using x/127.0 */
+	signed char cone_axis_s8[3];
+	signed char cone_cutoff_s8;
+};
+
+/**
+ * Experimental: Cluster bounds generator
+ * Creates bounding volumes that can be used for frustum, backface and occlusion culling.
+ *
+ * For backface culling with orthographic projection, use the following formula to reject backfacing clusters:
+ *   dot(view, cone_axis) >= cone_cutoff
+ *
+ * For perspective projection, you can the formula that needs cone apex in addition to axis & cutoff:
+ *   dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
+ *
+ * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead:
+ *   dot(normalize(center - camera_position), cone_axis) >= cone_cutoff + radius / length(center - camera_position)
+ * or an equivalent formula that doesn't have a singularity at center = camera_position:
+ *   dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius
+ *
+ * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
+ * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size)
+ */
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Spatial sorter
+ * Generates a remap table that can be used to reorder points for spatial locality.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Spatial sorter
+ * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Set allocation callbacks
+ * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
+ * Note that all algorithms only allocate memory for temporary use.
+ * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
+ */
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+/* Quantization into commonly supported data formats */
+#ifdef __cplusplus
+/**
+ * Quantize a float in [0..1] range into an N-bit fixed point unorm value
+ * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
+ * Maximum reconstruction error: 1/2^(N+1)
+ */
+inline int meshopt_quantizeUnorm(float v, int N);
+
+/**
+ * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
+ * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
+ * Maximum reconstruction error: 1/2^N
+ */
+inline int meshopt_quantizeSnorm(float v, int N);
+
+/**
+ * Quantize a float into half-precision floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+inline unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+inline float meshopt_quantizeFloat(float v, int N);
+#endif
+
+/**
+ * C++ template interface
+ *
+ * These functions mirror the C interface the library provides, providing template-based overloads so that
+ * the caller can use an arbitrary type for the index data, both for input and output.
+ * When the supplied type is the same size as that of unsigned int, the wrappers are zero-cost; when it's not,
+ * the wrappers end up allocating memory and copying index data to convert from one type to another.
+ */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+#endif
+
+/* Inline implementation */
+#ifdef __cplusplus
+inline int meshopt_quantizeUnorm(float v, int N)
+{
+	const float scale = float((1 << N) - 1);
+
+	v = (v >= 0) ? v : 0;
+	v = (v <= 1) ? v : 1;
+
+	return int(v * scale + 0.5f);
+}
+
+inline int meshopt_quantizeSnorm(float v, int N)
+{
+	const float scale = float((1 << (N - 1)) - 1);
+
+	float round = (v >= 0 ? 0.5f : -0.5f);
+
+	v = (v >= -1) ? v : -1;
+	v = (v <= +1) ? v : +1;
+
+	return int(v * scale + round);
+}
+
+inline unsigned short meshopt_quantizeHalf(float v)
+{
+	union { float f; unsigned int ui; } u = {v};
+	unsigned int ui = u.ui;
+
+	int s = (ui >> 16) & 0x8000;
+	int em = ui & 0x7fffffff;
+
+	/* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
+	int h = (em - (112 << 23) + (1 << 12)) >> 13;
+
+	/* underflow: flush to zero; 113 encodes exponent -14 */
+	h = (em < (113 << 23)) ? 0 : h;
+
+	/* overflow: infinity; 143 encodes exponent 16 */
+	h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+	/* NaN; note that we convert all types of NaN to qNaN */
+	h = (em > (255 << 23)) ? 0x7e00 : h;
+
+	return (unsigned short)(s | h);
+}
+
+inline float meshopt_quantizeFloat(float v, int N)
+{
+	union { float f; unsigned int ui; } u = {v};
+	unsigned int ui = u.ui;
+
+	const int mask = (1 << (23 - N)) - 1;
+	const int round = (1 << (23 - N)) >> 1;
+
+	int e = ui & 0x7f800000;
+	unsigned int rui = (ui + round) & ~mask;
+
+	/* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
+	ui = e == 0x7f800000 ? ui : rui;
+
+	/* flush denormals to zero */
+	ui = e == 0 ? 0 : ui;
+
+	u.ui = ui;
+	return u.f;
+}
+#endif
+
+/* Internal implementation helpers */
+#ifdef __cplusplus
+class meshopt_Allocator
+{
+public:
+	template <typename T>
+	struct StorageT
+	{
+		static void* (*allocate)(size_t);
+		static void (*deallocate)(void*);
+	};
+
+	typedef StorageT<void> Storage;
+
+	meshopt_Allocator()
+		: blocks()
+		, count(0)
+	{
+	}
+
+	~meshopt_Allocator()
+	{
+		for (size_t i = count; i > 0; --i)
+			Storage::deallocate(blocks[i - 1]);
+	}
+
+	template <typename T> T* allocate(size_t size)
+	{
+		assert(count < sizeof(blocks) / sizeof(blocks[0]));
+		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
+		blocks[count++] = result;
+		return result;
+	}
+
+private:
+	void* blocks[24];
+	size_t count;
+};
+
+// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
+template <typename T> void* (*meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
+template <typename T> void (*meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
+#endif
+
+/* Inline implementation for C++ templated wrappers */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T, bool ZeroCopy = sizeof(T) == sizeof(unsigned int)>
+struct meshopt_IndexAdapter;
+
+template <typename T>
+struct meshopt_IndexAdapter<T, false>
+{
+	T* result;
+	unsigned int* data;
+	size_t count;
+
+	meshopt_IndexAdapter(T* result_, const T* input, size_t count_)
+	    : result(result_)
+	    , data(0)
+	    , count(count_)
+	{
+		size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
+
+		data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
+
+		if (input)
+		{
+			for (size_t i = 0; i < count; ++i)
+				data[i] = input[i];
+		}
+	}
+
+	~meshopt_IndexAdapter()
+	{
+		if (result)
+		{
+			for (size_t i = 0; i < count; ++i)
+				result[i] = T(data[i]);
+		}
+
+		meshopt_Allocator::Storage::deallocate(data);
+	}
+};
+
+template <typename T>
+struct meshopt_IndexAdapter<T, true>
+{
+	unsigned int* data;
+
+	meshopt_IndexAdapter(T* result, const T* input, size_t)
+	    : data(reinterpret_cast<unsigned int*>(result ? result : const_cast<T*>(input)))
+	{
+	}
+};
+
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+
+	return meshopt_generateVertexRemap(destination, indices ? in.data : 0, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+
+	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : 0, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
+{
+	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_remapIndexBuffer(out.data, indices ? in.data : 0, index_count, remap);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_optimizeVertexCacheStrip(out.data, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size);
+}
+
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> inout(indices, indices, index_count);
+
+	return meshopt_optimizeVertexFetch(destination, inout.data, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+	(void)index_size_valid;
+
+	return meshopt_decodeIndexBuffer(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_encodeIndexSequence(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+	(void)index_size_valid;
+
+	return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, target_index_count);
+
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count);
+}
+
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, (index_count / 3) * 5);
+
+	return meshopt_stripify(out.data, in.data, index_count, vertex_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, (index_count - 2) * 3);
+
+	return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+}
+
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles);
+}
+
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_spatialSortTriangles(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+#endif
+
+/**
+ * Copyright (c) 2016-2020 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/thirdparty/meshoptimizer/overdrawanalyzer.cpp b/thirdparty/meshoptimizer/overdrawanalyzer.cpp
new file mode 100644
index 0000000000..8d5859ba39
--- /dev/null
+++ b/thirdparty/meshoptimizer/overdrawanalyzer.cpp
@@ -0,0 +1,230 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Nicolas Capens. Advanced Rasterization. 2004
+namespace meshopt
+{
+
+const int kViewport = 256;
+
+struct OverdrawBuffer
+{
+	float z[kViewport][kViewport][2];
+	unsigned int overdraw[kViewport][kViewport][2];
+};
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef max
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
+{
+	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
+	// z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
+	// (x2-x1 y2-y1)(dzdx) = (z2-z1)
+	// (x3-x1 y3-y1)(dzdy)   (z3-z1)
+	// we'll solve it with Cramer's rule
+	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
+	float invdet = (det == 0) ? 0 : 1 / det;
+
+	dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet;
+	dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet;
+
+	return det;
+}
+
+// half-space fixed point triangle rasterizer
+static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, float v2x, float v2y, float v2z, float v3x, float v3y, float v3z)
+{
+	// compute depth gradients
+	float DZx, DZy;
+	float det = computeDepthGradients(DZx, DZy, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
+	int sign = det > 0;
+
+	// flip backfacing triangles to simplify rasterization logic
+	if (sign)
+	{
+		// flipping v2 & v3 preserves depth gradients since they're based on v1
+		float t;
+		t = v2x, v2x = v3x, v3x = t;
+		t = v2y, v2y = v3y, v3y = t;
+		t = v2z, v2z = v3z, v3z = t;
+
+		// flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
+		v1z = kViewport - v1z;
+		DZx = -DZx;
+		DZy = -DZy;
+	}
+
+	// coordinates, 28.4 fixed point
+	int X1 = int(16.0f * v1x + 0.5f);
+	int X2 = int(16.0f * v2x + 0.5f);
+	int X3 = int(16.0f * v3x + 0.5f);
+
+	int Y1 = int(16.0f * v1y + 0.5f);
+	int Y2 = int(16.0f * v2y + 0.5f);
+	int Y3 = int(16.0f * v3y + 0.5f);
+
+	// bounding rectangle, clipped against viewport
+	// since we rasterize pixels with covered centers, min >0.5 should round up
+	// as for max, due to top-left filling convention we will never rasterize right/bottom edges
+	// so max >= 0.5 should round down
+	int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
+	int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
+	int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
+	int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+
+	// deltas, 28.4 fixed point
+	int DX12 = X1 - X2;
+	int DX23 = X2 - X3;
+	int DX31 = X3 - X1;
+
+	int DY12 = Y1 - Y2;
+	int DY23 = Y2 - Y3;
+	int DY31 = Y3 - Y1;
+
+	// fill convention correction
+	int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0);
+	int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0);
+	int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0);
+
+	// half edge equations, 24.8 fixed point
+	// note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
+	int FX = (minx << 4) + 8;
+	int FY = (miny << 4) + 8;
+	int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1;
+	int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1;
+	int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1;
+	float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f);
+
+	for (int y = miny; y < maxy; y++)
+	{
+		int CX1 = CY1;
+		int CX2 = CY2;
+		int CX3 = CY3;
+		float ZX = ZY;
+
+		for (int x = minx; x < maxx; x++)
+		{
+			// check if all CXn are non-negative
+			if ((CX1 | CX2 | CX3) >= 0)
+			{
+				if (ZX >= buffer->z[y][x][sign])
+				{
+					buffer->z[y][x][sign] = ZX;
+					buffer->overdraw[y][x][sign]++;
+				}
+			}
+
+			// signed left shift is UB for negative numbers so use unsigned-signed casts
+			CX1 -= int(unsigned(DY12) << 4);
+			CX2 -= int(unsigned(DY23) << 4);
+			CX3 -= int(unsigned(DY31) << 4);
+			ZX += DZx;
+		}
+
+		// signed left shift is UB for negative numbers so use unsigned-signed casts
+		CY1 += int(unsigned(DX12) << 4);
+		CY2 += int(unsigned(DX23) << 4);
+		CY3 += int(unsigned(DX31) << 4);
+		ZY += DZy;
+	}
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	meshopt_OverdrawStatistics result = {};
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions + i * vertex_stride_float;
+
+		for (int j = 0; j < 3; ++j)
+		{
+			minv[j] = min(minv[j], v[j]);
+			maxv[j] = max(maxv[j], v[j]);
+		}
+	}
+
+	float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
+	float scale = kViewport / extent;
+
+	float* triangles = allocator.allocate<float>(index_count * 3);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		const float* v = vertex_positions + index * vertex_stride_float;
+
+		triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
+		triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
+		triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
+	}
+
+	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		memset(buffer, 0, sizeof(OverdrawBuffer));
+
+		for (size_t i = 0; i < index_count; i += 3)
+		{
+			const float* vn0 = &triangles[3 * (i + 0)];
+			const float* vn1 = &triangles[3 * (i + 1)];
+			const float* vn2 = &triangles[3 * (i + 2)];
+
+			switch (axis)
+			{
+			case 0:
+				rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
+				break;
+			case 1:
+				rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
+				break;
+			case 2:
+				rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
+				break;
+			}
+		}
+
+		for (int y = 0; y < kViewport; ++y)
+			for (int x = 0; x < kViewport; ++x)
+				for (int s = 0; s < 2; ++s)
+				{
+					unsigned int overdraw = buffer->overdraw[y][x][s];
+
+					result.pixels_covered += overdraw > 0;
+					result.pixels_shaded += overdraw;
+				}
+	}
+
+	result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f;
+
+	return result;
+}
diff --git a/thirdparty/meshoptimizer/overdrawoptimizer.cpp b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
new file mode 100644
index 0000000000..143656ed76
--- /dev/null
+++ b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
@@ -0,0 +1,333 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+{
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	float mesh_centroid[3] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		const float* p = vertex_positions + vertex_stride_float * indices[i];
+
+		mesh_centroid[0] += p[0];
+		mesh_centroid[1] += p[1];
+		mesh_centroid[2] += p[2];
+	}
+
+	mesh_centroid[0] /= index_count;
+	mesh_centroid[1] /= index_count;
+	mesh_centroid[2] /= index_count;
+
+	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
+	{
+		size_t cluster_begin = clusters[cluster] * 3;
+		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
+		assert(cluster_begin < cluster_end);
+
+		float cluster_area = 0;
+		float cluster_centroid[3] = {};
+		float cluster_normal[3] = {};
+
+		for (size_t i = cluster_begin; i < cluster_end; i += 3)
+		{
+			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
+			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
+			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];
+
+			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+			float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+			float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
+
+			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
+			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
+			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
+			cluster_normal[0] += normalx;
+			cluster_normal[1] += normaly;
+			cluster_normal[2] += normalz;
+			cluster_area += area;
+		}
+
+		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;
+
+		cluster_centroid[0] *= inv_cluster_area;
+		cluster_centroid[1] *= inv_cluster_area;
+		cluster_centroid[2] *= inv_cluster_area;
+
+		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
+		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;
+
+		cluster_normal[0] *= inv_cluster_normal_length;
+		cluster_normal[1] *= inv_cluster_normal_length;
+		cluster_normal[2] *= inv_cluster_normal_length;
+
+		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};
+
+		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
+	}
+}
+
+static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
+{
+	// compute sort data bounds and renormalize, using fixed point snorm
+	float sort_data_max = 1e-3f;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		float dpa = fabsf(sort_data[i]);
+
+		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
+	}
+
+	const int sort_bits = 11;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		// note that we flip distribution since high dot product should come first
+		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);
+
+		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
+	}
+
+	// fill histogram for counting sort
+	unsigned int histogram[1 << sort_bits];
+	memset(histogram, 0, sizeof(histogram));
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		histogram[sort_keys[i]]++;
+	}
+
+	// compute offsets based on histogram data
+	size_t histogram_sum = 0;
+
+	for (size_t i = 0; i < 1 << sort_bits; ++i)
+	{
+		size_t count = histogram[i];
+		histogram[i] = unsigned(histogram_sum);
+		histogram_sum += count;
+	}
+
+	assert(histogram_sum == cluster_count);
+
+	// compute sort order based on offsets
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
+	}
+}
+
+static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
+{
+	unsigned int cache_misses = 0;
+
+	// if vertex is not in cache, put it in cache
+	if (timestamp - cache_timestamps[a] > cache_size)
+	{
+		cache_timestamps[a] = timestamp++;
+		cache_misses++;
+	}
+
+	if (timestamp - cache_timestamps[b] > cache_size)
+	{
+		cache_timestamps[b] = timestamp++;
+		cache_misses++;
+	}
+
+	if (timestamp - cache_timestamps[c] > cache_size)
+	{
+		cache_timestamps[c] = timestamp++;
+		cache_misses++;
+	}
+
+	return cache_misses;
+}
+
+static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
+{
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = cache_size + 1;
+
+	size_t face_count = index_count / 3;
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+		// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
+		// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
+		// suggests an inefficiency in the vertex cache optimization algorithm
+		// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
+		if (i == 0 || m == 3)
+		{
+			destination[result++] = unsigned(i);
+		}
+	}
+
+	assert(result <= index_count / 3);
+
+	return result;
+}
+
+static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
+{
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = 0;
+
+	size_t result = 0;
+
+	for (size_t it = 0; it < cluster_count; ++it)
+	{
+		size_t start = clusters[it];
+		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
+		assert(start < end);
+
+		// reset cache
+		timestamp += cache_size + 1;
+
+		// measure cluster ACMR
+		unsigned int cluster_misses = 0;
+
+		for (size_t i = start; i < end; ++i)
+		{
+			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+			cluster_misses += m;
+		}
+
+		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));
+
+		// first cluster always starts from the hard cluster boundary
+		destination[result++] = unsigned(start);
+
+		// reset cache
+		timestamp += cache_size + 1;
+
+		unsigned int running_misses = 0;
+		unsigned int running_faces = 0;
+
+		for (size_t i = start; i < end; ++i)
+		{
+			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+			running_misses += m;
+			running_faces += 1;
+
+			if (float(running_misses) / float(running_faces) <= cluster_threshold)
+			{
+				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
+				// note that this may mean that we add 'end` to destination for the last triangle, which will imply that the last
+				// cluster is empty; however, the 'pop_back' after the loop will clean it up
+				destination[result++] = unsigned(i + 1);
+
+				// reset cache
+				timestamp += cache_size + 1;
+
+				running_misses = 0;
+				running_faces = 0;
+			}
+		}
+
+		// each time we reach the target ACMR we flush the cluster
+		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
+		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
+		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
+		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
+		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
+		if (destination[result - 1] != start)
+		{
+			result--;
+		}
+	}
+
+	assert(result >= cluster_count);
+	assert(result <= index_count / 3);
+
+	return result;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	unsigned int cache_size = 16;
+
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+
+	// generate hard boundaries from full-triangle cache misses
+	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
+	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);
+
+	// generate soft boundaries
+	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
+	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);
+
+	const unsigned int* clusters = soft_clusters;
+	size_t cluster_count = soft_cluster_count;
+
+	// fill sort data
+	float* sort_data = allocator.allocate<float>(cluster_count);
+	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+
+	// sort clusters using sort data
+	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
+	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
+	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);
+
+	// fill output buffer
+	size_t offset = 0;
+
+	for (size_t it = 0; it < cluster_count; ++it)
+	{
+		unsigned int cluster = sort_order[it];
+		assert(cluster < cluster_count);
+
+		size_t cluster_begin = clusters[cluster] * 3;
+		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
+		assert(cluster_begin < cluster_end);
+
+		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
+		offset += cluster_end - cluster_begin;
+	}
+
+	assert(offset == index_count);
+}
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
new file mode 100644
index 0000000000..bd523275ce
--- /dev/null
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -0,0 +1,1529 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <string.h>
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+// This work is based on:
+// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
+// Michael Garland. Quadric-based polygonal surface simplification. 1999
+// Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
+// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
+namespace meshopt
+{
+
+struct EdgeAdjacency
+{
+	unsigned int* counts;
+	unsigned int* offsets;
+	unsigned int* data;
+};
+
+static void buildEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill edge counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill edge data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = b;
+		adjacency.data[adjacency.offsets[b]++] = c;
+		adjacency.data[adjacency.offsets[c]++] = a;
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i];
+	}
+}
+
+struct PositionHasher
+{
+	const float* vertex_positions;
+	size_t vertex_stride_float;
+
+	size_t hash(unsigned int index) const
+	{
+		const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float);
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (key[0] * 73856093) ^ (key[1] * 19349663) ^ (key[2] * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return memcmp(vertex_positions + lhs * vertex_stride_float, vertex_positions + rhs * vertex_stride_float, sizeof(float) * 3) == 0;
+	}
+};
+
+static size_t hashBuckets2(size_t count)
+{
+	size_t buckets = 1;
+	while (buckets < count)
+		buckets *= 2;
+
+	return buckets;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup2(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{
+	assert(buckets > 0);
+	assert((buckets & (buckets - 1)) == 0);
+
+	size_t hashmod = buckets - 1;
+	size_t bucket = hash.hash(key) & hashmod;
+
+	for (size_t probe = 0; probe <= hashmod; ++probe)
+	{
+		T& item = table[bucket];
+
+		if (item == empty)
+			return &item;
+
+		if (hash.equal(item, key))
+			return &item;
+
+		// hash collision, quadratic probing
+		bucket = (bucket + probe + 1) & hashmod;
+	}
+
+	assert(false && "Hash table is full"); // unreachable
+	return 0;
+}
+
+static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
+{
+	PositionHasher hasher = {vertex_positions_data, vertex_positions_stride / sizeof(float)};
+
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	// build forward remap: for each vertex, which other (canonical) vertex does it map to?
+	// we use position equivalence for this, and remap vertices to other existing vertices
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int index = unsigned(i);
+		unsigned int* entry = hashLookup2(table, table_size, hasher, index, ~0u);
+
+		if (*entry == ~0u)
+			*entry = index;
+
+		remap[index] = *entry;
+	}
+
+	// build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex?
+	// entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i
+	for (size_t i = 0; i < vertex_count; ++i)
+		wedge[i] = unsigned(i);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != i)
+		{
+			unsigned int r = remap[i];
+
+			wedge[i] = wedge[r];
+			wedge[r] = unsigned(i);
+		}
+}
+
+enum VertexKind
+{
+	Kind_Manifold, // not on an attribute seam, not on any boundary
+	Kind_Border,   // not on an attribute seam, has exactly two open edges
+	Kind_Seam,     // on an attribute seam with exactly two attribute seam edges
+	Kind_Complex,  // none of the above; these vertices can move as long as all wedges move to the target vertex
+	Kind_Locked,   // none of the above; these vertices can't move
+
+	Kind_Count
+};
+
+// manifold vertices can collapse onto anything
+// border/seam vertices can only be collapsed onto border/seam respectively
+// complex vertices can collapse onto complex/locked
+// a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
+// for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
+const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
+    {1, 1, 1, 1, 1},
+    {0, 1, 0, 0, 0},
+    {0, 0, 1, 0, 0},
+    {0, 0, 0, 1, 1},
+    {0, 0, 0, 0, 0},
+};
+
+// if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge
+// note that for seam edges, the opposite edge isn't present in the attribute-based topology
+// but is present if you consider a position-only mesh variant
+const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {
+    {1, 1, 1, 0, 1},
+    {1, 0, 1, 0, 0},
+    {1, 1, 1, 0, 1},
+    {0, 0, 0, 0, 0},
+    {1, 0, 1, 0, 0},
+};
+
+static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b)
+{
+	unsigned int count = adjacency.counts[a];
+	const unsigned int* data = adjacency.data + adjacency.offsets[a];
+
+	for (size_t i = 0; i < count; ++i)
+		if (data[i] == b)
+			return true;
+
+	return false;
+}
+
+static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge)
+{
+	memset(loop, -1, vertex_count * sizeof(unsigned int));
+	memset(loopback, -1, vertex_count * sizeof(unsigned int));
+
+	// incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
+	// note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
+	// but here it's okay to fill the data out for other types of vertices as well
+	unsigned int* openinc = loopback;
+	unsigned int* openout = loop;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int vertex = unsigned(i);
+
+		unsigned int count = adjacency.counts[vertex];
+		const unsigned int* data = adjacency.data + adjacency.offsets[vertex];
+
+		for (size_t j = 0; j < count; ++j)
+		{
+			unsigned int target = data[j];
+
+			if (!hasEdge(adjacency, target, vertex))
+			{
+				openinc[target] = (openinc[target] == ~0u) ? vertex : target;
+				openout[vertex] = (openout[vertex] == ~0u) ? target : vertex;
+			}
+		}
+	}
+
+#if TRACE
+	size_t lockedstats[4] = {};
+#define TRACELOCKED(i) lockedstats[i]++;
+#else
+#define TRACELOCKED(i) (void)0
+#endif
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] == i)
+		{
+			if (wedge[i] == i)
+			{
+				// no attribute seam, need to check if it's manifold
+				unsigned int openi = openinc[i], openo = openout[i];
+
+				// note: we classify any vertices with no open edges as manifold
+				// this is technically incorrect - if 4 triangles share an edge, we'll classify vertices as manifold
+				// it's unclear if this is a problem in practice
+				if (openi == ~0u && openo == ~0u)
+				{
+					result[i] = Kind_Manifold;
+				}
+				else if (openi != i && openo != i)
+				{
+					result[i] = Kind_Border;
+				}
+				else
+				{
+					result[i] = Kind_Locked;
+					TRACELOCKED(0);
+				}
+			}
+			else if (wedge[wedge[i]] == i)
+			{
+				// attribute seam; need to distinguish between Seam and Locked
+				unsigned int w = wedge[i];
+				unsigned int openiv = openinc[i], openov = openout[i];
+				unsigned int openiw = openinc[w], openow = openout[w];
+
+				// seam should have one open half-edge for each vertex, and the edges need to "connect" - point to the same vertex post-remap
+				if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
+				    openiw != ~0u && openiw != w && openow != ~0u && openow != w)
+				{
+					if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw])
+					{
+						result[i] = Kind_Seam;
+					}
+					else
+					{
+						result[i] = Kind_Locked;
+						TRACELOCKED(1);
+					}
+				}
+				else
+				{
+					result[i] = Kind_Locked;
+					TRACELOCKED(2);
+				}
+			}
+			else
+			{
+				// more than one vertex maps to this one; we don't have classification available
+				result[i] = Kind_Locked;
+				TRACELOCKED(3);
+			}
+		}
+		else
+		{
+			assert(remap[i] < i);
+
+			result[i] = result[remap[i]];
+		}
+	}
+
+#if TRACE
+	printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n",
+	       int(lockedstats[0]), int(lockedstats[1]), int(lockedstats[2]), int(lockedstats[3]));
+#endif
+}
+
+struct Vector3
+{
+	float x, y, z;
+};
+
+static void rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+{
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		result[i].x = v[0];
+		result[i].y = v[1];
+		result[i].z = v[2];
+
+		for (int j = 0; j < 3; ++j)
+		{
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
+		}
+	}
+
+	float extent = 0.f;
+
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	float scale = extent == 0 ? 0.f : 1.f / extent;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		result[i].x = (result[i].x - minv[0]) * scale;
+		result[i].y = (result[i].y - minv[1]) * scale;
+		result[i].z = (result[i].z - minv[2]) * scale;
+	}
+}
+
+struct Quadric
+{
+	float a00, a11, a22;
+	float a10, a20, a21;
+	float b0, b1, b2, c;
+	float w;
+};
+
+struct Collapse
+{
+	unsigned int v0;
+	unsigned int v1;
+
+	union
+	{
+		unsigned int bidi;
+		float error;
+		unsigned int errorui;
+	};
+};
+
+static float normalize(Vector3& v)
+{
+	float length = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
+
+	if (length > 0)
+	{
+		v.x /= length;
+		v.y /= length;
+		v.z /= length;
+	}
+
+	return length;
+}
+
+static void quadricAdd(Quadric& Q, const Quadric& R)
+{
+	Q.a00 += R.a00;
+	Q.a11 += R.a11;
+	Q.a22 += R.a22;
+	Q.a10 += R.a10;
+	Q.a20 += R.a20;
+	Q.a21 += R.a21;
+	Q.b0 += R.b0;
+	Q.b1 += R.b1;
+	Q.b2 += R.b2;
+	Q.c += R.c;
+	Q.w += R.w;
+}
+
+static float quadricError(const Quadric& Q, const Vector3& v)
+{
+	float rx = Q.b0;
+	float ry = Q.b1;
+	float rz = Q.b2;
+
+	rx += Q.a10 * v.y;
+	ry += Q.a21 * v.z;
+	rz += Q.a20 * v.x;
+
+	rx *= 2;
+	ry *= 2;
+	rz *= 2;
+
+	rx += Q.a00 * v.x;
+	ry += Q.a11 * v.y;
+	rz += Q.a22 * v.z;
+
+	float r = Q.c;
+	r += rx * v.x;
+	r += ry * v.y;
+	r += rz * v.z;
+
+	float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
+
+	return fabsf(r) * s;
+}
+
+static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
+{
+	float aw = a * w;
+	float bw = b * w;
+	float cw = c * w;
+	float dw = d * w;
+
+	Q.a00 = a * aw;
+	Q.a11 = b * bw;
+	Q.a22 = c * cw;
+	Q.a10 = a * bw;
+	Q.a20 = a * cw;
+	Q.a21 = b * cw;
+	Q.b0 = a * dw;
+	Q.b1 = b * dw;
+	Q.b2 = c * dw;
+	Q.c = d * dw;
+	Q.w = w;
+}
+
+static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
+{
+	// we need to encode (x - X) ^ 2 + (y - Y)^2 + (z - Z)^2 into the quadric
+	Q.a00 = w;
+	Q.a11 = w;
+	Q.a22 = w;
+	Q.a10 = 0.f;
+	Q.a20 = 0.f;
+	Q.a21 = 0.f;
+	Q.b0 = -2.f * x * w;
+	Q.b1 = -2.f * y * w;
+	Q.b2 = -2.f * z * w;
+	Q.c = (x * x + y * y + z * z) * w;
+	Q.w = w;
+}
+
+static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
+{
+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+	// normal = cross(p1 - p0, p2 - p0)
+	Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
+	float area = normalize(normal);
+
+	float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+
+	// we use sqrtf(area) so that the error is scaled linearly; this tends to improve silhouettes
+	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, sqrtf(area) * weight);
+}
+
+static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
+{
+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	float length = normalize(p10);
+
+	// p20p = length of projection of p2-p0 onto normalize(p1 - p0)
+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+	float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z;
+
+	// normal = altitude of triangle from point p2 onto edge p1-p0
+	Vector3 normal = {p20.x - p10.x * p20p, p20.y - p10.y * p20p, p20.z - p10.z * p20p};
+	normalize(normal);
+
+	float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+
+	// note: the weight is scaled linearly with edge length; this has to match the triangle weight
+	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
+}
+
+static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int i0 = indices[i + 0];
+		unsigned int i1 = indices[i + 1];
+		unsigned int i2 = indices[i + 2];
+
+		Quadric Q;
+		quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f);
+
+		quadricAdd(vertex_quadrics[remap[i0]], Q);
+		quadricAdd(vertex_quadrics[remap[i1]], Q);
+		quadricAdd(vertex_quadrics[remap[i2]], Q);
+	}
+}
+
+static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			// check that either i0 or i1 are border/seam and are on the same edge loop
+			// note that we need to add the error even for edged that connect e.g. border & locked
+			// if we don't do that, the adjacent border->border edge won't have correct errors for corners
+			if (k0 != Kind_Border && k0 != Kind_Seam && k1 != Kind_Border && k1 != Kind_Seam)
+				continue;
+
+			if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+
+			if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
+				continue;
+
+			// seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+
+			unsigned int i2 = indices[i + next[next[e]]];
+
+			// we try hard to maintain border edge geometry; seam edges can move more freely
+			// due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical
+			const float kEdgeWeightSeam = 1.f;
+			const float kEdgeWeightBorder = 10.f;
+
+			float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
+
+			Quadric Q;
+			quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+
+			quadricAdd(vertex_quadrics[remap[i0]], Q);
+			quadricAdd(vertex_quadrics[remap[i1]], Q);
+		}
+	}
+}
+
+static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+{
+	size_t collapse_count = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			// this can happen either when input has a zero-length edge, or when we perform collapses for complex
+			// topology w/seams and collapse a manifold vertex that connects to both wedges onto one of them
+			// we leave edges like this alone since they may be important for preserving mesh integrity
+			if (remap[i0] == remap[i1])
+				continue;
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			// the edge has to be collapsible in at least one direction
+			if (!(kCanCollapse[k0][k1] | kCanCollapse[k1][k0]))
+				continue;
+
+			// manifold and seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+
+			// two vertices are on a border or a seam, but there's no direct edge between them
+			// this indicates that they belong to two different edge loops and we should not collapse this edge
+			// loop[] tracks half edges so we only need to check i0->i1
+			if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+
+			// edge can be collapsed in either direction - we will pick the one with minimum error
+			// note: we evaluate error later during collapse ranking, here we just tag the edge as bidirectional
+			if (kCanCollapse[k0][k1] & kCanCollapse[k1][k0])
+			{
+				Collapse c = {i0, i1, {/* bidi= */ 1}};
+				collapses[collapse_count++] = c;
+			}
+			else
+			{
+				// edge can only be collapsed in one direction
+				unsigned int e0 = kCanCollapse[k0][k1] ? i0 : i1;
+				unsigned int e1 = kCanCollapse[k0][k1] ? i1 : i0;
+
+				Collapse c = {e0, e1, {/* bidi= */ 0}};
+				collapses[collapse_count++] = c;
+			}
+		}
+	}
+
+	return collapse_count;
+}
+
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const unsigned int* remap)
+{
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		Collapse& c = collapses[i];
+
+		unsigned int i0 = c.v0;
+		unsigned int i1 = c.v1;
+
+		// most edges are bidirectional which means we need to evaluate errors for two collapses
+		// to keep this code branchless we just use the same edge for unidirectional edges
+		unsigned int j0 = c.bidi ? i1 : i0;
+		unsigned int j1 = c.bidi ? i0 : i1;
+
+		const Quadric& qi = vertex_quadrics[remap[i0]];
+		const Quadric& qj = vertex_quadrics[remap[j0]];
+
+		float ei = quadricError(qi, vertex_positions[i1]);
+		float ej = quadricError(qj, vertex_positions[j1]);
+
+		// pick edge direction with minimal error
+		c.v0 = ei <= ej ? i0 : j0;
+		c.v1 = ei <= ej ? i1 : j1;
+		c.error = ei <= ej ? ei : ej;
+	}
+}
+
+#if TRACE > 1
+static void dumpEdgeCollapses(const Collapse* collapses, size_t collapse_count, const unsigned char* vertex_kind)
+{
+	size_t ckinds[Kind_Count][Kind_Count] = {};
+	float cerrors[Kind_Count][Kind_Count] = {};
+
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			cerrors[k0][k1] = FLT_MAX;
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		unsigned int i0 = collapses[i].v0;
+		unsigned int i1 = collapses[i].v1;
+
+		unsigned char k0 = vertex_kind[i0];
+		unsigned char k1 = vertex_kind[i1];
+
+		ckinds[k0][k1]++;
+		cerrors[k0][k1] = (collapses[i].error < cerrors[k0][k1]) ? collapses[i].error : cerrors[k0][k1];
+	}
+
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			if (ckinds[k0][k1])
+				printf("collapses %d -> %d: %d, min error %e\n", k0, k1, int(ckinds[k0][k1]), cerrors[k0][k1]);
+}
+
+static void dumpLockedCollapses(const unsigned int* indices, size_t index_count, const unsigned char* vertex_kind)
+{
+	size_t locked_collapses[Kind_Count][Kind_Count] = {};
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			locked_collapses[k0][k1] += !kCanCollapse[k0][k1] && !kCanCollapse[k1][k0];
+		}
+	}
+
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			if (locked_collapses[k0][k1])
+				printf("locked collapses %d -> %d: %d\n", k0, k1, int(locked_collapses[k0][k1]));
+}
+#endif
+
+static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count)
+{
+	const int sort_bits = 11;
+
+	// fill histogram for counting sort
+	unsigned int histogram[1 << sort_bits];
+	memset(histogram, 0, sizeof(histogram));
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		// skip sign bit since error is non-negative
+		unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+
+		histogram[key]++;
+	}
+
+	// compute offsets based on histogram data
+	size_t histogram_sum = 0;
+
+	for (size_t i = 0; i < 1 << sort_bits; ++i)
+	{
+		size_t count = histogram[i];
+		histogram[i] = unsigned(histogram_sum);
+		histogram_sum += count;
+	}
+
+	assert(histogram_sum == collapse_count);
+
+	// compute sort order based on offsets
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		// skip sign bit since error is non-negative
+		unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+
+		sort_order[histogram[key]++] = unsigned(i);
+	}
+}
+
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, size_t triangle_collapse_goal, float error_goal, float error_limit)
+{
+	size_t edge_collapses = 0;
+	size_t triangle_collapses = 0;
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		const Collapse& c = collapses[collapse_order[i]];
+
+		if (c.error > error_limit)
+			break;
+
+		if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 10)
+			break;
+
+		if (triangle_collapses >= triangle_collapse_goal)
+			break;
+
+		unsigned int i0 = c.v0;
+		unsigned int i1 = c.v1;
+
+		unsigned int r0 = remap[i0];
+		unsigned int r1 = remap[i1];
+
+		// we don't collapse vertices that had source or target vertex involved in a collapse
+		// it's important to not move the vertices twice since it complicates the tracking/remapping logic
+		// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
+		if (collapse_locked[r0] | collapse_locked[r1])
+			continue;
+
+		assert(collapse_remap[r0] == r0);
+		assert(collapse_remap[r1] == r1);
+
+		quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
+
+		if (vertex_kind[i0] == Kind_Complex)
+		{
+			unsigned int v = i0;
+
+			do
+			{
+				collapse_remap[v] = r1;
+				v = wedge[v];
+			} while (v != i0);
+		}
+		else if (vertex_kind[i0] == Kind_Seam)
+		{
+			// remap v0 to v1 and seam pair of v0 to seam pair of v1
+			unsigned int s0 = wedge[i0];
+			unsigned int s1 = wedge[i1];
+
+			assert(s0 != i0 && s1 != i1);
+			assert(wedge[s0] == i0 && wedge[s1] == i1);
+
+			collapse_remap[i0] = i1;
+			collapse_remap[s0] = s1;
+		}
+		else
+		{
+			assert(wedge[i0] == i0);
+
+			collapse_remap[i0] = i1;
+		}
+
+		collapse_locked[r0] = 1;
+		collapse_locked[r1] = 1;
+
+		// border edges collapse 1 triangle, other edges collapse 2 or more
+		triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+		edge_collapses++;
+	}
+
+	return edge_collapses;
+}
+
+static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
+{
+	size_t write = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int v0 = collapse_remap[indices[i + 0]];
+		unsigned int v1 = collapse_remap[indices[i + 1]];
+		unsigned int v2 = collapse_remap[indices[i + 2]];
+
+		// we never move the vertex twice during a single pass
+		assert(collapse_remap[v0] == v0);
+		assert(collapse_remap[v1] == v1);
+		assert(collapse_remap[v2] == v2);
+
+		if (v0 != v1 && v0 != v2 && v1 != v2)
+		{
+			indices[write + 0] = v0;
+			indices[write + 1] = v1;
+			indices[write + 2] = v2;
+			write += 3;
+		}
+	}
+
+	return write;
+}
+
+static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsigned int* collapse_remap)
+{
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (loop[i] != ~0u)
+		{
+			unsigned int l = loop[i];
+			unsigned int r = collapse_remap[l];
+
+			// i == r is a special case when the seam edge is collapsed in a direction opposite to where loop goes
+			loop[i] = (i == r) ? loop[l] : r;
+		}
+	}
+}
+
+struct CellHasher
+{
+	const unsigned int* vertex_ids;
+
+	size_t hash(unsigned int i) const
+	{
+		unsigned int h = vertex_ids[i];
+
+		// MurmurHash2 finalizer
+		h ^= h >> 13;
+		h *= 0x5bd1e995;
+		h ^= h >> 15;
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return vertex_ids[lhs] == vertex_ids[rhs];
+	}
+};
+
+struct IdHasher
+{
+	size_t hash(unsigned int id) const
+	{
+		unsigned int h = id;
+
+		// MurmurHash2 finalizer
+		h ^= h >> 13;
+		h *= 0x5bd1e995;
+		h ^= h >> 15;
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return lhs == rhs;
+	}
+};
+
+struct TriangleHasher
+{
+	unsigned int* indices;
+
+	size_t hash(unsigned int i) const
+	{
+		const unsigned int* tri = indices + i * 3;
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (tri[0] * 73856093) ^ (tri[1] * 19349663) ^ (tri[2] * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		const unsigned int* lt = indices + lhs * 3;
+		const unsigned int* rt = indices + rhs * 3;
+
+		return lt[0] == rt[0] && lt[1] == rt[1] && lt[2] == rt[2];
+	}
+};
+
+static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size)
+{
+	assert(grid_size >= 1 && grid_size <= 1024);
+	float cell_scale = float(grid_size - 1);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const Vector3& v = vertex_positions[i];
+
+		int xi = int(v.x * cell_scale + 0.5f);
+		int yi = int(v.y * cell_scale + 0.5f);
+		int zi = int(v.z * cell_scale + 0.5f);
+
+		vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
+	}
+}
+
+static size_t countTriangles(const unsigned int* vertex_ids, const unsigned int* indices, size_t index_count)
+{
+	size_t result = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int id0 = vertex_ids[indices[i + 0]];
+		unsigned int id1 = vertex_ids[indices[i + 1]];
+		unsigned int id2 = vertex_ids[indices[i + 2]];
+
+		result += (id0 != id1) & (id0 != id2) & (id1 != id2);
+	}
+
+	return result;
+}
+
+static size_t fillVertexCells(unsigned int* table, size_t table_size, unsigned int* vertex_cells, const unsigned int* vertex_ids, size_t vertex_count)
+{
+	CellHasher hasher = {vertex_ids};
+
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int* entry = hashLookup2(table, table_size, hasher, unsigned(i), ~0u);
+
+		if (*entry == ~0u)
+		{
+			*entry = unsigned(i);
+			vertex_cells[i] = unsigned(result++);
+		}
+		else
+		{
+			vertex_cells[i] = vertex_cells[*entry];
+		}
+	}
+
+	return result;
+}
+
+static size_t countVertexCells(unsigned int* table, size_t table_size, const unsigned int* vertex_ids, size_t vertex_count)
+{
+	IdHasher hasher;
+
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int id = vertex_ids[i];
+		unsigned int* entry = hashLookup2(table, table_size, hasher, id, ~0u);
+
+		result += (*entry == ~0u);
+		*entry = id;
+	}
+
+	return result;
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* vertex_cells)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int i0 = indices[i + 0];
+		unsigned int i1 = indices[i + 1];
+		unsigned int i2 = indices[i + 2];
+
+		unsigned int c0 = vertex_cells[i0];
+		unsigned int c1 = vertex_cells[i1];
+		unsigned int c2 = vertex_cells[i2];
+
+		bool single_cell = (c0 == c1) & (c0 == c2);
+
+		Quadric Q;
+		quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], single_cell ? 3.f : 1.f);
+
+		if (single_cell)
+		{
+			quadricAdd(cell_quadrics[c0], Q);
+		}
+		else
+		{
+			quadricAdd(cell_quadrics[c0], Q);
+			quadricAdd(cell_quadrics[c1], Q);
+			quadricAdd(cell_quadrics[c2], Q);
+		}
+	}
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* vertex_cells)
+{
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int c = vertex_cells[i];
+		const Vector3& v = vertex_positions[i];
+
+		Quadric Q;
+		quadricFromPoint(Q, v.x, v.y, v.z, 1.f);
+
+		quadricAdd(cell_quadrics[c], Q);
+	}
+}
+
+static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t cell_count, const unsigned int* vertex_cells, const Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count)
+{
+	memset(cell_remap, -1, cell_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int cell = vertex_cells[i];
+		float error = quadricError(cell_quadrics[cell], vertex_positions[i]);
+
+		if (cell_remap[cell] == ~0u || cell_errors[cell] > error)
+		{
+			cell_remap[cell] = unsigned(i);
+			cell_errors[cell] = error;
+		}
+	}
+}
+
+static size_t filterTriangles(unsigned int* destination, unsigned int* tritable, size_t tritable_size, const unsigned int* indices, size_t index_count, const unsigned int* vertex_cells, const unsigned int* cell_remap)
+{
+	TriangleHasher hasher = {destination};
+
+	memset(tritable, -1, tritable_size * sizeof(unsigned int));
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int c0 = vertex_cells[indices[i + 0]];
+		unsigned int c1 = vertex_cells[indices[i + 1]];
+		unsigned int c2 = vertex_cells[indices[i + 2]];
+
+		if (c0 != c1 && c0 != c2 && c1 != c2)
+		{
+			unsigned int a = cell_remap[c0];
+			unsigned int b = cell_remap[c1];
+			unsigned int c = cell_remap[c2];
+
+			if (b < a && b < c)
+			{
+				unsigned int t = a;
+				a = b, b = c, c = t;
+			}
+			else if (c < a && c < b)
+			{
+				unsigned int t = c;
+				c = b, b = a, a = t;
+			}
+
+			destination[result * 3 + 0] = a;
+			destination[result * 3 + 1] = b;
+			destination[result * 3 + 2] = c;
+
+			unsigned int* entry = hashLookup2(tritable, tritable_size, hasher, unsigned(result), ~0u);
+
+			if (*entry == ~0u)
+				*entry = unsigned(result++);
+		}
+	}
+
+	return result * 3;
+}
+
+static float interpolate(float y, float x0, float y0, float x1, float y1, float x2, float y2)
+{
+	// three point interpolation from "revenge of interpolation search" paper
+	float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
+	float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
+	return x1 + num / den;
+}
+
+} // namespace meshopt
+
+#ifndef NDEBUG
+unsigned char* meshopt_simplifyDebugKind = 0;
+unsigned int* meshopt_simplifyDebugLoop = 0;
+unsigned int* meshopt_simplifyDebugLoopBack = 0;
+#endif
+
+size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_index_count <= index_count);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* result = destination;
+
+	// build adjacency information
+	EdgeAdjacency adjacency = {};
+	buildEdgeAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// build position remap that maps each vertex to the one with identical position
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, allocator);
+
+	// classify vertices; vertex kind determines collapse rules, see kCanCollapse
+	unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
+	unsigned int* loop = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count);
+	classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge);
+
+#if TRACE
+	size_t unique_positions = 0;
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_positions += remap[i] == i;
+
+	printf("position remap: %d vertices => %d positions\n", int(vertex_count), int(unique_positions));
+
+	size_t kinds[Kind_Count] = {};
+	for (size_t i = 0; i < vertex_count; ++i)
+		kinds[vertex_kind[i]] += remap[i] == i;
+
+	printf("kinds: manifold %d, border %d, seam %d, complex %d, locked %d\n",
+	       int(kinds[Kind_Manifold]), int(kinds[Kind_Border]), int(kinds[Kind_Seam]), int(kinds[Kind_Complex]), int(kinds[Kind_Locked]));
+#endif
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
+	memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
+
+	fillFaceQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap);
+	fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
+
+	if (result != indices)
+		memcpy(result, indices, index_count * sizeof(unsigned int));
+
+#if TRACE
+	size_t pass_count = 0;
+	float worst_error = 0;
+#endif
+
+	Collapse* edge_collapses = allocator.allocate<Collapse>(index_count);
+	unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count);
+	unsigned int* collapse_remap = allocator.allocate<unsigned int>(vertex_count);
+	unsigned char* collapse_locked = allocator.allocate<unsigned char>(vertex_count);
+
+	size_t result_count = index_count;
+
+	// target_error input is linear; we need to adjust it to match quadricError units
+	float error_limit = target_error * target_error;
+
+	while (result_count > target_index_count)
+	{
+		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop);
+
+		// no edges can be collapsed any more due to topology restrictions
+		if (edge_collapse_count == 0)
+			break;
+
+		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, remap);
+
+#if TRACE > 1
+		dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind);
+#endif
+
+		sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
+
+		// most collapses remove 2 triangles; use this to establish a bound on the pass in terms of error limit
+		// note that edge_collapse_goal is an estimate; triangle_collapse_goal will be used to actually limit collapses
+		size_t triangle_collapse_goal = (result_count - target_index_count) / 3;
+		size_t edge_collapse_goal = triangle_collapse_goal / 2;
+
+		// we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked
+		// as they will share vertices with other successfull collapses, we need to increase the acceptable error by this factor
+		const float kPassErrorBound = 1.5f;
+
+		float error_goal = edge_collapse_goal < edge_collapse_count ? edge_collapses[collapse_order[edge_collapse_goal]].error * kPassErrorBound : FLT_MAX;
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			collapse_remap[i] = unsigned(i);
+
+		memset(collapse_locked, 0, vertex_count);
+
+		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, triangle_collapse_goal, error_goal, error_limit);
+
+		// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
+		if (collapses == 0)
+			break;
+
+		remapEdgeLoops(loop, vertex_count, collapse_remap);
+		remapEdgeLoops(loopback, vertex_count, collapse_remap);
+
+		size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
+		assert(new_count < result_count);
+
+#if TRACE
+		float pass_error = 0.f;
+		for (size_t i = 0; i < edge_collapse_count; ++i)
+		{
+			Collapse& c = edge_collapses[collapse_order[i]];
+
+			if (collapse_remap[c.v0] == c.v1)
+				pass_error = c.error;
+		}
+
+		pass_count++;
+		worst_error = (worst_error < pass_error) ? pass_error : worst_error;
+
+		printf("pass %d: triangles: %d -> %d, collapses: %d/%d (goal: %d), error: %e (limit %e goal %e)\n", int(pass_count), int(result_count / 3), int(new_count / 3), int(collapses), int(edge_collapse_count), int(edge_collapse_goal), pass_error, error_limit, error_goal);
+#endif
+
+		result_count = new_count;
+	}
+
+#if TRACE
+	printf("passes: %d, worst error: %e\n", int(pass_count), worst_error);
+#endif
+
+#if TRACE > 1
+	dumpLockedCollapses(result, result_count, vertex_kind);
+#endif
+
+#ifndef NDEBUG
+	if (meshopt_simplifyDebugKind)
+		memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
+
+	if (meshopt_simplifyDebugLoop)
+		memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+
+	if (meshopt_simplifyDebugLoopBack)
+		memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
+#endif
+
+	return result_count;
+}
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_index_count <= index_count);
+
+	// we expect to get ~2 triangles/vertex in the output
+	size_t target_cell_count = target_index_count / 6;
+
+	if (target_cell_count == 0)
+		return 0;
+
+	meshopt_Allocator allocator;
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	// find the optimal grid size using guided binary search
+#if TRACE
+	printf("source: %d vertices, %d triangles\n", int(vertex_count), int(index_count / 3));
+	printf("target: %d cells, %d triangles\n", int(target_cell_count), int(target_index_count / 3));
+#endif
+
+	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+	const int kInterpolationPasses = 5;
+
+	// invariant: # of triangles in min_grid <= target_count
+	int min_grid = 0;
+	int max_grid = 1025;
+	size_t min_triangles = 0;
+	size_t max_triangles = index_count / 3;
+
+	// instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
+	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass)
+	{
+		assert(min_triangles < target_index_count / 3);
+		assert(max_grid - min_grid > 1);
+
+		// we clamp the prediction of the grid size to make sure that the search converges
+		int grid_size = next_grid_size;
+		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;
+
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		size_t triangles = countTriangles(vertex_ids, indices, index_count);
+
+#if TRACE
+		printf("pass %d (%s): grid size %d, triangles %d, %s\n",
+		       pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
+		       grid_size, int(triangles),
+		       (triangles <= target_index_count / 3) ? "under" : "over");
+#endif
+
+		float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
+
+		if (triangles <= target_index_count / 3)
+		{
+			min_grid = grid_size;
+			min_triangles = triangles;
+		}
+		else
+		{
+			max_grid = grid_size;
+			max_triangles = triangles;
+		}
+
+		if (triangles == target_index_count / 3 || max_grid - min_grid <= 1)
+			break;
+
+		// we start by using interpolation search - it usually converges faster
+		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+	}
+
+	if (min_triangles == 0)
+		return 0;
+
+	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+	// build a quadric for each target cell
+	Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+	memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+	fillCellQuadrics(cell_quadrics, indices, index_count, vertex_positions, vertex_cells);
+
+	// for each target cell, find the vertex with the minimal error
+	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+	float* cell_errors = allocator.allocate<float>(cell_count);
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+	// collapse triangles!
+	// note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
+	size_t tritable_size = hashBuckets2(min_triangles);
+	unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);
+
+	size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
+	assert(write <= target_index_count);
+
+#if TRACE
+	printf("result: %d cells, %d triangles (%d unfiltered)\n", int(cell_count), int(write / 3), int(min_triangles));
+#endif
+
+	return write;
+}
+
+size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_vertex_count <= vertex_count);
+
+	size_t target_cell_count = target_vertex_count;
+
+	if (target_cell_count == 0)
+		return 0;
+
+	meshopt_Allocator allocator;
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	// find the optimal grid size using guided binary search
+#if TRACE
+	printf("source: %d vertices\n", int(vertex_count));
+	printf("target: %d cells\n", int(target_cell_count));
+#endif
+
+	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+	const int kInterpolationPasses = 5;
+
+	// invariant: # of vertices in min_grid <= target_count
+	int min_grid = 0;
+	int max_grid = 1025;
+	size_t min_vertices = 0;
+	size_t max_vertices = vertex_count;
+
+	// instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
+	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass)
+	{
+		assert(min_vertices < target_vertex_count);
+		assert(max_grid - min_grid > 1);
+
+		// we clamp the prediction of the grid size to make sure that the search converges
+		int grid_size = next_grid_size;
+		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;
+
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count);
+
+#if TRACE
+		printf("pass %d (%s): grid size %d, vertices %d, %s\n",
+		       pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
+		       grid_size, int(vertices),
+		       (vertices <= target_vertex_count) ? "under" : "over");
+#endif
+
+		float tip = interpolate(float(target_vertex_count), float(min_grid), float(min_vertices), float(grid_size), float(vertices), float(max_grid), float(max_vertices));
+
+		if (vertices <= target_vertex_count)
+		{
+			min_grid = grid_size;
+			min_vertices = vertices;
+		}
+		else
+		{
+			max_grid = grid_size;
+			max_vertices = vertices;
+		}
+
+		if (vertices == target_vertex_count || max_grid - min_grid <= 1)
+			break;
+
+		// we start by using interpolation search - it usually converges faster
+		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+	}
+
+	if (min_vertices == 0)
+		return 0;
+
+	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+	// build a quadric for each target cell
+	Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+	memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+	fillCellQuadrics(cell_quadrics, vertex_positions, vertex_count, vertex_cells);
+
+	// for each target cell, find the vertex with the minimal error
+	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+	float* cell_errors = allocator.allocate<float>(cell_count);
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+	// copy results to the output
+	assert(cell_count <= target_vertex_count);
+	memcpy(destination, cell_remap, sizeof(unsigned int) * cell_count);
+
+#if TRACE
+	printf("result: %d cells\n", int(cell_count));
+#endif
+
+	return cell_count;
+}
diff --git a/thirdparty/meshoptimizer/spatialorder.cpp b/thirdparty/meshoptimizer/spatialorder.cpp
new file mode 100644
index 0000000000..b09f80ac6f
--- /dev/null
+++ b/thirdparty/meshoptimizer/spatialorder.cpp
@@ -0,0 +1,194 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Fabian Giesen. Decoding Morton codes. 2009
+namespace meshopt
+{
+
+// "Insert" two 0 bits after each of the 10 low bits of x
+inline unsigned int part1By2(unsigned int x)
+{
+	x &= 0x000003ff;                  // x = ---- ---- ---- ---- ---- --98 7654 3210
+	x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
+	x = (x ^ (x << 8)) & 0x0300f00f;  // x = ---- --98 ---- ---- 7654 ---- ---- 3210
+	x = (x ^ (x << 4)) & 0x030c30c3;  // x = ---- --98 ---- 76-- --54 ---- 32-- --10
+	x = (x ^ (x << 2)) & 0x09249249;  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	return x;
+}
+
+static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+{
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		for (int j = 0; j < 3; ++j)
+		{
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
+		}
+	}
+
+	float extent = 0.f;
+
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	float scale = extent == 0 ? 0.f : 1.f / extent;
+
+	// generate Morton order based on the position inside a unit cube
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
+		int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
+		int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
+
+		result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+	}
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+{
+	memset(hist, 0, sizeof(hist));
+
+	// compute 3 10-bit histograms in parallel
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = data[i];
+
+		hist[(id >> 0) & 1023][0]++;
+		hist[(id >> 10) & 1023][1]++;
+		hist[(id >> 20) & 1023][2]++;
+	}
+
+	unsigned int sumx = 0, sumy = 0, sumz = 0;
+
+	// replace histogram data with prefix histogram sums in-place
+	for (int i = 0; i < 1024; ++i)
+	{
+		unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
+
+		hist[i][0] = sumx;
+		hist[i][1] = sumy;
+		hist[i][2] = sumz;
+
+		sumx += hx;
+		sumy += hy;
+		sumz += hz;
+	}
+
+	assert(sumx == count && sumy == count && sumz == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{
+	int bitoff = pass * 10;
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = (keys[source[i]] >> bitoff) & 1023;
+
+		destination[hist[id][pass]++] = source[i];
+	}
+}
+
+} // namespace meshopt
+
+void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+
+	unsigned int hist[1024][3];
+	computeHistogram(hist, keys, vertex_count);
+
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		destination[i] = unsigned(i);
+
+	// 3-pass radix sort computes the resulting order into scratch
+	radixPass(scratch, destination, keys, vertex_count, hist, 0);
+	radixPass(destination, scratch, keys, vertex_count, hist, 1);
+	radixPass(scratch, destination, keys, vertex_count, hist, 2);
+
+	// since our remap table is mapping old=>new, we need to reverse it
+	for (size_t i = 0; i < vertex_count; ++i)
+		destination[scratch[i]] = unsigned(i);
+}
+
+void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	(void)vertex_count;
+
+	size_t face_count = index_count / 3;
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	meshopt_Allocator allocator;
+
+	float* centroids = allocator.allocate<float>(face_count * 3);
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* va = vertex_positions + a * vertex_stride_float;
+		const float* vb = vertex_positions + b * vertex_stride_float;
+		const float* vc = vertex_positions + c * vertex_stride_float;
+
+		centroids[i * 3 + 0] = (va[0] + vb[0] + vc[0]) / 3.f;
+		centroids[i * 3 + 1] = (va[1] + vb[1] + vc[1]) / 3.f;
+		centroids[i * 3 + 2] = (va[2] + vb[2] + vc[2]) / 3.f;
+	}
+
+	unsigned int* remap = allocator.allocate<unsigned int>(face_count);
+
+	meshopt_spatialSortRemap(remap, centroids, face_count, sizeof(float) * 3);
+
+	// support in-order remap
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		unsigned int r = remap[i];
+
+		destination[r * 3 + 0] = a;
+		destination[r * 3 + 1] = b;
+		destination[r * 3 + 2] = c;
+	}
+}
diff --git a/thirdparty/meshoptimizer/stripifier.cpp b/thirdparty/meshoptimizer/stripifier.cpp
new file mode 100644
index 0000000000..8ce17ef3dc
--- /dev/null
+++ b/thirdparty/meshoptimizer/stripifier.cpp
@@ -0,0 +1,295 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <string.h>
+
+// This work is based on:
+// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996
+namespace meshopt
+{
+
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
+{
+	unsigned int index = 0;
+	unsigned int iv = ~0u;
+
+	for (size_t i = 0; i < buffer_size; ++i)
+	{
+		unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
+		unsigned int v = (va < vb && va < vc) ? va : (vb < vc) ? vb : vc;
+
+		if (v < iv)
+		{
+			index = unsigned(i);
+			iv = v;
+		}
+	}
+
+	return index;
+}
+
+static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1)
+{
+	for (size_t i = 0; i < buffer_size; ++i)
+	{
+		unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+		if (e0 == a && e1 == b)
+			return (int(i) << 2) | 2;
+		else if (e0 == b && e1 == c)
+			return (int(i) << 2) | 0;
+		else if (e0 == c && e1 == a)
+			return (int(i) << 2) | 1;
+	}
+
+	return -1;
+}
+
+} // namespace meshopt
+
+size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index)
+{
+	assert(destination != indices);
+	assert(index_count % 3 == 0);
+
+	using namespace meshopt;
+
+	meshopt_Allocator allocator;
+
+	const size_t buffer_capacity = 8;
+
+	unsigned int buffer[buffer_capacity][3] = {};
+	unsigned int buffer_size = 0;
+
+	size_t index_offset = 0;
+
+	unsigned int strip[2] = {};
+	unsigned int parity = 0;
+
+	size_t strip_size = 0;
+
+	// compute vertex valence; this is used to prioritize starting triangle for strips
+	unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
+	memset(valence, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		valence[index]++;
+	}
+
+	int next = -1;
+
+	while (buffer_size > 0 || index_offset < index_count)
+	{
+		assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3));
+
+		// fill triangle buffer
+		while (buffer_size < buffer_capacity && index_offset < index_count)
+		{
+			buffer[buffer_size][0] = indices[index_offset + 0];
+			buffer[buffer_size][1] = indices[index_offset + 1];
+			buffer[buffer_size][2] = indices[index_offset + 2];
+
+			buffer_size++;
+			index_offset += 3;
+		}
+
+		assert(buffer_size > 0);
+
+		if (next >= 0)
+		{
+			unsigned int i = next >> 2;
+			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+			unsigned int v = buffer[i][next & 3];
+
+			// ordered removal from the buffer
+			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+			buffer_size--;
+
+			// update vertex valences for strip start heuristic
+			valence[a]--;
+			valence[b]--;
+			valence[c]--;
+
+			// find next triangle (note that edge order flips on every iteration)
+			// in some cases we need to perform a swap to pick a different outgoing triangle edge
+			// for [a b c], the default strip edge is [b c], but we might want to use [a c]
+			int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]);
+			int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1;
+
+			if (cont < 0 && swap >= 0)
+			{
+				// [a b c] => [a b a c]
+				destination[strip_size++] = strip[0];
+				destination[strip_size++] = v;
+
+				// next strip has same winding
+				// ? a b => b a v
+				strip[1] = v;
+
+				next = swap;
+			}
+			else
+			{
+				// emit the next vertex in the strip
+				destination[strip_size++] = v;
+
+				// next strip has flipped winding
+				strip[0] = strip[1];
+				strip[1] = v;
+				parity ^= 1;
+
+				next = cont;
+			}
+		}
+		else
+		{
+			// if we didn't find anything, we need to find the next new triangle
+			// we use a heuristic to maximize the strip length
+			unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
+			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+			// ordered removal from the buffer
+			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+			buffer_size--;
+
+			// update vertex valences for strip start heuristic
+			valence[a]--;
+			valence[b]--;
+			valence[c]--;
+
+			// we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration
+			int ea = findStripNext(buffer, buffer_size, c, b);
+			int eb = findStripNext(buffer, buffer_size, a, c);
+			int ec = findStripNext(buffer, buffer_size, b, a);
+
+			// in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest
+			// triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear
+			// reasons - slightly improves the stripification efficiency
+			int mine = INT_MAX;
+			mine = (ea >= 0 && mine > ea) ? ea : mine;
+			mine = (eb >= 0 && mine > eb) ? eb : mine;
+			mine = (ec >= 0 && mine > ec) ? ec : mine;
+
+			if (ea == mine)
+			{
+				// keep abc
+				next = ea;
+			}
+			else if (eb == mine)
+			{
+				// abc -> bca
+				unsigned int t = a;
+				a = b, b = c, c = t;
+
+				next = eb;
+			}
+			else if (ec == mine)
+			{
+				// abc -> cab
+				unsigned int t = c;
+				c = b, b = a, a = t;
+
+				next = ec;
+			}
+
+			if (restart_index)
+			{
+				if (strip_size)
+					destination[strip_size++] = restart_index;
+
+				destination[strip_size++] = a;
+				destination[strip_size++] = b;
+				destination[strip_size++] = c;
+
+				// new strip always starts with the same edge winding
+				strip[0] = b;
+				strip[1] = c;
+				parity = 1;
+			}
+			else
+			{
+				if (strip_size)
+				{
+					// connect last strip using degenerate triangles
+					destination[strip_size++] = strip[1];
+					destination[strip_size++] = a;
+				}
+
+				// note that we may need to flip the emitted triangle based on parity
+				// we always end up with outgoing edge "cb" in the end
+				unsigned int e0 = parity ? c : b;
+				unsigned int e1 = parity ? b : c;
+
+				destination[strip_size++] = a;
+				destination[strip_size++] = e0;
+				destination[strip_size++] = e1;
+
+				strip[0] = e0;
+				strip[1] = e1;
+				parity ^= 1;
+			}
+		}
+	}
+
+	return strip_size;
+}
+
+size_t meshopt_stripifyBound(size_t index_count)
+{
+	assert(index_count % 3 == 0);
+
+	// worst case without restarts is 2 degenerate indices and 3 indices per triangle
+	// worst case with restarts is 1 restart index and 3 indices per triangle
+	return (index_count / 3) * 5;
+}
+
+size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index)
+{
+	assert(destination != indices);
+
+	size_t offset = 0;
+	size_t start = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		if (restart_index && indices[i] == restart_index)
+		{
+			start = i + 1;
+		}
+		else if (i - start >= 2)
+		{
+			unsigned int a = indices[i - 2], b = indices[i - 1], c = indices[i];
+
+			// flip winding for odd triangles
+			if ((i - start) & 1)
+			{
+				unsigned int t = a;
+				a = b, b = t;
+			}
+
+			// although we use restart indices, strip swaps still produce degenerate triangles, so skip them
+			if (a != b && a != c && b != c)
+			{
+				destination[offset + 0] = a;
+				destination[offset + 1] = b;
+				destination[offset + 2] = c;
+				offset += 3;
+			}
+		}
+	}
+
+	return offset;
+}
+
+size_t meshopt_unstripifyBound(size_t index_count)
+{
+	assert(index_count == 0 || index_count >= 3);
+
+	return (index_count == 0) ? 0 : (index_count - 2) * 3;
+}
diff --git a/thirdparty/meshoptimizer/vcacheanalyzer.cpp b/thirdparty/meshoptimizer/vcacheanalyzer.cpp
new file mode 100644
index 0000000000..3682743820
--- /dev/null
+++ b/thirdparty/meshoptimizer/vcacheanalyzer.cpp
@@ -0,0 +1,73 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
+{
+	assert(index_count % 3 == 0);
+	assert(cache_size >= 3);
+	assert(warp_size == 0 || warp_size >= 3);
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexCacheStatistics result = {};
+
+	unsigned int warp_offset = 0;
+	unsigned int primgroup_offset = 0;
+
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = cache_size + 1;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		bool ac = (timestamp - cache_timestamps[a]) > cache_size;
+		bool bc = (timestamp - cache_timestamps[b]) > cache_size;
+		bool cc = (timestamp - cache_timestamps[c]) > cache_size;
+
+		// flush cache if triangle doesn't fit into warp or into the primitive buffer
+		if ((primgroup_size && primgroup_offset == primgroup_size) || (warp_size && warp_offset + ac + bc + cc > warp_size))
+		{
+			result.warps_executed += warp_offset > 0;
+
+			warp_offset = 0;
+			primgroup_offset = 0;
+
+			// reset cache
+			timestamp += cache_size + 1;
+		}
+
+		// update cache and add vertices to warp
+		for (int j = 0; j < 3; ++j)
+		{
+			unsigned int index = indices[i + j];
+
+			if (timestamp - cache_timestamps[index] > cache_size)
+			{
+				cache_timestamps[index] = timestamp++;
+				result.vertices_transformed++;
+				warp_offset++;
+			}
+		}
+
+		primgroup_offset++;
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += cache_timestamps[i] > 0;
+
+	result.warps_executed += warp_offset > 0;
+
+	result.acmr = index_count == 0 ? 0 : float(result.vertices_transformed) / float(index_count / 3);
+	result.atvr = unique_vertex_count == 0 ? 0 : float(result.vertices_transformed) / float(unique_vertex_count);
+
+	return result;
+}
diff --git a/thirdparty/meshoptimizer/vcacheoptimizer.cpp b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
new file mode 100644
index 0000000000..fb8ade4b77
--- /dev/null
+++ b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
@@ -0,0 +1,473 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// This work is based on:
+// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+const size_t kCacheSizeMax = 16;
+const size_t kValenceMax = 8;
+
+struct VertexScoreTable
+{
+	float cache[1 + kCacheSizeMax];
+	float live[1 + kValenceMax];
+};
+
+// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
+static const VertexScoreTable kVertexScoreTable = {
+    {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
+    {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
+};
+
+// Tuned to minimize the encoded index buffer size
+static const VertexScoreTable kVertexScoreTableStrip = {
+    {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
+    {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
+};
+
+struct TriangleAdjacency
+{
+	unsigned int* counts;
+	unsigned int* offsets;
+	unsigned int* data;
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i];
+	}
+}
+
+static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
+{
+	// check dead-end stack
+	while (dead_end_top)
+	{
+		unsigned int vertex = dead_end[--dead_end_top];
+
+		if (live_triangles[vertex] > 0)
+			return vertex;
+	}
+
+	// input order
+	while (input_cursor < vertex_count)
+	{
+		if (live_triangles[input_cursor] > 0)
+			return input_cursor;
+
+		++input_cursor;
+	}
+
+	return ~0u;
+}
+
+static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+{
+	unsigned int best_candidate = ~0u;
+	int best_priority = -1;
+
+	for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
+	{
+		unsigned int vertex = *next_candidate;
+
+		// otherwise we don't need to process it
+		if (live_triangles[vertex] > 0)
+		{
+			int priority = 0;
+
+			// will it be in cache after fanning?
+			if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
+			{
+				priority = timestamp - cache_timestamps[vertex]; // position in cache
+			}
+
+			if (priority > best_priority)
+			{
+				best_candidate = vertex;
+				best_priority = priority;
+			}
+		}
+	}
+
+	return best_candidate;
+}
+
+static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
+{
+	assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
+
+	unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
+
+	return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
+}
+
+static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
+{
+	// input order
+	while (input_cursor < face_count)
+	{
+		if (!emitted_flags[input_cursor])
+			return input_cursor;
+
+		++input_cursor;
+	}
+
+	return ~0u;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	unsigned int cache_size = 16;
+	assert(cache_size <= kCacheSizeMax);
+
+	size_t face_count = index_count / 3;
+
+	// build adjacency information
+	TriangleAdjacency adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// live triangle counts
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	// emitted flags
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	// compute initial vertex scores
+	float* vertex_scores = allocator.allocate<float>(vertex_count);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);
+
+	// compute triangle scores
+	float* triangle_scores = allocator.allocate<float>(face_count);
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0];
+		unsigned int b = indices[i * 3 + 1];
+		unsigned int c = indices[i * 3 + 2];
+
+		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
+	}
+
+	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
+	unsigned int* cache = cache_holder;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+	size_t cache_count = 0;
+
+	unsigned int current_triangle = 0;
+	unsigned int input_cursor = 1;
+
+	unsigned int output_triangle = 0;
+
+	while (current_triangle != ~0u)
+	{
+		assert(output_triangle < face_count);
+
+		unsigned int a = indices[current_triangle * 3 + 0];
+		unsigned int b = indices[current_triangle * 3 + 1];
+		unsigned int c = indices[current_triangle * 3 + 2];
+
+		// output indices
+		destination[output_triangle * 3 + 0] = a;
+		destination[output_triangle * 3 + 1] = b;
+		destination[output_triangle * 3 + 2] = c;
+		output_triangle++;
+
+		// update emitted flags
+		emitted_flags[current_triangle] = true;
+		triangle_scores[current_triangle] = 0;
+
+		// new triangle
+		size_t cache_write = 0;
+		cache_new[cache_write++] = a;
+		cache_new[cache_write++] = b;
+		cache_new[cache_write++] = c;
+
+		// old triangles
+		for (size_t i = 0; i < cache_count; ++i)
+		{
+			unsigned int index = cache[i];
+
+			if (index != a && index != b && index != c)
+			{
+				cache_new[cache_write++] = index;
+			}
+		}
+
+		unsigned int* cache_temp = cache;
+		cache = cache_new, cache_new = cache_temp;
+		cache_count = cache_write > cache_size ? cache_size : cache_write;
+
+		// update live triangle counts
+		live_triangles[a]--;
+		live_triangles[b]--;
+		live_triangles[c]--;
+
+		// remove emitted triangle from adjacency data
+		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		for (size_t k = 0; k < 3; ++k)
+		{
+			unsigned int index = indices[current_triangle * 3 + k];
+
+			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbours_size = adjacency.counts[index];
+
+			for (size_t i = 0; i < neighbours_size; ++i)
+			{
+				unsigned int tri = neighbours[i];
+
+				if (tri == current_triangle)
+				{
+					neighbours[i] = neighbours[neighbours_size - 1];
+					adjacency.counts[index]--;
+					break;
+				}
+			}
+		}
+
+		unsigned int best_triangle = ~0u;
+		float best_score = 0;
+
+		// update cache positions, vertex scores and triangle scores, and find next best triangle
+		for (size_t i = 0; i < cache_write; ++i)
+		{
+			unsigned int index = cache[i];
+
+			int cache_position = i >= cache_size ? -1 : int(i);
+
+			// update vertex score
+			float score = vertexScore(table, cache_position, live_triangles[index]);
+			float score_diff = score - vertex_scores[index];
+
+			vertex_scores[index] = score;
+
+			// update scores of vertex triangles
+			const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
+			const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
+
+			for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+			{
+				unsigned int tri = *it;
+				assert(!emitted_flags[tri]);
+
+				float tri_score = triangle_scores[tri] + score_diff;
+				assert(tri_score > 0);
+
+				if (best_score < tri_score)
+				{
+					best_triangle = tri;
+					best_score = tri_score;
+				}
+
+				triangle_scores[tri] = tri_score;
+			}
+		}
+
+		// step through input triangles in order if we hit a dead-end
+		current_triangle = best_triangle;
+
+		if (current_triangle == ~0u)
+		{
+			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
+		}
+	}
+
+	assert(input_cursor == face_count);
+	assert(output_triangle == face_count);
+}
+
+void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
+}
+
+void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
+}
+
+void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(cache_size >= 3);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	size_t face_count = index_count / 3;
+
+	// build adjacency information
+	TriangleAdjacency adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// live triangle counts
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	// cache time stamps
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	// dead-end stack
+	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
+	unsigned int dead_end_top = 0;
+
+	// emitted flags
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	unsigned int current_vertex = 0;
+
+	unsigned int timestamp = cache_size + 1;
+	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end
+
+	unsigned int output_triangle = 0;
+
+	while (current_vertex != ~0u)
+	{
+		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
+
+		// emit all vertex neighbours
+		const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+		const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+
+		for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+		{
+			unsigned int triangle = *it;
+
+			if (!emitted_flags[triangle])
+			{
+				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+				// output indices
+				destination[output_triangle * 3 + 0] = a;
+				destination[output_triangle * 3 + 1] = b;
+				destination[output_triangle * 3 + 2] = c;
+				output_triangle++;
+
+				// update dead-end stack
+				dead_end[dead_end_top + 0] = a;
+				dead_end[dead_end_top + 1] = b;
+				dead_end[dead_end_top + 2] = c;
+				dead_end_top += 3;
+
+				// update live triangle counts
+				live_triangles[a]--;
+				live_triangles[b]--;
+				live_triangles[c]--;
+
+				// update cache info
+				// if vertex is not in cache, put it in cache
+				if (timestamp - cache_timestamps[a] > cache_size)
+					cache_timestamps[a] = timestamp++;
+
+				if (timestamp - cache_timestamps[b] > cache_size)
+					cache_timestamps[b] = timestamp++;
+
+				if (timestamp - cache_timestamps[c] > cache_size)
+					cache_timestamps[c] = timestamp++;
+
+				// update emitted flags
+				emitted_flags[triangle] = true;
+			}
+		}
+
+		// next candidates are the ones we pushed to dead-end stack just now
+		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
+
+		// get next vertex
+		current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+
+		if (current_vertex == ~0u)
+		{
+			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
+		}
+	}
+
+	assert(output_triangle == face_count);
+}
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
new file mode 100644
index 0000000000..784c9a13db
--- /dev/null
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -0,0 +1,1265 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
+#if defined(__AVX__) || defined(__SSSE3__)
+#define SIMD_SSE
+#endif
+
+// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
+#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
+#undef SIMD_SSE
+#define SIMD_AVX
+#endif
+
+// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#endif
+
+// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#define SIMD_TARGET __attribute__((target("ssse3")))
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+#endif
+
+#ifndef SIMD_TARGET
+#define SIMD_TARGET
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <tmmintrin.h>
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+#ifdef _MSC_VER
+#include <intrin.h> // __cpuid
+#else
+#include <cpuid.h> // __cpuid
+#endif
+#endif
+
+#ifdef SIMD_AVX
+#include <immintrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
+#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
+#endif
+
+namespace meshopt
+{
+
+const unsigned char kVertexHeader = 0xa0;
+
+static int gEncodeVertexVersion = 0;
+
+const size_t kVertexBlockSizeBytes = 8192;
+const size_t kVertexBlockMaxSize = 256;
+const size_t kByteGroupSize = 16;
+const size_t kByteGroupDecodeLimit = 24;
+const size_t kTailMaxSize = 32;
+
+static size_t getVertexBlockSize(size_t vertex_size)
+{
+	// make sure the entire block fits into the scratch buffer
+	size_t result = kVertexBlockSizeBytes / vertex_size;
+
+	// align to byte group size; we encode each byte as a byte group
+	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
+	result &= ~(kByteGroupSize - 1);
+
+	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
+}
+
+inline unsigned char zigzag8(unsigned char v)
+{
+	return ((signed char)(v) >> 7) ^ (v << 1);
+}
+
+inline unsigned char unzigzag8(unsigned char v)
+{
+	return -(v & 1) ^ (v >> 1);
+}
+
+#if TRACE
+struct Stats
+{
+	size_t size;
+	size_t header;
+	size_t bitg[4];
+	size_t bitb[4];
+};
+
+Stats* bytestats;
+Stats vertexstats[256];
+#endif
+
+static bool encodeBytesGroupZero(const unsigned char* buffer)
+{
+	for (size_t i = 0; i < kByteGroupSize; ++i)
+		if (buffer[i])
+			return false;
+
+	return true;
+}
+
+static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
+{
+	assert(bits >= 1 && bits <= 8);
+
+	if (bits == 1)
+		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
+
+	if (bits == 8)
+		return kByteGroupSize;
+
+	size_t result = kByteGroupSize * bits / 8;
+
+	unsigned char sentinel = (1 << bits) - 1;
+
+	for (size_t i = 0; i < kByteGroupSize; ++i)
+		result += buffer[i] >= sentinel;
+
+	return result;
+}
+
+static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
+{
+	assert(bits >= 1 && bits <= 8);
+
+	if (bits == 1)
+		return data;
+
+	if (bits == 8)
+	{
+		memcpy(data, buffer, kByteGroupSize);
+		return data + kByteGroupSize;
+	}
+
+	size_t byte_size = 8 / bits;
+	assert(kByteGroupSize % byte_size == 0);
+
+	// fixed portion: bits bits for each value
+	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
+	unsigned char sentinel = (1 << bits) - 1;
+
+	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
+	{
+		unsigned char byte = 0;
+
+		for (size_t k = 0; k < byte_size; ++k)
+		{
+			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
+
+			byte <<= bits;
+			byte |= enc;
+		}
+
+		*data++ = byte;
+	}
+
+	for (size_t i = 0; i < kByteGroupSize; ++i)
+	{
+		if (buffer[i] >= sentinel)
+		{
+			*data++ = buffer[i];
+		}
+	}
+
+	return data;
+}
+
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+{
+	assert(buffer_size % kByteGroupSize == 0);
+
+	unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0;
+
+	data += header_size;
+
+	memset(header, 0, header_size);
+
+	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return 0;
+
+		int best_bits = 8;
+		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+
+		for (int bits = 1; bits < 8; bits *= 2)
+		{
+			size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+
+			if (size < best_size)
+			{
+				best_bits = bits;
+				best_size = size;
+			}
+		}
+
+		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
+		assert((1 << bitslog2) == best_bits);
+
+		size_t header_offset = i / kByteGroupSize;
+
+		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
+
+		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
+
+		assert(data + best_size == next);
+		data = next;
+
+#if TRACE > 1
+		bytestats->bitg[bitslog2]++;
+		bytestats->bitb[bitslog2] += best_size;
+#endif
+	}
+
+#if TRACE > 1
+	bytestats->header += header_size;
+#endif
+
+	return data;
+}
+
+static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize];
+	assert(sizeof(buffer) % kByteGroupSize == 0);
+
+	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
+	memset(buffer, 0, sizeof(buffer));
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		size_t vertex_offset = k;
+
+		unsigned char p = last_vertex[k];
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);
+
+			p = vertex_data[vertex_offset];
+
+			vertex_offset += vertex_size;
+		}
+
+#if TRACE
+		const unsigned char* olddata = data;
+		bytestats = &vertexstats[k];
+#endif
+
+		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
+		if (!data)
+			return 0;
+
+#if TRACE
+		bytestats = 0;
+		vertexstats[k].size += data - olddata;
+#endif
+	}
+
+	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
+
+	return data;
+}
+
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
+static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+#define READ() byte = *data++
+#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
+
+	unsigned char byte, enc, encv;
+	const unsigned char* data_var;
+
+	switch (bitslog2)
+	{
+	case 0:
+		memset(buffer, 0, kByteGroupSize);
+		return data;
+	case 1:
+		data_var = data + 4;
+
+		// 4 groups with 4 2-bit values in each byte
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+
+		return data_var;
+	case 2:
+		data_var = data + 8;
+
+		// 8 groups with 2 4-bit values in each byte
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+
+		return data_var;
+	case 3:
+		memcpy(buffer, data, kByteGroupSize);
+		return data + kByteGroupSize;
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+
+#undef READ
+#undef NEXT
+}
+
+static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{
+	assert(buffer_size % kByteGroupSize == 0);
+
+	const unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0;
+
+	data += header_size;
+
+	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return 0;
+
+		size_t header_offset = i / kByteGroupSize;
+
+		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
+
+		data = decodeBytesGroup(data, buffer + i, bitslog2);
+	}
+
+	return data;
+}
+
+static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize];
+	unsigned char transposed[kVertexBlockSizeBytes];
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
+		if (!data)
+			return 0;
+
+		size_t vertex_offset = k;
+
+		unsigned char p = last_vertex[k];
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			unsigned char v = unzigzag8(buffer[i]) + p;
+
+			transposed[vertex_offset] = v;
+			p = v;
+
+			vertex_offset += vertex_size;
+		}
+	}
+
+	memcpy(vertex_data, transposed, vertex_count * vertex_size);
+
+	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
+
+	return data;
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+static unsigned char kDecodeBytesGroupShuffle[256][8];
+static unsigned char kDecodeBytesGroupCount[256];
+
+#ifdef __wasm__
+__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
+#endif
+static bool
+decodeBytesGroupBuildTables()
+{
+	for (int mask = 0; mask < 256; ++mask)
+	{
+		unsigned char shuffle[8];
+		unsigned char count = 0;
+
+		for (int i = 0; i < 8; ++i)
+		{
+			int maski = (mask >> i) & 1;
+			shuffle[i] = maski ? count : 0x80;
+			count += (unsigned char)(maski);
+		}
+
+		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
+		kDecodeBytesGroupCount[mask] = count;
+	}
+
+	return true;
+}
+
+static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
+#endif
+
+#ifdef SIMD_SSE
+SIMD_TARGET
+static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{
+	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
+	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
+	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);
+
+	__m128i sm1r = _mm_add_epi8(sm1, sm1off);
+
+	return _mm_unpacklo_epi64(sm0, sm1r);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128();
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	{
+#ifdef __GNUC__
+		typedef int __attribute__((aligned(1))) unaligned_int;
+#else
+		typedef int unaligned_int;
+#endif
+
+		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
+
+		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
+		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
+		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));
+
+		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
+		int mask16 = _mm_movemask_epi8(mask);
+		unsigned char mask0 = (unsigned char)(mask16 & 255);
+		unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+
+		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 2:
+	{
+		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
+
+		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
+		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));
+
+		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
+		int mask16 = _mm_movemask_epi8(mask);
+		unsigned char mask0 = (unsigned char)(mask16 & 255);
+		unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+
+		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_AVX
+static const __m128i decodeBytesGroupConfig[] = {
+    _mm_set1_epi8(3),
+    _mm_set1_epi8(15),
+    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
+    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
+};
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128();
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	case 2:
+	{
+		const unsigned char* skip = data + (bitslog2 << 2);
+
+		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
+
+		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
+		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
+
+		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
+		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
+		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
+
+		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return skip + _mm_popcnt_u32(mask16);
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_NEON
+static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
+{
+	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
+	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
+
+	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
+	uint8x8_t r1 = vtbl1_u8(rest1, sm1);
+
+	return vcombine_u8(r0, r1);
+}
+
+static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
+{
+	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
+
+	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
+	uint8x16_t masked = vandq_u8(mask, byte_mask);
+
+#ifdef __aarch64__
+	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
+	mask0 = vaddv_u8(vget_low_u8(masked));
+	mask1 = vaddv_u8(vget_high_u8(masked));
+#else
+	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
+	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
+	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
+	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+
+	mask0 = vget_lane_u8(sum3, 0);
+	mask1 = vget_lane_u8(sum3, 1);
+#endif
+}
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		uint8x16_t result = vdupq_n_u8(0);
+
+		vst1q_u8(buffer, result);
+
+		return data;
+	}
+
+	case 1:
+	{
+		uint8x8_t sel2 = vld1_u8(data);
+		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
+		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
+		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));
+
+		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
+		unsigned char mask0, mask1;
+		neonMoveMask(mask, mask0, mask1);
+
+		uint8x8_t rest0 = vld1_u8(data + 4);
+		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);
+
+		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
+
+		vst1q_u8(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 2:
+	{
+		uint8x8_t sel4 = vld1_u8(data);
+		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
+		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
+
+		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
+		unsigned char mask0, mask1;
+		neonMoveMask(mask, mask0, mask1);
+
+		uint8x8_t rest0 = vld1_u8(data + 8);
+		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);
+
+		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
+
+		vst1q_u8(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 3:
+	{
+		uint8x16_t result = vld1q_u8(data);
+
+		vst1q_u8(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{
+	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
+	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
+
+	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
+	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
+
+	return wasmx_unpacklo_v64x2(sm0, sm1r);
+}
+
+SIMD_TARGET
+static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+{
+	v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3);
+
+	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
+	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull;
+
+	// TODO: This can use v8x16_bitmask in the future
+	uint64_t mask_2 = mask_1a | mask_1b;
+	uint64_t mask_4 = mask_2 | (mask_2 >> 16);
+	uint64_t mask_8 = mask_4 | (mask_4 >> 8);
+
+	mask0 = uint8_t(mask_8);
+	mask1 = uint8_t(mask_8 >> 32);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	unsigned char byte, enc, encv;
+	const unsigned char* data_var;
+
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		v128_t result = wasm_i8x16_splat(0);
+
+		wasm_v128_store(buffer, result);
+
+		return data;
+	}
+
+	case 1:
+	{
+		v128_t sel2 = wasm_v128_load(data);
+		v128_t rest = wasm_v128_load(data + 4);
+
+		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
+		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
+		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 2:
+	{
+		v128_t sel4 = wasm_v128_load(data);
+		v128_t rest = wasm_v128_load(data + 8);
+
+		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
+		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 3:
+	{
+		v128_t result = wasm_v128_load(data);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+SIMD_TARGET
+static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+{
+	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
+	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
+	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
+	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
+
+	x0 = _mm_unpacklo_epi16(t0, t2);
+	x1 = _mm_unpackhi_epi16(t0, t2);
+	x2 = _mm_unpacklo_epi16(t1, t3);
+	x3 = _mm_unpackhi_epi16(t1, t3);
+}
+
+SIMD_TARGET
+static __m128i unzigzag8(__m128i v)
+{
+	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
+	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
+
+	return _mm_xor_si128(xl, xr);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+{
+	uint8x16x2_t t01 = vzipq_u8(x0, x1);
+	uint8x16x2_t t23 = vzipq_u8(x2, x3);
+
+	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
+	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
+
+	x0 = vreinterpretq_u8_u16(x01.val[0]);
+	x1 = vreinterpretq_u8_u16(x01.val[1]);
+	x2 = vreinterpretq_u8_u16(x23.val[0]);
+	x3 = vreinterpretq_u8_u16(x23.val[1]);
+}
+
+static uint8x16_t unzigzag8(uint8x16_t v)
+{
+	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
+	uint8x16_t xr = vshrq_n_u8(v, 1);
+
+	return veorq_u8(xl, xr);
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+{
+	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
+	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
+	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
+	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
+
+	x0 = wasmx_unpacklo_v16x8(t0, t2);
+	x1 = wasmx_unpackhi_v16x8(t0, t2);
+	x2 = wasmx_unpacklo_v16x8(t1, t3);
+	x3 = wasmx_unpackhi_v16x8(t1, t3);
+}
+
+SIMD_TARGET
+static v128_t unzigzag8(v128_t v)
+{
+	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
+	v128_t xr = wasm_u8x16_shr(v, 1);
+
+	return wasm_v128_xor(xl, xr);
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+SIMD_TARGET
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{
+	assert(buffer_size % kByteGroupSize == 0);
+	assert(kByteGroupSize == 16);
+
+	const unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0;
+
+	data += header_size;
+
+	size_t i = 0;
+
+	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
+	{
+		size_t header_offset = i / kByteGroupSize;
+		unsigned char header_byte = header[header_offset / 4];
+
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+	}
+
+	// slow-path: process remaining groups
+	for (; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return 0;
+
+		size_t header_offset = i / kByteGroupSize;
+
+		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
+
+		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+	}
+
+	return data;
+}
+
+SIMD_TARGET
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize * 4];
+	unsigned char transposed[kVertexBlockSizeBytes];
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
+	for (size_t k = 0; k < vertex_size; k += 4)
+	{
+		for (size_t j = 0; j < 4; ++j)
+		{
+			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
+			if (!data)
+				return 0;
+		}
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
+#endif
+
+		PREP();
+
+		unsigned char* savep = transposed + k;
+
+		for (size_t j = 0; j < vertex_count_aligned; j += 16)
+		{
+			LOAD(0);
+			LOAD(1);
+			LOAD(2);
+			LOAD(3);
+
+			r0 = unzigzag8(r0);
+			r1 = unzigzag8(r1);
+			r2 = unzigzag8(r2);
+			r3 = unzigzag8(r3);
+
+			transpose8(r0, r1, r2, r3);
+
+			TEMP t0, t1, t2, t3;
+
+			GRP4(0);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(1);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(2);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(3);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+		}
+	}
+
+	memcpy(vertex_data, transposed, vertex_count * vertex_size);
+
+	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
+
+	return data;
+}
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+static unsigned int getCpuFeatures()
+{
+	int cpuinfo[4] = {};
+#ifdef _MSC_VER
+	__cpuid(cpuinfo, 1);
+#else
+	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+#endif
+	return cpuinfo[2];
+}
+
+unsigned int cpuid = getCpuFeatures();
+#endif
+
+} // namespace meshopt
+
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+#if TRACE
+	memset(vertexstats, 0, sizeof(vertexstats));
+#endif
+
+	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
+
+	unsigned char* data = buffer;
+	unsigned char* data_end = buffer + buffer_size;
+
+	if (size_t(data_end - data) < 1 + vertex_size)
+		return 0;
+
+	int version = gEncodeVertexVersion;
+
+	*data++ = (unsigned char)(kVertexHeader | version);
+
+	unsigned char first_vertex[256] = {};
+	if (vertex_count > 0)
+		memcpy(first_vertex, vertex_data, vertex_size);
+
+	unsigned char last_vertex[256] = {};
+	memcpy(last_vertex, first_vertex, vertex_size);
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+	size_t vertex_offset = 0;
+
+	while (vertex_offset < vertex_count)
+	{
+		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
+
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		if (!data)
+			return 0;
+
+		vertex_offset += block_size;
+	}
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+	if (size_t(data_end - data) < tail_size)
+		return 0;
+
+	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
+	if (vertex_size < kTailMaxSize)
+	{
+		memset(data, 0, kTailMaxSize - vertex_size);
+		data += kTailMaxSize - vertex_size;
+	}
+
+	memcpy(data, first_vertex, vertex_size);
+	data += vertex_size;
+
+	assert(data >= buffer + tail_size);
+	assert(data <= buffer + buffer_size);
+
+#if TRACE
+	size_t total_size = data - buffer;
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		const Stats& vsk = vertexstats[k];
+
+		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
+
+#if TRACE > 1
+		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
+		       int(vsk.header),
+		       int(vsk.bitg[0]), int(vsk.bitb[0]),
+		       int(vsk.bitg[1]), int(vsk.bitb[1]),
+		       int(vsk.bitg[2]), int(vsk.bitb[2]),
+		       int(vsk.bitg[3]), int(vsk.bitb[3]));
+#endif
+
+		printf("\n");
+	}
+#endif
+
+	return data - buffer;
+}
+
+size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
+
+	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
+	size_t vertex_block_data_size = vertex_block_size;
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+}
+
+void meshopt_encodeVertexVersion(int version)
+{
+	assert(unsigned(version) <= 0);
+
+	meshopt::gEncodeVertexVersion = version;
+}
+
+int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
+#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decode = decodeVertexBlockSimd;
+#else
+	decode = decodeVertexBlock;
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	assert(gDecodeBytesGroupInitialized);
+	(void)gDecodeBytesGroupInitialized;
+#endif
+
+	unsigned char* vertex_data = static_cast<unsigned char*>(destination);
+
+	const unsigned char* data = buffer;
+	const unsigned char* data_end = buffer + buffer_size;
+
+	if (size_t(data_end - data) < 1 + vertex_size)
+		return -2;
+
+	unsigned char data_header = *data++;
+
+	if ((data_header & 0xf0) != kVertexHeader)
+		return -1;
+
+	int version = data_header & 0x0f;
+	if (version > 0)
+		return -1;
+
+	unsigned char last_vertex[256];
+	memcpy(last_vertex, data_end - vertex_size, vertex_size);
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+	size_t vertex_offset = 0;
+
+	while (vertex_offset < vertex_count)
+	{
+		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
+
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		if (!data)
+			return -2;
+
+		vertex_offset += block_size;
+	}
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+	if (size_t(data_end - data) != tail_size)
+		return -3;
+
+	return 0;
+}
+
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
+#undef SIMD_WASM
+#undef SIMD_FALLBACK
+#undef SIMD_TARGET
diff --git a/thirdparty/meshoptimizer/vertexfilter.cpp b/thirdparty/meshoptimizer/vertexfilter.cpp
new file mode 100644
index 0000000000..e7ad2c9d39
--- /dev/null
+++ b/thirdparty/meshoptimizer/vertexfilter.cpp
@@ -0,0 +1,825 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <math.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
+#if defined(__SSE2__)
+#define SIMD_SSE
+#endif
+
+// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
+#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <emmintrin.h>
+#include <stdint.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
+#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
+#endif
+
+namespace meshopt
+{
+
+#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
+template <typename T>
+static void decodeFilterOct(T* data, size_t count)
+{
+	const float max = float((1 << (sizeof(T) * 8 - 1)) - 1);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float x = float(data[i * 4 + 0]);
+		float y = float(data[i * 4 + 1]);
+		float z = float(data[i * 4 + 2]) - fabsf(x) - fabsf(y);
+
+		// fixup octahedral coordinates for z<0
+		float t = (z >= 0.f) ? 0.f : z;
+
+		x += (x >= 0.f) ? t : -t;
+		y += (y >= 0.f) ? t : -t;
+
+		// compute normal length & scale
+		float l = sqrtf(x * x + y * y + z * z);
+		float s = max / l;
+
+		// rounded signed float->int
+		int xf = int(x * s + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * s + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * s + (z >= 0.f ? 0.5f : -0.5f));
+
+		data[i * 4 + 0] = T(xf);
+		data[i * 4 + 1] = T(yf);
+		data[i * 4 + 2] = T(zf);
+	}
+}
+
+static void decodeFilterQuat(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// recover scale from the high byte of the component
+		int sf = data[i * 4 + 3] | 3;
+		float ss = scale / float(sf);
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float x = float(data[i * 4 + 0]) * ss;
+		float y = float(data[i * 4 + 1]) * ss;
+		float z = float(data[i * 4 + 2]) * ss;
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float ww = 1.f - x * x - y * y - z * z;
+		float w = sqrtf(ww >= 0.f ? ww : 0.f);
+
+		// rounded signed float->int
+		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
+		int wf = int(w * 32767.f + 0.5f);
+
+		int qc = data[i * 4 + 3] & 3;
+
+		// output order is dictated by input index
+		data[i * 4 + ((qc + 1) & 3)] = short(xf);
+		data[i * 4 + ((qc + 2) & 3)] = short(yf);
+		data[i * 4 + ((qc + 3) & 3)] = short(zf);
+		data[i * 4 + ((qc + 0) & 3)] = short(wf);
+	}
+}
+
+static void decodeFilterExp(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int v = data[i];
+
+		// decode mantissa and exponent
+		int m = int(v << 8) >> 8;
+		int e = int(v) >> 24;
+
+		union
+		{
+			float f;
+			unsigned int ui;
+		} u;
+
+		// optimized version of ldexp(float(m), e)
+		u.ui = unsigned(e + 127) << 23;
+		u.f = u.f * float(m);
+
+		data[i] = u.ui;
+	}
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+inline uint64_t rotateleft64(uint64_t v, int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+	return _rotl64(v, x);
+// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
+// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
+#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
+	return __builtin_rotateleft64(v, x);
+#else
+	return (v << (x & 63)) | (v >> ((64 - x) & 63));
+#endif
+}
+#endif
+
+#ifdef SIMD_SSE
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const __m128 sign = _mm_set1_ps(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
+		__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll));
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// combine xr/yr/zr into final value
+		__m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000));
+		res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const __m128 sign = _mm_set1_ps(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+		__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
+		__m128i yf = _mm_srai_epi32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+		__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll));
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
+		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
+
+		// patch in .w
+		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
+		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+		__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
+		__m128i yf = _mm_srai_epi32(q4_xy, 16);
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
+		__m128i cf = _mm_srai_epi32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
+		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
+		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
+		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
+
+		__m128 s = _mm_set1_ps(32767.f);
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
+		__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
+
+		// store results to stack so that we can rotate using scalar instructions
+		uint64_t res[4];
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		__m128i ef = _mm_srai_epi32(v, 24);
+		__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8);
+		__m128 m = _mm_cvtepi32_ps(mf);
+
+		__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);
+
+		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
+inline float32x4_t vsqrtq_f32(float32x4_t x)
+{
+	float32x4_t r = vrsqrteq_f32(x);
+	r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
+	return vmulq_f32(r, x);
+}
+
+inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
+{
+	float32x4_t r = vrecpeq_f32(y);
+	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
+	return vmulq_f32(x, r);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
+		int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// combine xr/yr/zr into final value
+		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
+		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
+		int32x4_t yf = vshrq_n_s32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1];
+		int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
+		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);
+
+		// patch in .w
+		res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0);
+		res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
+		int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
+		int32x4_t yf = vshrq_n_s32(q4_xy, 16);
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
+		int32x4_t cf = vshrq_n_s32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
+		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
+		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
+
+		float32x4_t s = vdupq_n_f32(32767.f);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// rotate and store
+		uint64_t* out = (uint64_t*)&data[i * 4];
+
+		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
+		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
+		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
+		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		int32x4_t ef = vshrq_n_s32(v, 24);
+		int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
+		float32x4_t m = vcvtq_f32_s32(mf);
+
+		float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);
+
+		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t n4 = wasm_v128_load(&data[i * 4]);
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24);
+		v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0
+		// note: i32x4_min with 0 is equvalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// combine xr/yr/zr into final value
+		v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000));
+		res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff)));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16));
+
+		wasm_v128_store(&data[i * 4], res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f);
+	const v128_t zmask = wasm_i32x4_splat(0x7fff);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16);
+		v128_t yf = wasm_i32x4_shr(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
+		v128_t zf = wasm_v128_and(z4, zmask);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0
+		// note: i32x4_min with 0 is equvalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
+		v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
+
+		// patch in .w
+		res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
+		res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));
+
+		wasm_v128_store(&data[(i + 0) * 4], res_0);
+		wasm_v128_store(&data[(i + 2) * 4], res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
+		v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
+		v128_t yf = wasm_i32x4_shr(q4_xy, 16);
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
+		v128_t cf = wasm_i32x4_shr(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
+		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
+		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		// note: i32x4_max with 0 is equivalent to f32x4_max
+		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
+		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
+
+		v128_t s = wasm_f32x4_splat(32767.f);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
+		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
+
+		// compute component index shifted left by 4 (and moved into i32x4 slot)
+		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
+		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
+		out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
+		out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
+		out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t v = wasm_v128_load(&data[i]);
+
+		// decode exponent into 2^x directly
+		v128_t ef = wasm_i32x4_shr(v, 24);
+		v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value
+		v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8);
+		v128_t m = wasm_f32x4_convert_i32x4(mf);
+
+		v128_t r = wasm_f32x4_mul(es, m);
+
+		wasm_v128_store(&data[i], r);
+	}
+}
+#endif
+
+} // namespace meshopt
+
+void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0);
+	assert(vertex_size == 4 || vertex_size == 8);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	if (vertex_size == 4)
+		decodeFilterOctSimd(static_cast<signed char*>(buffer), vertex_count);
+	else
+		decodeFilterOctSimd(static_cast<short*>(buffer), vertex_count);
+#else
+	if (vertex_size == 4)
+		decodeFilterOct(static_cast<signed char*>(buffer), vertex_count);
+	else
+		decodeFilterOct(static_cast<short*>(buffer), vertex_count);
+#endif
+}
+
+void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0);
+	assert(vertex_size == 8);
+	(void)vertex_size;
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decodeFilterQuatSimd(static_cast<short*>(buffer), vertex_count);
+#else
+	decodeFilterQuat(static_cast<short*>(buffer), vertex_count);
+#endif
+}
+
+void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0);
+	assert(vertex_size % 4 == 0);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decodeFilterExpSimd(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
+#else
+	decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
+#endif
+}
+
+#undef SIMD_SSE
+#undef SIMD_NEON
+#undef SIMD_WASM
diff --git a/thirdparty/meshoptimizer/vfetchanalyzer.cpp b/thirdparty/meshoptimizer/vfetchanalyzer.cpp
new file mode 100644
index 0000000000..51dca873f8
--- /dev/null
+++ b/thirdparty/meshoptimizer/vfetchanalyzer.cpp
@@ -0,0 +1,58 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexFetchStatistics result = {};
+
+	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
+	memset(vertex_visited, 0, vertex_count);
+
+	const size_t kCacheLine = 64;
+	const size_t kCacheSize = 128 * 1024;
+
+	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+	size_t cache[kCacheSize / kCacheLine] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		vertex_visited[index] = 1;
+
+		size_t start_address = index * vertex_size;
+		size_t end_address = start_address + vertex_size;
+
+		size_t start_tag = start_address / kCacheLine;
+		size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
+
+		assert(start_tag < end_tag);
+
+		for (size_t tag = start_tag; tag < end_tag; ++tag)
+		{
+			size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
+
+			// we store +1 since cache is filled with 0 by default
+			result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
+			cache[line] = tag + 1;
+		}
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += vertex_visited[i];
+
+	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
+
+	return result;
+}
diff --git a/thirdparty/meshoptimizer/vfetchoptimizer.cpp b/thirdparty/meshoptimizer/vfetchoptimizer.cpp
new file mode 100644
index 0000000000..465d6df5ca
--- /dev/null
+++ b/thirdparty/meshoptimizer/vfetchoptimizer.cpp
@@ -0,0 +1,74 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	assert(index_count % 3 == 0);
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u)
+		{
+			destination[index] = next_vertex++;
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	// support in-place optimization
+	if (destination == vertices)
+	{
+		unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+		memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+		vertices = vertices_copy;
+	}
+
+	// build vertex remap table
+	unsigned int* vertex_remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(vertex_remap, -1, vertex_count * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		unsigned int& remap = vertex_remap[index];
+
+		if (remap == ~0u) // vertex was not added to destination VB
+		{
+			// add vertex
+			memcpy(static_cast<unsigned char*>(destination) + next_vertex * vertex_size, static_cast<const unsigned char*>(vertices) + index * vertex_size, vertex_size);
+
+			remap = next_vertex++;
+		}
+
+		// modify indices in place
+		indices[i] = remap;
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}