Merge pull request #41323 from lawnjelly/kessel_lightangles

GLES2 2D fix normal mapping - batching and nvidia workaround
2020-09-28 18:45:43 +02:00 · 2020-09-28 18:45:43 +02:00 · 422c279fcb
parent cd05197fb3 ecd39094ed
commit 422c279fcb
7 changed files with 691 additions and 327 deletions
--- a/drivers/gles2/rasterizer_array_gles2.h
+++ b/drivers/gles2/rasterizer_array_gles2.h
@ -71,6 +71,75 @@

 #include <string.h>

+// Very simple non-growable byte array that hands out fixed size 'units'. A unit can be
+// cast to whatever vertex format (FVF) is required, and the storage is created with
+// enough memory to hold the biggest FVF, which allows multiple FVFs to share the array.
+// NOTE: prepare() does not validate the unit size, and copying an instance is not safe.
+class RasterizerUnitArrayGLES2 {
+public:
+	RasterizerUnitArrayGLES2() {
+		_list = nullptr; // must be null before free(), which deletes any non-null list
+		free();
+	}
+	~RasterizerUnitArrayGLES2() { free(); }
+
+	// address of unit ui for the unit size set by the last prepare() (not bounds checked)
+	uint8_t *get_unit(unsigned int ui) { return &_list[ui * _unit_size_bytes]; }
+	const uint8_t *get_unit(unsigned int ui) const { return &_list[ui * _unit_size_bytes]; }
+
+	int size() const { return _size; } // units handed out since the last prepare()
+	int max_size() const { return _max_size; } // capacity in units
+
+	// release the memory (if any) and zero all of the bookkeeping
+	void free() {
+		if (_list) {
+			memdelete_arr(_list);
+			_list = nullptr;
+		}
+		_size = 0;
+		_max_size = 0;
+		_max_size_bytes = 0;
+		_unit_size_bytes = 0;
+		_max_unit_size_bytes = 0; // keeps every member initialized after construction
+	}
+
+	// (re)allocate room for p_max_size_units units of up to p_max_unit_size_bytes each
+	void create(int p_max_size_units, int p_max_unit_size_bytes) {
+		free();
+		_max_unit_size_bytes = p_max_unit_size_bytes;
+		_max_size = p_max_size_units;
+		_max_size_bytes = p_max_size_units * p_max_unit_size_bytes;
+
+		if (_max_size_bytes) {
+			_list = memnew_arr(uint8_t, _max_size_bytes);
+		}
+	}
+
+	// select the unit size (in bytes) used from now on, and empty the array
+	void prepare(int p_unit_size_bytes) {
+		_unit_size_bytes = p_unit_size_bytes;
+		_size = 0;
+	}
+
+	// several items at a time - returns the first of them, or nullptr if they do not fit
+	uint8_t *request(int p_num_items = 1) {
+		if (_size + p_num_items > _max_size) {
+			return nullptr; // full, the size is left as it was
+		}
+		int old_size = _size;
+		_size += p_num_items;
+		return get_unit(old_size);
+	}
+
+private:
+	uint8_t *_list;
+	int _size; // in units
+	int _max_size; // in units
+	int _max_size_bytes;
+	int _unit_size_bytes;
+	int _max_unit_size_bytes;
+};
+
 template <class T>
 class RasterizerArrayGLES2 {
 public:
--- a/drivers/gles2/rasterizer_canvas_base_gles2.cpp
+++ b/drivers/gles2/rasterizer_canvas_base_gles2.cpp
@ -52,8 +52,13 @@ void RasterizerCanvasBaseGLES2::light_internal_free(RID p_rid) {

 void RasterizerCanvasBaseGLES2::canvas_begin() {

-	state.canvas_shader.bind();
 	state.using_transparent_rt = false;
+
+	// always start with light_angle unset
+	state.using_light_angle = false;
+	state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_LIGHT_ANGLE, false);
+	state.canvas_shader.bind();
+
 	int viewport_x, viewport_y, viewport_width, viewport_height;

 	if (storage->frame.current_rt) {
@ -155,6 +160,16 @@ void RasterizerCanvasBaseGLES2::draw_generic_textured_rect(const Rect2 &p_rect,
 	glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
 }

+void RasterizerCanvasBaseGLES2::_set_texture_rect_mode(bool p_texture_rect, bool p_light_angle) {
+	// USE_TEXTURE_RECT is written on every call, USE_LIGHT_ANGLE only when the cached flag changes
+	state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, p_texture_rect);
+	if (state.using_light_angle == p_light_angle) {
+		return; // cached light angle state already matches the request
+	}
+	state.using_light_angle = p_light_angle;
+	state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_LIGHT_ANGLE, p_light_angle);
+}
+
 RasterizerStorageGLES2::Texture *RasterizerCanvasBaseGLES2::_bind_canvas_texture(const RID &p_texture, const RID &p_normal_map) {

 	RasterizerStorageGLES2::Texture *tex_return = NULL;
@ -595,12 +610,13 @@ void RasterizerCanvasBaseGLES2::_draw_generic_indices(GLuint p_primitive, const
 	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
 }

-void RasterizerCanvasBaseGLES2::_draw_gui_primitive(int p_points, const Vector2 *p_vertices, const Color *p_colors, const Vector2 *p_uvs) {
+void RasterizerCanvasBaseGLES2::_draw_gui_primitive(int p_points, const Vector2 *p_vertices, const Color *p_colors, const Vector2 *p_uvs, const float *p_light_angles) {

 	static const GLenum prim[5] = { GL_POINTS, GL_POINTS, GL_LINES, GL_TRIANGLES, GL_TRIANGLE_FAN };

 	int color_offset = 0;
 	int uv_offset = 0;
+	int light_angle_offset = 0;
 	int stride = 2;

 	if (p_colors) {
@ -613,7 +629,12 @@ void RasterizerCanvasBaseGLES2::_draw_gui_primitive(int p_points, const Vector2
 		stride += 2;
 	}

-	float buffer_data[(2 + 2 + 4) * 4];
+	if (p_light_angles) { //light_angles
+		light_angle_offset = stride;
+		stride += 1;
+	}
+
+	float buffer_data[(2 + 2 + 4 + 1) * 4];

 	for (int i = 0; i < p_points; i++) {
 		buffer_data[stride * i + 0] = p_vertices[i].x;
@ -636,6 +657,12 @@ void RasterizerCanvasBaseGLES2::_draw_gui_primitive(int p_points, const Vector2
 		}
 	}

+	if (p_light_angles) {
+		for (int i = 0; i < p_points; i++) {
+			buffer_data[stride * i + light_angle_offset + 0] = p_light_angles[i];
+		}
+	}
+
 	glBindBuffer(GL_ARRAY_BUFFER, data.polygon_buffer);
 #ifndef GLES_OVER_GL
 	// Orphan the buffer to avoid CPU/GPU sync points caused by glBufferSubData
@ -655,9 +682,19 @@ void RasterizerCanvasBaseGLES2::_draw_gui_primitive(int p_points, const Vector2
 		glEnableVertexAttribArray(VS::ARRAY_TEX_UV);
 	}

+	if (p_light_angles) {
+		glVertexAttribPointer(VS::ARRAY_TANGENT, 1, GL_FLOAT, GL_FALSE, stride * sizeof(float), CAST_INT_TO_UCHAR_PTR(light_angle_offset * sizeof(float)));
+		glEnableVertexAttribArray(VS::ARRAY_TANGENT);
+	}
+
 	glDrawArrays(prim[p_points], 0, p_points);
 	storage->info.render._2d_draw_call_count++;

+	if (p_light_angles) {
+		// may not be needed
+		glDisableVertexAttribArray(VS::ARRAY_TANGENT);
+	}
+
 	glBindBuffer(GL_ARRAY_BUFFER, 0);
 }

@ -993,7 +1030,7 @@ void RasterizerCanvasBaseGLES2::initialize() {

 	state.canvas_shader.init();

-	state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, true);
+	_set_texture_rect_mode(true);
 	state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_RGBA_SHADOWS, storage->config.use_rgba_2d_shadows);

 	state.canvas_shader.bind();
--- a/drivers/gles2/rasterizer_canvas_base_gles2.h
+++ b/drivers/gles2/rasterizer_canvas_base_gles2.h
@ -77,6 +77,7 @@ public:
 		LensDistortedShaderGLES2 lens_shader;

 		bool using_texture_rect;
+		bool using_light_angle;
 		bool using_ninepatch;
 		bool using_skeleton;

@ -112,7 +113,7 @@ public:
 	virtual void canvas_begin();
 	virtual void canvas_end();

-	void _draw_gui_primitive(int p_points, const Vector2 *p_vertices, const Color *p_colors, const Vector2 *p_uvs);
+	void _draw_gui_primitive(int p_points, const Vector2 *p_vertices, const Color *p_colors, const Vector2 *p_uvs, const float *p_light_angles = nullptr);
 	void _draw_polygon(const int *p_indices, int p_index_count, int p_vertex_count, const Vector2 *p_vertices, const Vector2 *p_uvs, const Color *p_colors, bool p_singlecolor, const float *p_weights = NULL, const int *p_bones = NULL);
 	void _draw_generic(GLuint p_primitive, int p_vertex_count, const Vector2 *p_vertices, const Vector2 *p_uvs, const Color *p_colors, bool p_singlecolor);
 	void _draw_generic_indices(GLuint p_primitive, const int *p_indices, int p_index_count, int p_vertex_count, const Vector2 *p_vertices, const Vector2 *p_uvs, const Color *p_colors, bool p_singlecolor);
@ -130,6 +131,7 @@ public:
 	virtual void canvas_debug_viewport_shadows(Light *p_lights_with_shadow);

 	RasterizerStorageGLES2::Texture *_bind_canvas_texture(const RID &p_texture, const RID &p_normal_map);
+	void _set_texture_rect_mode(bool p_texture_rect, bool p_light_angle = false);

 	void initialize();
 	void finalize();
--- a/drivers/gles2/rasterizer_canvas_gles2.cpp
+++ b/drivers/gles2/rasterizer_canvas_gles2.cpp
@ -55,6 +55,7 @@ RasterizerCanvasGLES2::BatchData::BatchData() {
 	index_buffer_size_units = 0;
 	index_buffer_size_bytes = 0;
 	use_colored_vertices = false;
+	use_light_angles = false;
 	settings_use_batching = false;
 	settings_max_join_item_commands = 0;
 	settings_colored_vertex_format_threshold = 0.0f;
@ -212,10 +213,14 @@ void RasterizerCanvasGLES2::_batch_upload_buffers() {
 	// orphan the old (for now)
 	glBufferData(GL_ARRAY_BUFFER, 0, 0, GL_DYNAMIC_DRAW);

-	if (!bdata.use_colored_vertices) {
-		glBufferData(GL_ARRAY_BUFFER, sizeof(BatchVertex) * bdata.vertices.size(), bdata.vertices.get_data(), GL_DYNAMIC_DRAW);
+	if (!bdata.use_light_angles) {
+		if (!bdata.use_colored_vertices) {
+			glBufferData(GL_ARRAY_BUFFER, sizeof(BatchVertex) * bdata.vertices.size(), bdata.vertices.get_data(), GL_DYNAMIC_DRAW);
+		} else {
+			glBufferData(GL_ARRAY_BUFFER, sizeof(BatchVertexColored) * bdata.unit_vertices.size(), bdata.unit_vertices.get_unit(0), GL_DYNAMIC_DRAW);
+		}
 	} else {
-		glBufferData(GL_ARRAY_BUFFER, sizeof(BatchVertexColored) * bdata.vertices_colored.size(), bdata.vertices_colored.get_data(), GL_DYNAMIC_DRAW);
+		glBufferData(GL_ARRAY_BUFFER, sizeof(BatchVertexLightAngled) * bdata.unit_vertices.size(), bdata.unit_vertices.get_unit(0), GL_DYNAMIC_DRAW);
 	}

 	// might not be necessary
@ -251,10 +256,6 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
 	int command_count = p_item->commands.size();
 	Item::Command *const *commands = p_item->commands.ptr();

-	// locals, might be more efficient in a register (check)
-	Vector2 texpixel_size = r_fill_state.texpixel_size;
-	const float uv_epsilon = bdata.settings_uv_contract_amount;
-
 	// checking the color for not being white makes it 92/90 times faster in the case where it is white
 	bool multiply_final_modulate = false;
 	if (!r_fill_state.use_hardware_transform && (r_fill_state.final_modulate != Color(1, 1, 1, 1))) {
@ -316,196 +317,21 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_

 				Item::CommandRect *rect = static_cast<Item::CommandRect *>(command);

-				bool change_batch = false;
+				// unoptimized - could this be done once per batch / batch texture?
+				bool send_light_angles = rect->normal_map != RID();

-				// conditions for creating a new batch
-				if (r_fill_state.curr_batch->type != Batch::BT_RECT) {
-					change_batch = true;
+				bool buffer_full = false;

-					// check for special case if there is only a single or small number of rects,
-					// in which case we will use the legacy default rect renderer
-					// because it is faster for single rects
-
-					// we only want to do this if not a joined item with more than 1 item,
-					// because joined items with more than 1, the command * will be incorrect
-					// NOTE - this is assuming that use_hardware_transform means that it is a non-joined item!!
-					// If that assumption is incorrect this will go horribly wrong.
-					if (bdata.settings_use_single_rect_fallback && r_fill_state.use_hardware_transform) {
-						bool is_single_rect = false;
-						int command_num_next = command_num + 1;
-						if (command_num_next < command_count) {
-							Item::Command *command_next = commands[command_num_next];
-							if ((command_next->type != Item::Command::TYPE_RECT) && (command_next->type != Item::Command::TYPE_TRANSFORM)) {
-								is_single_rect = true;
-							}
-						} else {
-							is_single_rect = true;
-						}
-						// if it is a rect on its own, do exactly the same as the default routine
-						if (is_single_rect) {
-							_prefill_default_batch(r_fill_state, command_num, *p_item);
-							break;
-						}
-					} // if use hardware transform
+				// the template params must be explicit for compilation,
+				// this forces building the multiple versions of the function.
+				if (send_light_angles) {
+					buffer_full = prefill_rect<true>(rect, r_fill_state, r_command_start, command_num, command_count, commands, p_item, multiply_final_modulate);
+				} else {
+					buffer_full = prefill_rect<false>(rect, r_fill_state, r_command_start, command_num, command_count, commands, p_item, multiply_final_modulate);
 				}

-				Color col = rect->modulate;
-				if (multiply_final_modulate) {
-					col *= r_fill_state.final_modulate;
-				}
-
-				// instead of doing all the texture preparation for EVERY rect,
-				// we build a list of texture combinations and do this once off.
-				// This means we have a potentially rather slow step to identify which texture combo
-				// using the RIDs.
-				int old_batch_tex_id = r_fill_state.batch_tex_id;
-				r_fill_state.batch_tex_id = _batch_find_or_create_tex(rect->texture, rect->normal_map, rect->flags & CANVAS_RECT_TILE, old_batch_tex_id);
-
-				// try to create vertices BEFORE creating a batch,
-				// because if the vertex buffer is full, we need to finish this
-				// function, draw what we have so far, and then start a new set of batches
-
-				// request FOUR vertices at a time, this is more efficient
-				BatchVertex *bvs = bdata.vertices.request(4);
-				if (!bvs) {
-					// run out of space in the vertex buffer .. finish this function and draw what we have so far
-					// return where we got to
-					r_command_start = command_num;
+				if (buffer_full)
 					return true;
-				}
-
-				// conditions for creating a new batch
-				if (old_batch_tex_id != r_fill_state.batch_tex_id) {
-					change_batch = true;
-				}
-
-				// we need to treat color change separately because we need to count these
-				// to decide whether to switch on the fly to colored vertices.
-				if (!r_fill_state.curr_batch->color.equals(col)) {
-					change_batch = true;
-					bdata.total_color_changes++;
-				}
-
-				if (change_batch) {
-					// put the tex pixel size  in a local (less verbose and can be a register)
-					const BatchTex &batchtex = bdata.batch_textures[r_fill_state.batch_tex_id];
-					batchtex.tex_pixel_size.to(texpixel_size);
-
-					if (bdata.settings_uv_contract) {
-						r_fill_state.contract_uvs = (batchtex.flags & VS::TEXTURE_FLAG_FILTER) == 0;
-					}
-
-					// need to preserve texpixel_size between items
-					r_fill_state.texpixel_size = texpixel_size;
-
-					// open new batch (this should never fail, it dynamically grows)
-					r_fill_state.curr_batch = _batch_request_new(false);
-
-					r_fill_state.curr_batch->type = Batch::BT_RECT;
-					r_fill_state.curr_batch->color.set(col);
-					r_fill_state.curr_batch->batch_texture_id = r_fill_state.batch_tex_id;
-					r_fill_state.curr_batch->first_command = command_num;
-					r_fill_state.curr_batch->num_commands = 1;
-					r_fill_state.curr_batch->first_quad = bdata.total_quads;
-				} else {
-					// we could alternatively do the count when closing a batch .. perhaps more efficient
-					r_fill_state.curr_batch->num_commands++;
-				}
-
-				// fill the quad geometry
-				Vector2 mins = rect->rect.position;
-
-				if (r_fill_state.transform_mode == TM_TRANSLATE) {
-					_software_transform_vertex(mins, r_fill_state.transform_combined);
-				}
-
-				Vector2 maxs = mins + rect->rect.size;
-
-				// just aliases
-				BatchVertex *bA = &bvs[0];
-				BatchVertex *bB = &bvs[1];
-				BatchVertex *bC = &bvs[2];
-				BatchVertex *bD = &bvs[3];
-
-				bA->pos.x = mins.x;
-				bA->pos.y = mins.y;
-
-				bB->pos.x = maxs.x;
-				bB->pos.y = mins.y;
-
-				bC->pos.x = maxs.x;
-				bC->pos.y = maxs.y;
-
-				bD->pos.x = mins.x;
-				bD->pos.y = maxs.y;
-
-				if (rect->rect.size.x < 0) {
-					SWAP(bA->pos, bB->pos);
-					SWAP(bC->pos, bD->pos);
-				}
-				if (rect->rect.size.y < 0) {
-					SWAP(bA->pos, bD->pos);
-					SWAP(bB->pos, bC->pos);
-				}
-
-				if (r_fill_state.transform_mode == TM_ALL) {
-					_software_transform_vertex(bA->pos, r_fill_state.transform_combined);
-					_software_transform_vertex(bB->pos, r_fill_state.transform_combined);
-					_software_transform_vertex(bC->pos, r_fill_state.transform_combined);
-					_software_transform_vertex(bD->pos, r_fill_state.transform_combined);
-				}
-
-				// uvs
-				Vector2 src_min;
-				Vector2 src_max;
-				if (rect->flags & CANVAS_RECT_REGION) {
-					src_min = rect->source.position;
-					src_max = src_min + rect->source.size;
-
-					src_min *= texpixel_size;
-					src_max *= texpixel_size;
-
-					// nudge offset for the maximum to prevent precision error on GPU reading into line outside the source rect
-					// this is very difficult to get right.
-					if (r_fill_state.contract_uvs) {
-						src_min.x += uv_epsilon;
-						src_min.y += uv_epsilon;
-						src_max.x -= uv_epsilon;
-						src_max.y -= uv_epsilon;
-					}
-				} else {
-					src_min = Vector2(0, 0);
-					src_max = Vector2(1, 1);
-				}
-
-				// 10% faster calculating the max first
-				Vector2 uvs[4] = {
-					src_min,
-					Vector2(src_max.x, src_min.y),
-					src_max,
-					Vector2(src_min.x, src_max.y),
-				};
-
-				if (rect->flags & CANVAS_RECT_TRANSPOSE) {
-					SWAP(uvs[1], uvs[3]);
-				}
-
-				if (rect->flags & CANVAS_RECT_FLIP_H) {
-					SWAP(uvs[0], uvs[1]);
-					SWAP(uvs[2], uvs[3]);
-				}
-				if (rect->flags & CANVAS_RECT_FLIP_V) {
-					SWAP(uvs[0], uvs[3]);
-					SWAP(uvs[1], uvs[2]);
-				}
-
-				bA->uv.set(uvs[0]);
-				bB->uv.set(uvs[1]);
-				bC->uv.set(uvs[2]);
-				bD->uv.set(uvs[3]);
-
-				// increment quad count
-				bdata.total_quads++;

 			} break;
 		}
@ -519,119 +345,29 @@ bool RasterizerCanvasGLES2::prefill_joined_item(FillState &r_fill_state, int &r_
 	return false;
 }

-// convert the stupidly high amount of batches (each with its own color)
-// to larger batches where the color is stored in the verts instead...
-// There is a trade off. Non colored verts are smaller so work faster, but
-// there comes a point where it is better to just use colored verts to avoid lots of
-// batches.
-void RasterizerCanvasGLES2::_batch_translate_to_colored() {
-	bdata.vertices_colored.reset();
-	bdata.batches_temp.reset();
-
-	// As the vertices_colored and batches_temp are 'mirrors' of the non-colored version,
-	// the sizes should be equal, and allocations should never fail. Hence the use of debug
-	// asserts to check program flow, these should not occur at runtime unless the allocation
-	// code has been altered.
-#ifdef DEBUG_ENABLED
-	CRASH_COND(bdata.vertices_colored.max_size() != bdata.vertices.max_size());
-	CRASH_COND(bdata.batches_temp.max_size() != bdata.batches.max_size());
-#endif
-
-	Color curr_col(-1.0, -1.0, -1.0, -1.0);
-
-	Batch *dest_batch = 0;
-
-	// translate the batches into vertex colored batches
-	for (int n = 0; n < bdata.batches.size(); n++) {
-		const Batch &source_batch = bdata.batches[n];
-
-		bool needs_new_batch = true;
-
-		if (dest_batch) {
-			if (dest_batch->type == source_batch.type) {
-				if (source_batch.type == Batch::BT_RECT) {
-					if (dest_batch->batch_texture_id == source_batch.batch_texture_id) {
-						// add to previous batch
-						dest_batch->num_commands += source_batch.num_commands;
-						needs_new_batch = false;
-
-						// create the colored verts (only if not default)
-						int first_vert = source_batch.first_quad * 4;
-						int end_vert = 4 * (source_batch.first_quad + source_batch.num_commands);
-
-						for (int v = first_vert; v < end_vert; v++) {
-							const BatchVertex &bv = bdata.vertices[v];
-							BatchVertexColored *cv = bdata.vertices_colored.request();
-#ifdef DEBUG_ENABLED
-							CRASH_COND(!cv);
-#endif
-							cv->pos = bv.pos;
-							cv->uv = bv.uv;
-							cv->col = source_batch.color;
-						}
-					} // textures match
-				} else {
-					// default
-					// we can still join, but only under special circumstances
-					// does this ever happen? not sure at this stage, but left for future expansion
-					uint32_t source_last_command = source_batch.first_command + source_batch.num_commands;
-					if (source_last_command == dest_batch->first_command) {
-						dest_batch->num_commands += source_batch.num_commands;
-						needs_new_batch = false;
-					} // if the commands line up exactly
-				}
-			} // if both batches are the same type
-
-		} // if dest batch is valid
-
-		if (needs_new_batch) {
-			dest_batch = bdata.batches_temp.request();
-#ifdef DEBUG_ENABLED
-			CRASH_COND(!dest_batch);
-#endif
-
-			*dest_batch = source_batch;
-
-			// create the colored verts (only if not default)
-			if (source_batch.type != Batch::BT_DEFAULT) {
-				int first_vert = source_batch.first_quad * 4;
-				int end_vert = 4 * (source_batch.first_quad + source_batch.num_commands);
-
-				for (int v = first_vert; v < end_vert; v++) {
-					const BatchVertex &bv = bdata.vertices[v];
-					BatchVertexColored *cv = bdata.vertices_colored.request();
-#ifdef DEBUG_ENABLED
-					CRASH_COND(!cv);
-#endif
-					cv->pos = bv.pos;
-					cv->uv = bv.uv;
-					cv->col = source_batch.color;
-				}
-			}
-		}
-	}
-
-	// copy the temporary batches to the master batch list (this could be avoided but it makes the code cleaner)
-	bdata.batches.copy_from(bdata.batches_temp);
-}
-
 void RasterizerCanvasGLES2::_batch_render_rects(const Batch &p_batch, RasterizerStorageGLES2::Material *p_material) {

 	ERR_FAIL_COND(p_batch.num_commands <= 0);

 	const bool &colored_verts = bdata.use_colored_vertices;
+	const bool &use_light_angles = bdata.use_light_angles;
+
 	int sizeof_vert;
-	if (!colored_verts) {
-		sizeof_vert = sizeof(BatchVertex);
+	if (!use_light_angles) {
+		if (!colored_verts) {
+			sizeof_vert = sizeof(BatchVertex);
+		} else {
+			sizeof_vert = sizeof(BatchVertexColored);
+		}
 	} else {
-		sizeof_vert = sizeof(BatchVertexColored);
+		sizeof_vert = sizeof(BatchVertexLightAngled);
 	}

 	// batch tex
 	const BatchTex &tex = bdata.batch_textures[p_batch.batch_texture_id];

 	// make sure to set all conditionals BEFORE binding the shader
-	state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+	_set_texture_rect_mode(false, use_light_angles);

 	// force repeat is set if non power of 2 texture, and repeat is needed if hardware doesn't support npot
 	if (tex.tile_mode == BatchTex::TILE_FORCE_REPEAT) {
@ -665,6 +401,11 @@ void RasterizerCanvasGLES2::_batch_render_rects(const Batch &p_batch, Rasterizer
 		glEnableVertexAttribArray(VS::ARRAY_COLOR);
 	}

+	if (use_light_angles) {
+		glVertexAttribPointer(VS::ARRAY_TANGENT, 1, GL_FLOAT, GL_FALSE, sizeof_vert, CAST_INT_TO_UCHAR_PTR(pointer + (8 * 4)));
+		glEnableVertexAttribArray(VS::ARRAY_TANGENT);
+	}
+
 	// We only want to set the GL wrapping mode if the texture is not already tiled (i.e. set in Import).
 	// This  is an optimization left over from the legacy renderer.
 	// If we DID set tiling in the API, and reverted to clamped, then the next draw using this texture
@ -707,8 +448,10 @@ void RasterizerCanvasGLES2::_batch_render_rects(const Batch &p_batch, Rasterizer
 		} break;
 	}

+	// could these have ifs?
 	glDisableVertexAttribArray(VS::ARRAY_TEX_UV);
 	glDisableVertexAttribArray(VS::ARRAY_COLOR);
+	glDisableVertexAttribArray(VS::ARRAY_TANGENT);

 	// may not be necessary .. state change optimization still TODO
 	glBindBuffer(GL_ARRAY_BUFFER, 0);
@ -848,7 +591,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite

 							Item::CommandLine *line = static_cast<Item::CommandLine *>(command);

-							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+							_set_texture_rect_mode(false);
 							if (state.canvas_shader.bind()) {
 								_set_uniforms();
 								state.canvas_shader.use_material((void *)p_material);
@ -934,7 +677,17 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite
 							// To work it around, we use a simpler draw method which does not flicker, but gives
 							// a non negligible performance hit, so it's opt-in (GH-24466).
 							if (use_nvidia_rect_workaround) {
-								state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+
+								// are we using normal maps, if so we want to use light angle
+								bool send_light_angles = false;
+
+								// only need to use light angles when normal mapping
+								// otherwise we can use the default shader
+								if (state.current_normal != RID()) {
+									send_light_angles = true;
+								}
+
+								_set_texture_rect_mode(false, send_light_angles);

 								if (state.canvas_shader.bind()) {
 									_set_uniforms();
@ -971,6 +724,10 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite
 										src_rect.position + Vector2(0.0, src_rect.size.y),
 									};

+									// for encoding in light angle
+									bool flip_h = false;
+									bool flip_v = false;
+
 									if (r->flags & CANVAS_RECT_TRANSPOSE) {
 										SWAP(uvs[1], uvs[3]);
 									}
@ -978,10 +735,13 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite
 									if (r->flags & CANVAS_RECT_FLIP_H) {
 										SWAP(uvs[0], uvs[1]);
 										SWAP(uvs[2], uvs[3]);
+										flip_h = true;
+										flip_v = !flip_v;
 									}
 									if (r->flags & CANVAS_RECT_FLIP_V) {
 										SWAP(uvs[0], uvs[3]);
 										SWAP(uvs[1], uvs[2]);
+										flip_v = !flip_v;
 									}

 									state.canvas_shader.set_uniform(CanvasShaderGLES2::COLOR_TEXPIXEL_SIZE, texpixel_size);
@ -994,7 +754,33 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite
 										untile = true;
 									}

-									_draw_gui_primitive(4, points, NULL, uvs);
+									if (send_light_angles) {
+										// for single rects, there is no need to fully utilize the light angle,
+										// we only need it to encode flips (horz and vert). But the shader can be reused with
+										// batching in which case the angle encodes the transform as well as
+										// the flips.
+										// Note transpose is NYI. I don't think it worked either with the non-nvidia method.
+
+										// if horizontal flip, angle is 180
+										float angle = 0.0f;
+										if (flip_h)
+											angle = Math_PI;
+
+										// add 1 (to take care of zero floating point error with sign)
+										angle += 1.0f;
+
+										// flip if necessary
+										if (flip_v)
+											angle *= -1.0f;
+
+										// light angle must be sent for each vert, instead as a single uniform in the uniform draw method
+										// this has the benefit of enabling batching with light angles.
+										float light_angles[4] = { angle, angle, angle, angle };
+
+										_draw_gui_primitive(4, points, NULL, uvs, light_angles);
+									} else {
+										_draw_gui_primitive(4, points, NULL, uvs);
+									}

 									if (untile) {
 										glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
@ -1016,7 +802,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite
 								// This branch is better for performance, but can produce flicker on Nvidia, see above comment.
 								_bind_quad_buffer();

-								state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, true);
+								_set_texture_rect_mode(true);

 								if (state.canvas_shader.bind()) {
 									_set_uniforms();
@ -1104,7 +890,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite

 							Item::CommandNinePatch *np = static_cast<Item::CommandNinePatch *>(command);

-							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+							_set_texture_rect_mode(false);
 							if (state.canvas_shader.bind()) {
 								_set_uniforms();
 								state.canvas_shader.use_material((void *)p_material);
@ -1280,7 +1066,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite

 							Item::CommandCircle *circle = static_cast<Item::CommandCircle *>(command);

-							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+							_set_texture_rect_mode(false);

 							if (state.canvas_shader.bind()) {
 								_set_uniforms();
@ -1310,7 +1096,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite

 							Item::CommandPolygon *polygon = static_cast<Item::CommandPolygon *>(command);

-							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+							_set_texture_rect_mode(false);

 							if (state.canvas_shader.bind()) {
 								_set_uniforms();
@ -1340,7 +1126,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite
 						case Item::Command::TYPE_MESH: {

 							Item::CommandMesh *mesh = static_cast<Item::CommandMesh *>(command);
-							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+							_set_texture_rect_mode(false);

 							if (state.canvas_shader.bind()) {
 								_set_uniforms();
@ -1416,7 +1202,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite

 							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_INSTANCE_CUSTOM, multi_mesh->custom_data_format != VS::MULTIMESH_CUSTOM_DATA_NONE);
 							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_INSTANCING, true);
-							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+							_set_texture_rect_mode(false);

 							if (state.canvas_shader.bind()) {
 								_set_uniforms();
@ -1520,7 +1306,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite
 								}
 							}

-							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_INSTANCE_CUSTOM, false);
+							_set_texture_rect_mode(false);
 							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_INSTANCING, false);

 							storage->info.render._2d_draw_call_count++;
@ -1580,7 +1366,7 @@ void RasterizerCanvasGLES2::render_batches(Item::Command *const *p_commands, Ite
 						case Item::Command::TYPE_PRIMITIVE: {

 							Item::CommandPrimitive *primitive = static_cast<Item::CommandPrimitive *>(command);
-							state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, false);
+							_set_texture_rect_mode(false);

 							if (state.canvas_shader.bind()) {
 								_set_uniforms();
@ -1732,23 +1518,28 @@ void RasterizerCanvasGLES2::flush_render_batches(Item *p_first_item, Item *p_cur
 	// .. however probably not necessary
 	bdata.use_colored_vertices = false;

-	// only check whether to convert if there are quads (prevent divide by zero)
-	// and we haven't decided to prevent color baking (due to e.g. MODULATE
-	// being used in a shader)
-	if (bdata.total_quads && !(bdata.joined_item_batch_flags & RasterizerStorageGLES2::Shader::CanvasItem::PREVENT_COLOR_BAKING)) {
-		// minus 1 to prevent single primitives (ratio 1.0) always being converted to colored..
-		// in that case it is slightly cheaper to just have the color as part of the batch
-		float ratio = (float)(bdata.total_color_changes - 1) / (float)bdata.total_quads;
+	if (bdata.use_light_angles) {
+		_translate_batches_to_larger_FVF<BatchVertexLightAngled, true>();
+	} else {
+		// only check whether to convert if there are quads (prevent divide by zero)
+		// and we haven't decided to prevent color baking (due to e.g. MODULATE
+		// being used in a shader)
+		if (bdata.total_quads && !(bdata.joined_item_batch_flags & RasterizerStorageGLES2::Shader::CanvasItem::PREVENT_COLOR_BAKING)) {
+			// minus 1 to prevent single primitives (ratio 1.0) always being converted to colored..
+			// in that case it is slightly cheaper to just have the color as part of the batch
+			float ratio = (float)(bdata.total_color_changes - 1) / (float)bdata.total_quads;

-		// use bigger than or equal so that 0.0 threshold can force always using colored verts
-		if (ratio >= bdata.settings_colored_vertex_format_threshold) {
-			bdata.use_colored_vertices = true;
+			// use bigger than or equal so that 0.0 threshold can force always using colored verts
+			if (ratio >= bdata.settings_colored_vertex_format_threshold) {
+				bdata.use_colored_vertices = true;

-			// small perf cost versus going straight to colored verts (maybe around 10%)
-			// however more straightforward
-			_batch_translate_to_colored();
+				// small perf cost versus going straight to colored verts (maybe around 10%)
+				// however more straightforward
+				_translate_batches_to_larger_FVF<BatchVertexColored, false>();
+				//_batch_translate_to_colored();
+			}
 		}
-	}
+	} // if not using light angles

 	// send buffers to opengl
 	_batch_upload_buffers();
@ -3517,9 +3308,12 @@ void RasterizerCanvasGLES2::initialize() {
 	bdata.vertex_buffer_size_bytes = bdata.vertex_buffer_size_units * sizeof_batch_vert;
 	bdata.index_buffer_size_bytes = bdata.index_buffer_size_units * 2; // 16 bit inds

-	// create equal number of norma and colored verts (as the normal may need to be translated to colored)
+	// create equal number of normal and (max) unit sized verts (as the normal may need to be translated to a larger FVF)
 	bdata.vertices.create(bdata.vertex_buffer_size_units); // 512k
-	bdata.vertices_colored.create(bdata.vertices.max_size()); // 1024k
+	bdata.unit_vertices.create(bdata.vertices.max_size(), sizeof(BatchVertexLightAngled));
+
+	// extra data per vert needed for larger FVFs
+	bdata.light_angles.create(bdata.vertices.max_size());

 	// num batches will be auto increased dynamically if required
 	bdata.batches.create(1024);
--- a/drivers/gles2/rasterizer_canvas_gles2.h
+++ b/drivers/gles2/rasterizer_canvas_gles2.h
@ -89,6 +89,11 @@ class RasterizerCanvasGLES2 : public RasterizerCanvasBaseGLES2 {
 		BatchColor col;
 	};

+	struct BatchVertexLightAngled : public BatchVertexColored {
+		// must be pod (RasterizerUnitArrayGLES2 stores these as raw bytes and never runs constructors)
+		float light_angle; // encoded per-vertex light angle: (angle in radians + 1.0), negated to signal a vertical flip; decoded in canvas.glsl
+	};
+
 	struct Batch {
 		enum CommandType : uint32_t {
 			BT_DEFAULT,
@ -167,10 +172,13 @@ class RasterizerCanvasGLES2 : public RasterizerCanvasBaseGLES2 {
 		void reset_flush() {
 			batches.reset();
 			batch_textures.reset();
+
 			vertices.reset();
+			light_angles.reset(); // light angles are requested 4 per rect alongside the verts in prefill_rect, so they are reset together
 
 			total_quads = 0;
 			total_color_changes = 0;
+			use_light_angles = false; // switched back on by prefill_rect<true> when a rect needs light angles
 		}

 		GLuint gl_vertex_buffer;
@ -182,13 +190,28 @@ class RasterizerCanvasGLES2 : public RasterizerCanvasBaseGLES2 {
 		uint32_t index_buffer_size_units;
 		uint32_t index_buffer_size_bytes;

+		// small vertex FVF type - pos and UV.
+		// This will always be written to initially, but can be translated
+		// to larger FVFs if necessary.
 		RasterizerArrayGLES2<BatchVertex> vertices;
-		RasterizerArrayGLES2<BatchVertexColored> vertices_colored;
+
+		// extra data which can be stored during prefilling, for later translation to larger FVFs
+		RasterizerArrayGLES2<float> light_angles;
+
+		// instead of having a different buffer for each vertex FVF type
+		// we have a special array big enough for the biggest FVF
+		// which can have a changeable unit size, and reuse it.
+		RasterizerUnitArrayGLES2 unit_vertices;
+
 		RasterizerArrayGLES2<Batch> batches;
 		RasterizerArrayGLES2<Batch> batches_temp; // used for translating to colored vertex batches
 		RasterizerArray_non_pod_GLES2<BatchTex> batch_textures; // the only reason this is non-POD is because of RIDs

+		// flexible vertex format.
+		// all verts have pos and UV.
+		// some have color, some light angles etc.
 		bool use_colored_vertices;
+		bool use_light_angles;

 		RasterizerArrayGLES2<BItemJoined> items_joined;
 		RasterizerArrayGLES2<BItemRef> item_refs;
@ -321,11 +344,12 @@ private:
 	bool try_join_item(Item *p_ci, RenderItemState &r_ris, bool &r_batch_break);
 	void render_joined_item_commands(const BItemJoined &p_bij, Item *p_current_clip, bool &r_reclip, RasterizerStorageGLES2::Material *p_material, bool p_lit);
 	void render_batches(Item::Command *const *p_commands, Item *p_current_clip, bool &r_reclip, RasterizerStorageGLES2::Material *p_material);
+
 	bool prefill_joined_item(FillState &r_fill_state, int &r_command_start, Item *p_item, Item *p_current_clip, bool &r_reclip, RasterizerStorageGLES2::Material *p_material);
+
 	void flush_render_batches(Item *p_first_item, Item *p_current_clip, bool &r_reclip, RasterizerStorageGLES2::Material *p_material);

 	// low level batch funcs
-	void _batch_translate_to_colored();
 	int _batch_find_or_create_tex(const RID &p_texture, const RID &p_normal, bool p_tile, int p_previous_match);
 	RasterizerStorageGLES2::Texture *_get_canvas_texture(const RID &p_texture) const;
 	void _batch_upload_buffers();
@ -358,6 +382,13 @@ private:
 public:
 	void initialize();
 	RasterizerCanvasGLES2();
+
+private:
+	template <bool SEND_LIGHT_ANGLES>
+	bool prefill_rect(Item::CommandRect *rect, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, Item::Command *const *commands, Item *p_item, bool multiply_final_modulate);
+
+	template <class BATCH_VERTEX_TYPE, bool INCLUDE_LIGHT_ANGLES>
+	void _translate_batches_to_larger_FVF();
 };

 //////////////////////////////////////////////////////////////
@ -492,4 +523,407 @@ inline bool RasterizerCanvasGLES2::_sort_items_match(const BSortItem &p_a, const
 	return true;
 }

+//////////////////////////////////////////////////////////////
+// TEMPLATE FUNCS
+
+// Translates the small (pos + UV) vertices and their batches into a larger FVF.
+//
+// Translation always involves adding color to the FVF, which enables
+// joining of batches that have different colors.
+// There is a trade off. Non colored verts are smaller so work faster, but
+// there comes a point where it is better to just use colored verts to avoid lots of
+// batches.
+// In addition this can optionally add light angles to the FVF, necessary for normal mapping.
+//
+// BATCH_VERTEX_TYPE    - destination vertex type. Must have pos, uv and col members
+//                        (BatchVertexColored or a type derived from it).
+// INCLUDE_LIGHT_ANGLES - when true, BATCH_VERTEX_TYPE must be BatchVertexLightAngled, and a
+//                        light angle is written per vertex, consumed sequentially from
+//                        bdata.light_angles for batches whose texture has a normal map.
+//
+// Reads bdata.vertices / bdata.batches, writes bdata.unit_vertices, and finally replaces
+// bdata.batches with the (possibly joined) translated batches.
+template <class BATCH_VERTEX_TYPE, bool INCLUDE_LIGHT_ANGLES>
+void RasterizerCanvasGLES2::_translate_batches_to_larger_FVF() {
+
+	// zeros the size and sets up how big each unit is
+	bdata.unit_vertices.prepare(sizeof(BATCH_VERTEX_TYPE));
+	bdata.batches_temp.reset();
+
+	// As the unit_vertices and batches_temp are 'mirrors' of the non-colored version,
+	// the sizes should be equal, and allocations should never fail. Hence the use of debug
+	// asserts to check program flow, these should not occur at runtime unless the allocation
+	// code has been altered.
+#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED)
+	CRASH_COND(bdata.unit_vertices.max_size() != bdata.vertices.max_size());
+	CRASH_COND(bdata.batches_temp.max_size() != bdata.batches.max_size());
+#endif
+
+	// the destination batch currently being built (source batches may be joined onto it)
+	Batch *dest_batch = nullptr;
+
+	// light angles were stored in rect order during prefill, only for rects that needed them,
+	// so they are read back sequentially here rather than indexed by vertex
+	const float *source_light_angles = &bdata.light_angles[0];
+
+	// translate the batches into vertex colored batches
+	for (int n = 0; n < bdata.batches.size(); n++) {
+		const Batch &source_batch = bdata.batches[n];
+
+		// does source batch use light angles?
+		const BatchTex &btex = bdata.batch_textures[source_batch.batch_texture_id];
+		bool source_batch_uses_light_angles = btex.RID_normal != RID();
+
+		bool needs_new_batch = true;
+
+		if (dest_batch) {
+			if (dest_batch->type == source_batch.type) {
+				if (source_batch.type == Batch::BT_RECT) {
+					if (dest_batch->batch_texture_id == source_batch.batch_texture_id) {
+						// add to previous batch
+						dest_batch->num_commands += source_batch.num_commands;
+						needs_new_batch = false;
+
+						// create the colored verts (only if not default)
+						int first_vert = source_batch.first_quad * 4;
+						int end_vert = 4 * (source_batch.first_quad + source_batch.num_commands);
+
+						for (int v = first_vert; v < end_vert; v++) {
+							const BatchVertex &bv = bdata.vertices[v];
+							// the unit size was set to sizeof(BATCH_VERTEX_TYPE) in prepare(), so cast to that type
+							BATCH_VERTEX_TYPE *cv = (BATCH_VERTEX_TYPE *)bdata.unit_vertices.request();
+#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED)
+							CRASH_COND(!cv);
+#endif
+							cv->pos = bv.pos;
+							cv->uv = bv.uv;
+							cv->col = source_batch.color;
+
+							if (INCLUDE_LIGHT_ANGLES) {
+								// this is required to allow compilation with non light angle vertex.
+								// it should be compiled out.
+								BatchVertexLightAngled *lv = (BatchVertexLightAngled *)cv;
+								if (source_batch_uses_light_angles)
+									lv->light_angle = *source_light_angles++;
+								else
+									lv->light_angle = 0.0f; // dummy, unused in vertex shader (could possibly be left uninitialized, but probably bad idea)
+							}
+						}
+					} // textures match
+				} else {
+					// default
+					// we can still join, but only under special circumstances
+					// does this ever happen? not sure at this stage, but left for future expansion
+					uint32_t source_last_command = source_batch.first_command + source_batch.num_commands;
+					if (source_last_command == dest_batch->first_command) {
+						dest_batch->num_commands += source_batch.num_commands;
+						needs_new_batch = false;
+					} // if the commands line up exactly
+				}
+			} // if both batches are the same type
+
+		} // if dest batch is valid
+
+		if (needs_new_batch) {
+			dest_batch = bdata.batches_temp.request();
+#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED)
+			CRASH_COND(!dest_batch);
+#endif
+
+			*dest_batch = source_batch;
+
+			// create the colored verts (only if not default)
+			if (source_batch.type != Batch::BT_DEFAULT) {
+				int first_vert = source_batch.first_quad * 4;
+				int end_vert = 4 * (source_batch.first_quad + source_batch.num_commands);
+
+				for (int v = first_vert; v < end_vert; v++) {
+					const BatchVertex &bv = bdata.vertices[v];
+					// the unit size was set to sizeof(BATCH_VERTEX_TYPE) in prepare(), so cast to that type
+					BATCH_VERTEX_TYPE *cv = (BATCH_VERTEX_TYPE *)bdata.unit_vertices.request();
+#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED)
+					CRASH_COND(!cv);
+#endif
+					cv->pos = bv.pos;
+					cv->uv = bv.uv;
+					cv->col = source_batch.color;
+
+					if (INCLUDE_LIGHT_ANGLES) {
+						// this is required to allow compilation with non light angle vertex.
+						// it should be compiled out.
+						BatchVertexLightAngled *lv = (BatchVertexLightAngled *)cv;
+						if (source_batch_uses_light_angles)
+							lv->light_angle = *source_light_angles++;
+						else
+							lv->light_angle = 0.0f; // dummy, unused in vertex shader (could possibly be left uninitialized, but probably bad idea)
+					} // if using light angles
+				}
+			}
+		}
+	}
+
+	// copy the temporary batches to the master batch list (this could be avoided but it makes the code cleaner)
+	bdata.batches.copy_from(bdata.batches_temp);
+}
+
+// Adds a single rect command to the batching buffers as one quad (4 small pos + UV vertices),
+// opening a new batch when the batch type, texture combination or color changes.
+//
+// SEND_LIGHT_ANGLES - when true, also stores an encoded light angle for each of the 4 verts in
+//                     bdata.light_angles and switches on bdata.use_light_angles.
+//
+// rect                    - the rect command to add.
+// r_fill_state            - running fill state (current batch, texture id, transform etc).
+// r_command_start         - set to command_num when the vertex buffer is full, so the caller
+//                           can resume from this command after flushing.
+// command_num             - index of this command within commands.
+// command_count           - total number of commands.
+// commands                - the command list (used to peek at the next command).
+// p_item                  - the item the command belongs to.
+// multiply_final_modulate - if true the rect color is multiplied by r_fill_state.final_modulate.
+//
+// return true if buffer full up, else return false
+template <bool SEND_LIGHT_ANGLES>
+bool RasterizerCanvasGLES2::prefill_rect(Item::CommandRect *rect, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, Item::Command *const *commands, Item *p_item, bool multiply_final_modulate) {
+	bool change_batch = false;
+
+	// conditions for creating a new batch
+	if (r_fill_state.curr_batch->type != Batch::BT_RECT) {
+		change_batch = true;
+
+		// check for special case if there is only a single or small number of rects,
+		// in which case we will use the legacy default rect renderer
+		// because it is faster for single rects
+
+		// we only want to do this if not a joined item with more than 1 item,
+		// because joined items with more than 1, the command * will be incorrect
+		// NOTE - this is assuming that use_hardware_transform means that it is a non-joined item!!
+		// If that assumption is incorrect this will go horribly wrong.
+		if (bdata.settings_use_single_rect_fallback && r_fill_state.use_hardware_transform) {
+			bool is_single_rect = false;
+			int command_num_next = command_num + 1;
+			if (command_num_next < command_count) {
+				Item::Command *command_next = commands[command_num_next];
+				if ((command_next->type != Item::Command::TYPE_RECT) && (command_next->type != Item::Command::TYPE_TRANSFORM)) {
+					is_single_rect = true;
+				}
+			} else {
+				is_single_rect = true;
+			}
+			// if it is a rect on its own, do exactly the same as the default routine
+			if (is_single_rect) {
+				_prefill_default_batch(r_fill_state, command_num, *p_item);
+				return false;
+			}
+		} // if use hardware transform
+	}
+
+	Color col = rect->modulate;
+	if (multiply_final_modulate) {
+		col *= r_fill_state.final_modulate;
+	}
+
+	// instead of doing all the texture preparation for EVERY rect,
+	// we build a list of texture combinations and do this once off.
+	// This means we have a potentially rather slow step to identify which texture combo
+	// using the RIDs.
+	int old_batch_tex_id = r_fill_state.batch_tex_id;
+	r_fill_state.batch_tex_id = _batch_find_or_create_tex(rect->texture, rect->normal_map, rect->flags & CANVAS_RECT_TILE, old_batch_tex_id);
+
+	//r_fill_state.use_light_angles = send_light_angles;
+	if (SEND_LIGHT_ANGLES)
+		bdata.use_light_angles = true;
+
+	// try to create vertices BEFORE creating a batch,
+	// because if the vertex buffer is full, we need to finish this
+	// function, draw what we have so far, and then start a new set of batches
+
+	// request FOUR vertices at a time, this is more efficient
+	BatchVertex *bvs = bdata.vertices.request(4);
+	if (!bvs) {
+		// run out of space in the vertex buffer .. finish this function and draw what we have so far
+		// return where we got to
+		r_command_start = command_num;
+		return true;
+	}
+
+	// conditions for creating a new batch
+	if (old_batch_tex_id != r_fill_state.batch_tex_id) {
+		change_batch = true;
+	}
+
+	// we need to treat color change separately because we need to count these
+	// to decide whether to switch on the fly to colored vertices.
+	if (!r_fill_state.curr_batch->color.equals(col)) {
+		change_batch = true;
+		bdata.total_color_changes++;
+	}
+
+	if (change_batch) {
+		// put the tex pixel size  in a local (less verbose and can be a register)
+		// N.B. it is written into the fill state, so it is preserved between items
+		const BatchTex &batchtex = bdata.batch_textures[r_fill_state.batch_tex_id];
+		batchtex.tex_pixel_size.to(r_fill_state.texpixel_size);
+
+		if (bdata.settings_uv_contract) {
+			r_fill_state.contract_uvs = (batchtex.flags & VS::TEXTURE_FLAG_FILTER) == 0;
+		}
+
+		// open new batch (this should never fail, it dynamically grows)
+		r_fill_state.curr_batch = _batch_request_new(false);
+
+		r_fill_state.curr_batch->type = Batch::BT_RECT;
+		r_fill_state.curr_batch->color.set(col);
+		r_fill_state.curr_batch->batch_texture_id = r_fill_state.batch_tex_id;
+		r_fill_state.curr_batch->first_command = command_num;
+		r_fill_state.curr_batch->num_commands = 1;
+		r_fill_state.curr_batch->first_quad = bdata.total_quads;
+	} else {
+		// we could alternatively do the count when closing a batch .. perhaps more efficient
+		r_fill_state.curr_batch->num_commands++;
+	}
+
+	// fill the quad geometry
+	Vector2 mins = rect->rect.position;
+
+	if (r_fill_state.transform_mode == TM_TRANSLATE) {
+		_software_transform_vertex(mins, r_fill_state.transform_combined);
+	}
+
+	Vector2 maxs = mins + rect->rect.size;
+
+	// just aliases
+	BatchVertex *bA = &bvs[0];
+	BatchVertex *bB = &bvs[1];
+	BatchVertex *bC = &bvs[2];
+	BatchVertex *bD = &bvs[3];
+
+	bA->pos.x = mins.x;
+	bA->pos.y = mins.y;
+
+	bB->pos.x = maxs.x;
+	bB->pos.y = mins.y;
+
+	bC->pos.x = maxs.x;
+	bC->pos.y = maxs.y;
+
+	bD->pos.x = mins.x;
+	bD->pos.y = maxs.y;
+
+	// possibility of applying flips here for normal mapping .. but they don't seem to be used
+	if (rect->rect.size.x < 0) {
+		SWAP(bA->pos, bB->pos);
+		SWAP(bC->pos, bD->pos);
+	}
+	if (rect->rect.size.y < 0) {
+		SWAP(bA->pos, bD->pos);
+		SWAP(bB->pos, bC->pos);
+	}
+
+	if (r_fill_state.transform_mode == TM_ALL) {
+		_software_transform_vertex(bA->pos, r_fill_state.transform_combined);
+		_software_transform_vertex(bB->pos, r_fill_state.transform_combined);
+		_software_transform_vertex(bC->pos, r_fill_state.transform_combined);
+		_software_transform_vertex(bD->pos, r_fill_state.transform_combined);
+	}
+
+	// uvs
+	Vector2 src_min;
+	Vector2 src_max;
+	if (rect->flags & CANVAS_RECT_REGION) {
+		src_min = rect->source.position;
+		src_max = src_min + rect->source.size;
+
+		src_min *= r_fill_state.texpixel_size;
+		src_max *= r_fill_state.texpixel_size;
+
+		const float uv_epsilon = bdata.settings_uv_contract_amount;
+
+		// nudge offset for the maximum to prevent precision error on GPU reading into line outside the source rect
+		// this is very difficult to get right.
+		if (r_fill_state.contract_uvs) {
+			src_min.x += uv_epsilon;
+			src_min.y += uv_epsilon;
+			src_max.x -= uv_epsilon;
+			src_max.y -= uv_epsilon;
+		}
+	} else {
+		src_min = Vector2(0, 0);
+		src_max = Vector2(1, 1);
+	}
+
+	// 10% faster calculating the max first
+	Vector2 uvs[4] = {
+		src_min,
+		Vector2(src_max.x, src_min.y),
+		src_max,
+		Vector2(src_min.x, src_max.y),
+	};
+
+	// for encoding in light angle
+	// flips should be optimized out when not being used for light angle.
+	bool flip_h = false;
+	bool flip_v = false;
+
+	if (rect->flags & CANVAS_RECT_TRANSPOSE) {
+		SWAP(uvs[1], uvs[3]);
+	}
+
+	if (rect->flags & CANVAS_RECT_FLIP_H) {
+		SWAP(uvs[0], uvs[1]);
+		SWAP(uvs[2], uvs[3]);
+		// a horizontal flip is encoded as a 180 degree rotation (flip_h, see below)
+		// combined with a vertical flip
+		flip_h = !flip_h;
+		flip_v = !flip_v;
+	}
+	if (rect->flags & CANVAS_RECT_FLIP_V) {
+		SWAP(uvs[0], uvs[3]);
+		SWAP(uvs[1], uvs[2]);
+		flip_v = !flip_v;
+	}
+
+	bA->uv.set(uvs[0]);
+	bB->uv.set(uvs[1]);
+	bC->uv.set(uvs[2]);
+	bD->uv.set(uvs[3]);
+
+	if (SEND_LIGHT_ANGLES) {
+		// we can either keep the light angles in sync with the verts when writing,
+		// or sync them up during translation. We are syncing in translation.
+		// N.B. There may be batches that don't require light_angles between batches that do.
+		float *angles = bdata.light_angles.request(4);
+#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED)
+		CRASH_COND(angles == nullptr);
+#endif
+
+		float angle = 0.0f;
+		const float TWO_PI = Math_PI * 2;
+
+		if (r_fill_state.transform_mode != TM_NONE) {
+
+			const Transform2D &tr = r_fill_state.transform_combined;
+
+			// apply to an x axis
+			// the x axis and y axis can be taken directly from the transform (no need to xform identity vectors)
+			// N.B. Transform2D stores each axis as elements[axis], so the x axis is elements[0]
+			// (reading elements[0][0], elements[1][0] would give the transpose, i.e. the negated rotation)
+			Vector2 x_axis(tr.elements[0][0], tr.elements[0][1]);
+
+			// have to do a y axis to check for scaling flips
+			// this is hassle and extra slowness. We could only allow flips via the flags.
+			Vector2 y_axis(tr.elements[1][0], tr.elements[1][1]);
+
+			// has the x / y axis flipped due to scaling?
+			float cross = x_axis.cross(y_axis);
+			if (cross < 0.0f) {
+				flip_v = !flip_v;
+			}
+
+			// passing an angle is smaller than a vector, it can be reconstructed in the shader
+			angle = x_axis.angle();
+
+			// we don't want negative angles, as negative is used to encode flips.
+			// This moves range from -PI to PI to 0 to TWO_PI
+			if (angle < 0.0f)
+				angle += TWO_PI;
+
+		} // if transform needed
+
+		// if horizontal flip, angle is shifted by 180 degrees
+		if (flip_h) {
+			angle += Math_PI;
+
+			// mod to get back to 0 to TWO_PI range
+			angle = fmodf(angle, TWO_PI);
+		}
+
+		// add 1 (to take care of zero floating point error with sign)
+		angle += 1.0f;
+
+		// flip if necessary to indicate a vertical flip in the shader
+		if (flip_v)
+			angle *= -1.0f;
+
+		// light angle must be sent for each vert, instead as a single uniform in the uniform draw method
+		// this has the benefit of enabling batching with light angles.
+		for (int n = 0; n < 4; n++) {
+			angles[n] = angle;
+		}
+	}
+
+	// increment quad count
+	bdata.total_quads++;
+
+	return false;
+}
+
 #endif // RASTERIZERCANVASGLES2_H
--- a/drivers/gles2/rasterizer_gles2.cpp
+++ b/drivers/gles2/rasterizer_gles2.cpp
@ -407,7 +407,7 @@ void RasterizerGLES2::blit_render_target_to_screen(RID p_render_target, const Re
 	RasterizerStorageGLES2::RenderTarget *rt = storage->render_target_owner.getornull(p_render_target);
 	ERR_FAIL_COND(!rt);

-	canvas->state.canvas_shader.set_conditional(CanvasShaderGLES2::USE_TEXTURE_RECT, true);
+	canvas->_set_texture_rect_mode(true);

 	canvas->state.canvas_shader.set_custom_shader(0);
 	canvas->state.canvas_shader.bind();
--- a/drivers/gles2/shaders/canvas.glsl
+++ b/drivers/gles2/shaders/canvas.glsl
@ -18,6 +18,12 @@ uniform highp mat4 projection_matrix;
 uniform highp mat4 modelview_matrix;
 uniform highp mat4 extra_matrix;
 attribute highp vec2 vertex; // attrib:0
+
+#ifdef USE_LIGHT_ANGLE
+// shared with tangent, not used in canvas shader
+attribute highp float light_angle; // attrib:2
+#endif
+
 attribute vec4 color_attrib; // attrib:3
 attribute vec2 uv_attrib; // attrib:4

@ -219,12 +225,34 @@ VERTEX_SHADER_CODE
 	pos = outvec.xy;
 #endif

+#ifdef USE_LIGHT_ANGLE
+	// we add a fixed offset because we are using the sign later,
+	// and don't want floating point error around 0.0
+	float la = abs(light_angle) - 1.0;
+
+	// vector light angle
+	vec4 vla;
+	vla.xy = vec2(cos(la), sin(la));
+	vla.zw = vec2(-vla.y, vla.x);
+
+	// vertical flip encoded in the sign
+	vla.zw *= sign(light_angle);
+
+	// apply the transform matrix.
+	// The rotate will be encoded in the transform matrix for single rects,
+	// and just the flips in the light angle.
+	// For batching we will encode the rotation and the flips
+	// in the light angle, and can use the same shader.
+	local_rot.xy = normalize((modelview_matrix * (extra_matrix_instance * vec4(vla.xy, 0.0, 0.0))).xy);
+	local_rot.zw = normalize((modelview_matrix * (extra_matrix_instance * vec4(vla.zw, 0.0, 0.0))).xy);
+#else
 	local_rot.xy = normalize((modelview_matrix * (extra_matrix_instance * vec4(1.0, 0.0, 0.0, 0.0))).xy);
 	local_rot.zw = normalize((modelview_matrix * (extra_matrix_instance * vec4(0.0, 1.0, 0.0, 0.0))).xy);
 #ifdef USE_TEXTURE_RECT
 	local_rot.xy *= sign(src_rect.z);
 	local_rot.zw *= sign(src_rect.w);
 #endif
+#endif // not using light angle

 #endif
 }