diff --git a/misc/dist/docker/README.md b/misc/dist/docker/README.md index 7f10b46ad8..71aac8a77c 100644 --- a/misc/dist/docker/README.md +++ b/misc/dist/docker/README.md @@ -1,40 +1,40 @@ -## A Docker image to build Linux, Windows and Android godot binaries. - -The main reason to write this, is to provide a simple way in all platforms to integrate external godot modules and build a custom version of godot. - -## usage -1. Install docker on Linux or docker toolbox on Windows or Mac. -2. Open a terminal on linux or "Docker Quickstart Terminal" on Windows or Mac. -3. Run command: - - Linux: `cd` - - Windows: `cd /c/Users/YOUR_USERNAME` - - Mac: `cd /Users/YOUR_USERNAME` -4. Get godot source code: `git clone https://github.com/godotengine/godot.git` -5. Run command: `cd godot/tools/docker` -6. Run command: `docker build -t godot .`(In Linux run Docker commands with `sudo` or add your user to docker group before run the Docker commands). The godot docker image will be build after a while. -7. Run command: - - Linux: `docker run -it --name=godot-dev -v /home/YOUR_USERNAME/godot:/godot-dev/godot godot` - - Windows: `docker run -it --name=godot-dev -v /c/Users/YOUR_USERNAME/godot:/godot-dev/godot godot` - - Mac: `docker run -it --name=godot-dev -v /Users/YOUR_USERNAME/godot:/godot-dev/godot godot` - You are in the godot-dev container and /godot-dev directory now. -8. Run `./install-android-tools` to download and install all android development tools. -9. Run command: `source ~/.bashrc` -10. Run command: `cd godot` -11. Run command: `scons p=android target=release` to test everything is ok. You can set platform to x11, windows, android, haiku and server. - -After use and exit, you can use this environment again by open terminal and type commands: `docker start godot-dev && docker attach godot-dev`. - -### Windows and Mac stuffs: - -- Speed up compilation: - - Exit from container. - - Run command: `docker-machine stop` - - Open "Oracle VM VirtualBox". 
- - In settings of default VM increase CPU cores and RAM to suitable values. - - Run command: `docker-machine start` - - Run command: `docker start godot-dev && docker attach godot-dev` - -- ssh to VM(can be useful sometimes): - - `docker-machine ssh` - -Check docker and boot2docker projects for more details. +## A Docker image to build Linux, Windows and Android godot binaries. + +The main reason to write this, is to provide a simple way in all platforms to integrate external godot modules and build a custom version of godot. + +## usage +1. Install docker on Linux or docker toolbox on Windows or Mac. +2. Open a terminal on linux or "Docker Quickstart Terminal" on Windows or Mac. +3. Run command: + - Linux: `cd` + - Windows: `cd /c/Users/YOUR_USERNAME` + - Mac: `cd /Users/YOUR_USERNAME` +4. Get godot source code: `git clone https://github.com/godotengine/godot.git` +5. Run command: `cd godot/tools/docker` +6. Run command: `docker build -t godot .`(In Linux run Docker commands with `sudo` or add your user to docker group before run the Docker commands). The godot docker image will be build after a while. +7. Run command: + - Linux: `docker run -it --name=godot-dev -v /home/YOUR_USERNAME/godot:/godot-dev/godot godot` + - Windows: `docker run -it --name=godot-dev -v /c/Users/YOUR_USERNAME/godot:/godot-dev/godot godot` + - Mac: `docker run -it --name=godot-dev -v /Users/YOUR_USERNAME/godot:/godot-dev/godot godot` + You are in the godot-dev container and /godot-dev directory now. +8. Run `./install-android-tools` to download and install all android development tools. +9. Run command: `source ~/.bashrc` +10. Run command: `cd godot` +11. Run command: `scons p=android target=release` to test everything is ok. You can set platform to x11, windows, android, haiku and server. + +After use and exit, you can use this environment again by open terminal and type commands: `docker start godot-dev && docker attach godot-dev`. 
+ +### Windows and Mac stuffs: + +- Speed up compilation: + - Exit from container. + - Run command: `docker-machine stop` + - Open "Oracle VM VirtualBox". + - In settings of default VM increase CPU cores and RAM to suitable values. + - Run command: `docker-machine start` + - Run command: `docker start godot-dev && docker attach godot-dev` + +- ssh to VM(can be useful sometimes): + - `docker-machine ssh` + +Check docker and boot2docker projects for more details. diff --git a/misc/dist/uwp_template/AppxManifest.xml b/misc/dist/uwp_template/AppxManifest.xml index d5e653708c..cf26387f22 100644 --- a/misc/dist/uwp_template/AppxManifest.xml +++ b/misc/dist/uwp_template/AppxManifest.xml @@ -1,32 +1,32 @@ - - - - - - $display_name$ - $publisher_display_name$ - Assets\StoreLogo.png - - - - - - - - - - - - - $name_on_tiles$ - - - $rotation_preference$ - - - - $capabilities_place$ - - - + + + + + + $display_name$ + $publisher_display_name$ + Assets\StoreLogo.png + + + + + + + + + + + + + $name_on_tiles$ + + + $rotation_preference$ + + + + $capabilities_place$ + + + \ No newline at end of file diff --git a/modules/mono/editor/GodotSharpTools/GodotSharpTools.sln b/modules/mono/editor/GodotSharpTools/GodotSharpTools.sln index 7eabcdff5d..5f7d0e8a39 100644 --- a/modules/mono/editor/GodotSharpTools/GodotSharpTools.sln +++ b/modules/mono/editor/GodotSharpTools/GodotSharpTools.sln @@ -1,17 +1,17 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 2012 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "GodotSharpTools", "GodotSharpTools.csproj", "{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Debug|Any 
CPU.Build.0 = Debug|Any CPU - {A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Release|Any CPU.ActiveCfg = Release|Any CPU - {A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection -EndGlobal + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "GodotSharpTools", "GodotSharpTools.csproj", "{A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Debug|Any CPU.Build.0 = Debug|Any CPU + {A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Release|Any CPU.ActiveCfg = Release|Any CPU + {A8CDAD94-C6D4-4B19-A7E7-76C53CC92984}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/modules/mono/glue/cs_files/Color.cs b/modules/mono/glue/cs_files/Color.cs index df88a46832..0a00f83d47 100644 --- a/modules/mono/glue/cs_files/Color.cs +++ b/modules/mono/glue/cs_files/Color.cs @@ -1,590 +1,590 @@ -using System; - -namespace Godot -{ - public struct Color : IEquatable - { - public float r; - public float g; - public float b; - public float a; - - public int r8 - { - get - { - return (int)(r * 255.0f); - } - } - - public int g8 - { - get - { - return (int)(g * 255.0f); - } - } - - public int b8 - { - get - { - return (int)(b * 255.0f); - } - } - - public int a8 - { - get - { - return (int)(a * 255.0f); - } - } - - public float h - { - get - { - float max = Mathf.max(r, Mathf.max(g, b)); - float min = Mathf.min(r, Mathf.min(g, b)); - - float delta = max - min; - - if (delta == 0) - return 0; - - float h; - - if (r == max) - h = (g - b) / delta; // Between yellow & magenta - else if (g == max) - h = 2 + (b - r) / delta; 
// Between cyan & yellow - else - h = 4 + (r - g) / delta; // Between magenta & cyan - - h /= 6.0f; - - if (h < 0) - h += 1.0f; - - return h; - } - set - { - this = from_hsv(value, s, v); - } - } - - public float s - { - get - { - float max = Mathf.max(r, Mathf.max(g, b)); - float min = Mathf.min(r, Mathf.min(g, b)); - - float delta = max - min; - - return max != 0 ? delta / max : 0; - } - set - { - this = from_hsv(h, value, v); - } - } - - public float v - { - get - { - return Mathf.max(r, Mathf.max(g, b)); - } - set - { - this = from_hsv(h, s, value); - } - } - - private static readonly Color black = new Color(0f, 0f, 0f, 1.0f); - - public Color Black - { - get - { - return black; - } - } - - public float this [int index] - { - get - { - switch (index) - { - case 0: - return r; - case 1: - return g; - case 2: - return b; - case 3: - return a; - default: - throw new IndexOutOfRangeException(); - } - } - set - { - switch (index) - { - case 0: - r = value; - return; - case 1: - g = value; - return; - case 2: - b = value; - return; - case 3: - a = value; - return; - default: - throw new IndexOutOfRangeException(); - } - } - } - - public static void to_hsv(Color color, out float hue, out float saturation, out float value) - { - int max = Mathf.max(color.r8, Mathf.max(color.g8, color.b8)); - int min = Mathf.min(color.r8, Mathf.min(color.g8, color.b8)); - - float delta = max - min; - - if (delta == 0) - { - hue = 0; - } - else - { - if (color.r == max) - hue = (color.g - color.b) / delta; // Between yellow & magenta - else if (color.g == max) - hue = 2 + (color.b - color.r) / delta; // Between cyan & yellow - else - hue = 4 + (color.r - color.g) / delta; // Between magenta & cyan - - hue /= 6.0f; - - if (hue < 0) - hue += 1.0f; - } - - saturation = (max == 0) ? 
0 : 1f - (1f * min / max); - value = max / 255f; - } - - public static Color from_hsv(float hue, float saturation, float value, float alpha = 1.0f) - { - if (saturation == 0) - { - // acp_hromatic (grey) - return new Color(value, value, value, alpha); - } - - int i; - float f, p, q, t; - - hue *= 6.0f; - hue %= 6f; - i = (int)hue; - - f = hue - i; - p = value * (1 - saturation); - q = value * (1 - saturation * f); - t = value * (1 - saturation * (1 - f)); - - switch (i) - { - case 0: // Red is the dominant color - return new Color(value, t, p, alpha); - case 1: // Green is the dominant color - return new Color(q, value, p, alpha); - case 2: - return new Color(p, value, t, alpha); - case 3: // Blue is the dominant color - return new Color(p, q, value, alpha); - case 4: - return new Color(t, p, value, alpha); - default: // (5) Red is the dominant color - return new Color(value, p, q, alpha); - } - } - - public Color blend(Color over) - { - Color res; - - float sa = 1.0f - over.a; - res.a = a * sa + over.a; - - if (res.a == 0) - { - return new Color(0, 0, 0, 0); - } - else - { - res.r = (r * a * sa + over.r * over.a) / res.a; - res.g = (g * a * sa + over.g * over.a) / res.a; - res.b = (b * a * sa + over.b * over.a) / res.a; - } - - return res; - } - - public Color contrasted() - { - return new Color( - (r + 0.5f) % 1.0f, - (g + 0.5f) % 1.0f, - (b + 0.5f) % 1.0f - ); - } - - public float gray() - { - return (r + g + b) / 3.0f; - } - - public Color inverted() - { - return new Color( - 1.0f - r, - 1.0f - g, - 1.0f - b - ); - } - - public Color linear_interpolate(Color b, float t) - { - Color res = this; - - res.r += (t * (b.r - this.r)); - res.g += (t * (b.g - this.g)); - res.b += (t * (b.b - this.b)); - res.a += (t * (b.a - this.a)); - - return res; - } - - public int to_32() - { - int c = (byte)(a * 255); - c <<= 8; - c |= (byte)(r * 255); - c <<= 8; - c |= (byte)(g * 255); - c <<= 8; - c |= (byte)(b * 255); - - return c; - } - - public int to_ARGB32() - { - int c = 
(byte)(a * 255); - c <<= 8; - c |= (byte)(r * 255); - c <<= 8; - c |= (byte)(g * 255); - c <<= 8; - c |= (byte)(b * 255); - - return c; - } - - public string to_html(bool include_alpha = true) - { - String txt = string.Empty; - - txt += _to_hex(r); - txt += _to_hex(g); - txt += _to_hex(b); - - if (include_alpha) - txt = _to_hex(a) + txt; - - return txt; - } - - public Color(float r, float g, float b, float a = 1.0f) - { - this.r = r; - this.g = g; - this.b = b; - this.a = a; - } - - public Color(int rgba) - { - this.a = (rgba & 0xFF) / 255.0f; - rgba >>= 8; - this.b = (rgba & 0xFF) / 255.0f; - rgba >>= 8; - this.g = (rgba & 0xFF) / 255.0f; - rgba >>= 8; - this.r = (rgba & 0xFF) / 255.0f; - } - - private static float _parse_col(string str, int ofs) - { - int ig = 0; - - for (int i = 0; i < 2; i++) - { - int c = str[i + ofs]; - int v = 0; - - if (c >= '0' && c <= '9') - { - v = c - '0'; - } - else if (c >= 'a' && c <= 'f') - { - v = c - 'a'; - v += 10; - } - else if (c >= 'A' && c <= 'F') - { - v = c - 'A'; - v += 10; - } - else - { - return -1; - } - - if (i == 0) - ig += v * 16; - else - ig += v; - } - - return ig; - } - - private String _to_hex(float val) - { - int v = (int)Mathf.clamp(val * 255.0f, 0, 255); - - string ret = string.Empty; - - for (int i = 0; i < 2; i++) - { - char[] c = { (char)0, (char)0 }; - int lv = v & 0xF; - - if (lv < 10) - c[0] = (char)('0' + lv); - else - c[0] = (char)('a' + lv - 10); - - v >>= 4; - ret = c + ret; - } - - return ret; - } - - internal static bool html_is_valid(string color) - { - if (color.Length == 0) - return false; - - if (color[0] == '#') - color = color.Substring(1, color.Length - 1); - - bool alpha = false; - - if (color.Length == 8) - alpha = true; - else if (color.Length == 6) - alpha = false; - else - return false; - - if (alpha) - { - if ((int)_parse_col(color, 0) < 0) - return false; - } - - int from = alpha ? 
2 : 0; - - if ((int)_parse_col(color, from + 0) < 0) - return false; - if ((int)_parse_col(color, from + 2) < 0) - return false; - if ((int)_parse_col(color, from + 4) < 0) - return false; - - return true; - } - - public static Color Color8(byte r8, byte g8, byte b8, byte a8) - { - return new Color((float)r8 / 255f, (float)g8 / 255f, (float)b8 / 255f, (float)a8 / 255f); - } - - public Color(string rgba) - { - if (rgba.Length == 0) - { - r = 0f; - g = 0f; - b = 0f; - a = 1.0f; - return; - } - - if (rgba[0] == '#') - rgba = rgba.Substring(1); - - bool alpha = false; - - if (rgba.Length == 8) - { - alpha = true; - } - else if (rgba.Length == 6) - { - alpha = false; - } - else - { - throw new ArgumentOutOfRangeException("Invalid color code. Length is " + rgba.Length + " but a length of 6 or 8 is expected: " + rgba); - } - - if (alpha) - { - a = _parse_col(rgba, 0); - - if (a < 0) - throw new ArgumentOutOfRangeException("Invalid color code. Alpha is " + a + " but zero or greater is expected: " + rgba); - } - else - { - a = 1.0f; - } - - int from = alpha ? 2 : 0; - - r = _parse_col(rgba, from + 0); - - if (r < 0) - throw new ArgumentOutOfRangeException("Invalid color code. Red is " + r + " but zero or greater is expected: " + rgba); - - g = _parse_col(rgba, from + 2); - - if (g < 0) - throw new ArgumentOutOfRangeException("Invalid color code. Green is " + g + " but zero or greater is expected: " + rgba); - - b = _parse_col(rgba, from + 4); - - if (b < 0) - throw new ArgumentOutOfRangeException("Invalid color code. 
Blue is " + b + " but zero or greater is expected: " + rgba); - } - - public static bool operator ==(Color left, Color right) - { - return left.Equals(right); - } - - public static bool operator !=(Color left, Color right) - { - return !left.Equals(right); - } - - public static bool operator <(Color left, Color right) - { - if (left.r == right.r) - { - if (left.g == right.g) - { - if (left.b == right.b) - return (left.a < right.a); - else - return (left.b < right.b); - } - else - { - return left.g < right.g; - } - } - - return left.r < right.r; - } - - public static bool operator >(Color left, Color right) - { - if (left.r == right.r) - { - if (left.g == right.g) - { - if (left.b == right.b) - return (left.a > right.a); - else - return (left.b > right.b); - } - else - { - return left.g > right.g; - } - } - - return left.r > right.r; - } - - public override bool Equals(object obj) - { - if (obj is Color) - { - return Equals((Color)obj); - } - - return false; - } - - public bool Equals(Color other) - { - return r == other.r && g == other.g && b == other.b && a == other.a; - } - - public override int GetHashCode() - { - return r.GetHashCode() ^ g.GetHashCode() ^ b.GetHashCode() ^ a.GetHashCode(); - } - - public override string ToString() - { - return String.Format("{0},{1},{2},{3}", new object[] - { - this.r.ToString(), - this.g.ToString(), - this.b.ToString(), - this.a.ToString() - }); - } - - public string ToString(string format) - { - return String.Format("{0},{1},{2},{3}", new object[] - { - this.r.ToString(format), - this.g.ToString(format), - this.b.ToString(format), - this.a.ToString(format) - }); - } - } -} +using System; + +namespace Godot +{ + public struct Color : IEquatable + { + public float r; + public float g; + public float b; + public float a; + + public int r8 + { + get + { + return (int)(r * 255.0f); + } + } + + public int g8 + { + get + { + return (int)(g * 255.0f); + } + } + + public int b8 + { + get + { + return (int)(b * 255.0f); + } + } + + 
public int a8 + { + get + { + return (int)(a * 255.0f); + } + } + + public float h + { + get + { + float max = Mathf.max(r, Mathf.max(g, b)); + float min = Mathf.min(r, Mathf.min(g, b)); + + float delta = max - min; + + if (delta == 0) + return 0; + + float h; + + if (r == max) + h = (g - b) / delta; // Between yellow & magenta + else if (g == max) + h = 2 + (b - r) / delta; // Between cyan & yellow + else + h = 4 + (r - g) / delta; // Between magenta & cyan + + h /= 6.0f; + + if (h < 0) + h += 1.0f; + + return h; + } + set + { + this = from_hsv(value, s, v); + } + } + + public float s + { + get + { + float max = Mathf.max(r, Mathf.max(g, b)); + float min = Mathf.min(r, Mathf.min(g, b)); + + float delta = max - min; + + return max != 0 ? delta / max : 0; + } + set + { + this = from_hsv(h, value, v); + } + } + + public float v + { + get + { + return Mathf.max(r, Mathf.max(g, b)); + } + set + { + this = from_hsv(h, s, value); + } + } + + private static readonly Color black = new Color(0f, 0f, 0f, 1.0f); + + public Color Black + { + get + { + return black; + } + } + + public float this [int index] + { + get + { + switch (index) + { + case 0: + return r; + case 1: + return g; + case 2: + return b; + case 3: + return a; + default: + throw new IndexOutOfRangeException(); + } + } + set + { + switch (index) + { + case 0: + r = value; + return; + case 1: + g = value; + return; + case 2: + b = value; + return; + case 3: + a = value; + return; + default: + throw new IndexOutOfRangeException(); + } + } + } + + public static void to_hsv(Color color, out float hue, out float saturation, out float value) + { + int max = Mathf.max(color.r8, Mathf.max(color.g8, color.b8)); + int min = Mathf.min(color.r8, Mathf.min(color.g8, color.b8)); + + float delta = max - min; + + if (delta == 0) + { + hue = 0; + } + else + { + if (color.r == max) + hue = (color.g - color.b) / delta; // Between yellow & magenta + else if (color.g == max) + hue = 2 + (color.b - color.r) / delta; // Between cyan 
& yellow + else + hue = 4 + (color.r - color.g) / delta; // Between magenta & cyan + + hue /= 6.0f; + + if (hue < 0) + hue += 1.0f; + } + + saturation = (max == 0) ? 0 : 1f - (1f * min / max); + value = max / 255f; + } + + public static Color from_hsv(float hue, float saturation, float value, float alpha = 1.0f) + { + if (saturation == 0) + { + // acp_hromatic (grey) + return new Color(value, value, value, alpha); + } + + int i; + float f, p, q, t; + + hue *= 6.0f; + hue %= 6f; + i = (int)hue; + + f = hue - i; + p = value * (1 - saturation); + q = value * (1 - saturation * f); + t = value * (1 - saturation * (1 - f)); + + switch (i) + { + case 0: // Red is the dominant color + return new Color(value, t, p, alpha); + case 1: // Green is the dominant color + return new Color(q, value, p, alpha); + case 2: + return new Color(p, value, t, alpha); + case 3: // Blue is the dominant color + return new Color(p, q, value, alpha); + case 4: + return new Color(t, p, value, alpha); + default: // (5) Red is the dominant color + return new Color(value, p, q, alpha); + } + } + + public Color blend(Color over) + { + Color res; + + float sa = 1.0f - over.a; + res.a = a * sa + over.a; + + if (res.a == 0) + { + return new Color(0, 0, 0, 0); + } + else + { + res.r = (r * a * sa + over.r * over.a) / res.a; + res.g = (g * a * sa + over.g * over.a) / res.a; + res.b = (b * a * sa + over.b * over.a) / res.a; + } + + return res; + } + + public Color contrasted() + { + return new Color( + (r + 0.5f) % 1.0f, + (g + 0.5f) % 1.0f, + (b + 0.5f) % 1.0f + ); + } + + public float gray() + { + return (r + g + b) / 3.0f; + } + + public Color inverted() + { + return new Color( + 1.0f - r, + 1.0f - g, + 1.0f - b + ); + } + + public Color linear_interpolate(Color b, float t) + { + Color res = this; + + res.r += (t * (b.r - this.r)); + res.g += (t * (b.g - this.g)); + res.b += (t * (b.b - this.b)); + res.a += (t * (b.a - this.a)); + + return res; + } + + public int to_32() + { + int c = (byte)(a * 255); 
+ c <<= 8; + c |= (byte)(r * 255); + c <<= 8; + c |= (byte)(g * 255); + c <<= 8; + c |= (byte)(b * 255); + + return c; + } + + public int to_ARGB32() + { + int c = (byte)(a * 255); + c <<= 8; + c |= (byte)(r * 255); + c <<= 8; + c |= (byte)(g * 255); + c <<= 8; + c |= (byte)(b * 255); + + return c; + } + + public string to_html(bool include_alpha = true) + { + String txt = string.Empty; + + txt += _to_hex(r); + txt += _to_hex(g); + txt += _to_hex(b); + + if (include_alpha) + txt = _to_hex(a) + txt; + + return txt; + } + + public Color(float r, float g, float b, float a = 1.0f) + { + this.r = r; + this.g = g; + this.b = b; + this.a = a; + } + + public Color(int rgba) + { + this.a = (rgba & 0xFF) / 255.0f; + rgba >>= 8; + this.b = (rgba & 0xFF) / 255.0f; + rgba >>= 8; + this.g = (rgba & 0xFF) / 255.0f; + rgba >>= 8; + this.r = (rgba & 0xFF) / 255.0f; + } + + private static float _parse_col(string str, int ofs) + { + int ig = 0; + + for (int i = 0; i < 2; i++) + { + int c = str[i + ofs]; + int v = 0; + + if (c >= '0' && c <= '9') + { + v = c - '0'; + } + else if (c >= 'a' && c <= 'f') + { + v = c - 'a'; + v += 10; + } + else if (c >= 'A' && c <= 'F') + { + v = c - 'A'; + v += 10; + } + else + { + return -1; + } + + if (i == 0) + ig += v * 16; + else + ig += v; + } + + return ig; + } + + private String _to_hex(float val) + { + int v = (int)Mathf.clamp(val * 255.0f, 0, 255); + + string ret = string.Empty; + + for (int i = 0; i < 2; i++) + { + char[] c = { (char)0, (char)0 }; + int lv = v & 0xF; + + if (lv < 10) + c[0] = (char)('0' + lv); + else + c[0] = (char)('a' + lv - 10); + + v >>= 4; + ret = c + ret; + } + + return ret; + } + + internal static bool html_is_valid(string color) + { + if (color.Length == 0) + return false; + + if (color[0] == '#') + color = color.Substring(1, color.Length - 1); + + bool alpha = false; + + if (color.Length == 8) + alpha = true; + else if (color.Length == 6) + alpha = false; + else + return false; + + if (alpha) + { + if 
((int)_parse_col(color, 0) < 0) + return false; + } + + int from = alpha ? 2 : 0; + + if ((int)_parse_col(color, from + 0) < 0) + return false; + if ((int)_parse_col(color, from + 2) < 0) + return false; + if ((int)_parse_col(color, from + 4) < 0) + return false; + + return true; + } + + public static Color Color8(byte r8, byte g8, byte b8, byte a8) + { + return new Color((float)r8 / 255f, (float)g8 / 255f, (float)b8 / 255f, (float)a8 / 255f); + } + + public Color(string rgba) + { + if (rgba.Length == 0) + { + r = 0f; + g = 0f; + b = 0f; + a = 1.0f; + return; + } + + if (rgba[0] == '#') + rgba = rgba.Substring(1); + + bool alpha = false; + + if (rgba.Length == 8) + { + alpha = true; + } + else if (rgba.Length == 6) + { + alpha = false; + } + else + { + throw new ArgumentOutOfRangeException("Invalid color code. Length is " + rgba.Length + " but a length of 6 or 8 is expected: " + rgba); + } + + if (alpha) + { + a = _parse_col(rgba, 0); + + if (a < 0) + throw new ArgumentOutOfRangeException("Invalid color code. Alpha is " + a + " but zero or greater is expected: " + rgba); + } + else + { + a = 1.0f; + } + + int from = alpha ? 2 : 0; + + r = _parse_col(rgba, from + 0); + + if (r < 0) + throw new ArgumentOutOfRangeException("Invalid color code. Red is " + r + " but zero or greater is expected: " + rgba); + + g = _parse_col(rgba, from + 2); + + if (g < 0) + throw new ArgumentOutOfRangeException("Invalid color code. Green is " + g + " but zero or greater is expected: " + rgba); + + b = _parse_col(rgba, from + 4); + + if (b < 0) + throw new ArgumentOutOfRangeException("Invalid color code. 
Blue is " + b + " but zero or greater is expected: " + rgba); + } + + public static bool operator ==(Color left, Color right) + { + return left.Equals(right); + } + + public static bool operator !=(Color left, Color right) + { + return !left.Equals(right); + } + + public static bool operator <(Color left, Color right) + { + if (left.r == right.r) + { + if (left.g == right.g) + { + if (left.b == right.b) + return (left.a < right.a); + else + return (left.b < right.b); + } + else + { + return left.g < right.g; + } + } + + return left.r < right.r; + } + + public static bool operator >(Color left, Color right) + { + if (left.r == right.r) + { + if (left.g == right.g) + { + if (left.b == right.b) + return (left.a > right.a); + else + return (left.b > right.b); + } + else + { + return left.g > right.g; + } + } + + return left.r > right.r; + } + + public override bool Equals(object obj) + { + if (obj is Color) + { + return Equals((Color)obj); + } + + return false; + } + + public bool Equals(Color other) + { + return r == other.r && g == other.g && b == other.b && a == other.a; + } + + public override int GetHashCode() + { + return r.GetHashCode() ^ g.GetHashCode() ^ b.GetHashCode() ^ a.GetHashCode(); + } + + public override string ToString() + { + return String.Format("{0},{1},{2},{3}", new object[] + { + this.r.ToString(), + this.g.ToString(), + this.b.ToString(), + this.a.ToString() + }); + } + + public string ToString(string format) + { + return String.Format("{0},{1},{2},{3}", new object[] + { + this.r.ToString(format), + this.g.ToString(format), + this.b.ToString(format), + this.a.ToString(format) + }); + } + } +} diff --git a/modules/mono/glue/cs_files/ExportAttribute.cs b/modules/mono/glue/cs_files/ExportAttribute.cs index a4e7d447dd..dce9cc59a0 100644 --- a/modules/mono/glue/cs_files/ExportAttribute.cs +++ b/modules/mono/glue/cs_files/ExportAttribute.cs @@ -1,4 +1,4 @@ -using System; +using System; namespace Godot { diff --git 
a/modules/mono/glue/cs_files/MarshalUtils.cs b/modules/mono/glue/cs_files/MarshalUtils.cs index 5d40111339..2bdfb95c51 100644 --- a/modules/mono/glue/cs_files/MarshalUtils.cs +++ b/modules/mono/glue/cs_files/MarshalUtils.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; namespace Godot diff --git a/modules/mono/glue/cs_files/Plane.cs b/modules/mono/glue/cs_files/Plane.cs index ada6e465ac..37f70aca1e 100644 --- a/modules/mono/glue/cs_files/Plane.cs +++ b/modules/mono/glue/cs_files/Plane.cs @@ -1,209 +1,209 @@ -using System; - -namespace Godot -{ - public struct Plane : IEquatable - { - Vector3 normal; - - public float x - { - get - { - return normal.x; - } - set - { - normal.x = value; - } - } - - public float y - { - get - { - return normal.y; - } - set - { - normal.y = value; - } - } - - public float z - { - get - { - return normal.z; - } - set - { - normal.z = value; - } - } - - float d; - - public Vector3 Center - { - get - { - return normal * d; - } - } - - public float distance_to(Vector3 point) - { - return normal.dot(point) - d; - } - - public Vector3 get_any_point() - { - return normal * d; - } - - public bool has_point(Vector3 point, float epsilon = Mathf.Epsilon) - { - float dist = normal.dot(point) - d; - return Mathf.abs(dist) <= epsilon; - } - - public Vector3 intersect_3(Plane b, Plane c) - { - float denom = normal.cross(b.normal).dot(c.normal); - - if (Mathf.abs(denom) <= Mathf.Epsilon) - return new Vector3(); - - Vector3 result = (b.normal.cross(c.normal) * this.d) + - (c.normal.cross(normal) * b.d) + - (normal.cross(b.normal) * c.d); - - return result / denom; - } - - public Vector3 intersect_ray(Vector3 from, Vector3 dir) - { - float den = normal.dot(dir); - - if (Mathf.abs(den) <= Mathf.Epsilon) - return new Vector3(); - - float dist = (normal.dot(from) - d) / den; - - // This is a ray, before the emiting pos (from) does not exist - if (dist > Mathf.Epsilon) - return new Vector3(); - - return from + dir * -dist; - 
} - - public Vector3 intersect_segment(Vector3 begin, Vector3 end) - { - Vector3 segment = begin - end; - float den = normal.dot(segment); - - if (Mathf.abs(den) <= Mathf.Epsilon) - return new Vector3(); - - float dist = (normal.dot(begin) - d) / den; - - if (dist < -Mathf.Epsilon || dist > (1.0f + Mathf.Epsilon)) - return new Vector3(); - - return begin + segment * -dist; - } - - public bool is_point_over(Vector3 point) - { - return normal.dot(point) > d; - } - - public Plane normalized() - { - float len = normal.length(); - - if (len == 0) - return new Plane(0, 0, 0, 0); - - return new Plane(normal / len, d / len); - } - - public Vector3 project(Vector3 point) - { - return point - normal * distance_to(point); - } - - public Plane(float a, float b, float c, float d) - { - normal = new Vector3(a, b, c); - this.d = d; - } - - public Plane(Vector3 normal, float d) - { - this.normal = normal; - this.d = d; - } - - public Plane(Vector3 v1, Vector3 v2, Vector3 v3) - { - normal = (v1 - v3).cross(v1 - v2); - normal.normalize(); - d = normal.dot(v1); - } - - public static Plane operator -(Plane plane) - { - return new Plane(-plane.normal, -plane.d); - } - - public static bool operator ==(Plane left, Plane right) - { - return left.Equals(right); - } - - public static bool operator !=(Plane left, Plane right) - { - return !left.Equals(right); - } - - public override bool Equals(object obj) - { - if (obj is Plane) - { - return Equals((Plane)obj); - } - - return false; - } - - public bool Equals(Plane other) - { - return normal == other.normal && d == other.d; - } - - public override int GetHashCode() - { - return normal.GetHashCode() ^ d.GetHashCode(); - } - - public override string ToString() - { - return String.Format("({0}, {1})", new object[] - { - this.normal.ToString(), - this.d.ToString() - }); - } - - public string ToString(string format) - { - return String.Format("({0}, {1})", new object[] - { - this.normal.ToString(format), - this.d.ToString(format) - }); - } - } 
-} +using System; + +namespace Godot +{ + public struct Plane : IEquatable + { + Vector3 normal; + + public float x + { + get + { + return normal.x; + } + set + { + normal.x = value; + } + } + + public float y + { + get + { + return normal.y; + } + set + { + normal.y = value; + } + } + + public float z + { + get + { + return normal.z; + } + set + { + normal.z = value; + } + } + + float d; + + public Vector3 Center + { + get + { + return normal * d; + } + } + + public float distance_to(Vector3 point) + { + return normal.dot(point) - d; + } + + public Vector3 get_any_point() + { + return normal * d; + } + + public bool has_point(Vector3 point, float epsilon = Mathf.Epsilon) + { + float dist = normal.dot(point) - d; + return Mathf.abs(dist) <= epsilon; + } + + public Vector3 intersect_3(Plane b, Plane c) + { + float denom = normal.cross(b.normal).dot(c.normal); + + if (Mathf.abs(denom) <= Mathf.Epsilon) + return new Vector3(); + + Vector3 result = (b.normal.cross(c.normal) * this.d) + + (c.normal.cross(normal) * b.d) + + (normal.cross(b.normal) * c.d); + + return result / denom; + } + + public Vector3 intersect_ray(Vector3 from, Vector3 dir) + { + float den = normal.dot(dir); + + if (Mathf.abs(den) <= Mathf.Epsilon) + return new Vector3(); + + float dist = (normal.dot(from) - d) / den; + + // This is a ray, before the emiting pos (from) does not exist + if (dist > Mathf.Epsilon) + return new Vector3(); + + return from + dir * -dist; + } + + public Vector3 intersect_segment(Vector3 begin, Vector3 end) + { + Vector3 segment = begin - end; + float den = normal.dot(segment); + + if (Mathf.abs(den) <= Mathf.Epsilon) + return new Vector3(); + + float dist = (normal.dot(begin) - d) / den; + + if (dist < -Mathf.Epsilon || dist > (1.0f + Mathf.Epsilon)) + return new Vector3(); + + return begin + segment * -dist; + } + + public bool is_point_over(Vector3 point) + { + return normal.dot(point) > d; + } + + public Plane normalized() + { + float len = normal.length(); + + if (len 
== 0) + return new Plane(0, 0, 0, 0); + + return new Plane(normal / len, d / len); + } + + public Vector3 project(Vector3 point) + { + return point - normal * distance_to(point); + } + + public Plane(float a, float b, float c, float d) + { + normal = new Vector3(a, b, c); + this.d = d; + } + + public Plane(Vector3 normal, float d) + { + this.normal = normal; + this.d = d; + } + + public Plane(Vector3 v1, Vector3 v2, Vector3 v3) + { + normal = (v1 - v3).cross(v1 - v2); + normal.normalize(); + d = normal.dot(v1); + } + + public static Plane operator -(Plane plane) + { + return new Plane(-plane.normal, -plane.d); + } + + public static bool operator ==(Plane left, Plane right) + { + return left.Equals(right); + } + + public static bool operator !=(Plane left, Plane right) + { + return !left.Equals(right); + } + + public override bool Equals(object obj) + { + if (obj is Plane) + { + return Equals((Plane)obj); + } + + return false; + } + + public bool Equals(Plane other) + { + return normal == other.normal && d == other.d; + } + + public override int GetHashCode() + { + return normal.GetHashCode() ^ d.GetHashCode(); + } + + public override string ToString() + { + return String.Format("({0}, {1})", new object[] + { + this.normal.ToString(), + this.d.ToString() + }); + } + + public string ToString(string format) + { + return String.Format("({0}, {1})", new object[] + { + this.normal.ToString(format), + this.d.ToString(format) + }); + } + } +} diff --git a/modules/mono/glue/cs_files/Rect3.cs b/modules/mono/glue/cs_files/Rect3.cs index 0d25de1ec6..617d33e7fd 100644 --- a/modules/mono/glue/cs_files/Rect3.cs +++ b/modules/mono/glue/cs_files/Rect3.cs @@ -1,477 +1,477 @@ -using System; - -// file: core/math/rect3.h -// commit: 7ad14e7a3e6f87ddc450f7e34621eb5200808451 -// file: core/math/rect3.cpp -// commit: bd282ff43f23fe845f29a3e25c8efc01bd65ffb0 -// file: core/variant_call.cpp -// commit: 5ad9be4c24e9d7dc5672fdc42cea896622fe5685 - -namespace Godot -{ - public struct Rect3 : 
IEquatable - { - private Vector3 position; - private Vector3 size; - - public Vector3 Position - { - get - { - return position; - } - } - - public Vector3 Size - { - get - { - return size; - } - } - - public Vector3 End - { - get - { - return position + size; - } - } - - public bool encloses(Rect3 with) - { - Vector3 src_min = position; - Vector3 src_max = position + size; - Vector3 dst_min = with.position; - Vector3 dst_max = with.position + with.size; - - return ((src_min.x <= dst_min.x) && - (src_max.x > dst_max.x) && - (src_min.y <= dst_min.y) && - (src_max.y > dst_max.y) && - (src_min.z <= dst_min.z) && - (src_max.z > dst_max.z)); - } - - public Rect3 expand(Vector3 to_point) - { - Vector3 begin = position; - Vector3 end = position + size; - - if (to_point.x < begin.x) - begin.x = to_point.x; - if (to_point.y < begin.y) - begin.y = to_point.y; - if (to_point.z < begin.z) - begin.z = to_point.z; - - if (to_point.x > end.x) - end.x = to_point.x; - if (to_point.y > end.y) - end.y = to_point.y; - if (to_point.z > end.z) - end.z = to_point.z; - - return new Rect3(begin, end - begin); - } - - public float get_area() - { - return size.x * size.y * size.z; - } - - public Vector3 get_endpoint(int idx) - { - switch (idx) - { - case 0: - return new Vector3(position.x, position.y, position.z); - case 1: - return new Vector3(position.x, position.y, position.z + size.z); - case 2: - return new Vector3(position.x, position.y + size.y, position.z); - case 3: - return new Vector3(position.x, position.y + size.y, position.z + size.z); - case 4: - return new Vector3(position.x + size.x, position.y, position.z); - case 5: - return new Vector3(position.x + size.x, position.y, position.z + size.z); - case 6: - return new Vector3(position.x + size.x, position.y + size.y, position.z); - case 7: - return new Vector3(position.x + size.x, position.y + size.y, position.z + size.z); - default: - throw new ArgumentOutOfRangeException(nameof(idx), String.Format("Index is {0}, but a value 
from 0 to 7 is expected.", idx)); - } - } - - public Vector3 get_longest_axis() - { - Vector3 axis = new Vector3(1f, 0f, 0f); - float max_size = size.x; - - if (size.y > max_size) - { - axis = new Vector3(0f, 1f, 0f); - max_size = size.y; - } - - if (size.z > max_size) - { - axis = new Vector3(0f, 0f, 1f); - max_size = size.z; - } - - return axis; - } - - public Vector3.Axis get_longest_axis_index() - { - Vector3.Axis axis = Vector3.Axis.X; - float max_size = size.x; - - if (size.y > max_size) - { - axis = Vector3.Axis.Y; - max_size = size.y; - } - - if (size.z > max_size) - { - axis = Vector3.Axis.Z; - max_size = size.z; - } - - return axis; - } - - public float get_longest_axis_size() - { - float max_size = size.x; - - if (size.y > max_size) - max_size = size.y; - - if (size.z > max_size) - max_size = size.z; - - return max_size; - } - - public Vector3 get_shortest_axis() - { - Vector3 axis = new Vector3(1f, 0f, 0f); - float max_size = size.x; - - if (size.y < max_size) - { - axis = new Vector3(0f, 1f, 0f); - max_size = size.y; - } - - if (size.z < max_size) - { - axis = new Vector3(0f, 0f, 1f); - max_size = size.z; - } - - return axis; - } - - public Vector3.Axis get_shortest_axis_index() - { - Vector3.Axis axis = Vector3.Axis.X; - float max_size = size.x; - - if (size.y < max_size) - { - axis = Vector3.Axis.Y; - max_size = size.y; - } - - if (size.z < max_size) - { - axis = Vector3.Axis.Z; - max_size = size.z; - } - - return axis; - } - - public float get_shortest_axis_size() - { - float max_size = size.x; - - if (size.y < max_size) - max_size = size.y; - - if (size.z < max_size) - max_size = size.z; - - return max_size; - } - - public Vector3 get_support(Vector3 dir) - { - Vector3 half_extents = size * 0.5f; - Vector3 ofs = position + half_extents; - - return ofs + new Vector3( - (dir.x > 0f) ? -half_extents.x : half_extents.x, - (dir.y > 0f) ? -half_extents.y : half_extents.y, - (dir.z > 0f) ? 
-half_extents.z : half_extents.z); - } - - public Rect3 grow(float by) - { - Rect3 res = this; - - res.position.x -= by; - res.position.y -= by; - res.position.z -= by; - res.size.x += 2.0f * by; - res.size.y += 2.0f * by; - res.size.z += 2.0f * by; - - return res; - } - - public bool has_no_area() - { - return size.x <= 0f || size.y <= 0f || size.z <= 0f; - } - - public bool has_no_surface() - { - return size.x <= 0f && size.y <= 0f && size.z <= 0f; - } - - public bool has_point(Vector3 point) - { - if (point.x < position.x) - return false; - if (point.y < position.y) - return false; - if (point.z < position.z) - return false; - if (point.x > position.x + size.x) - return false; - if (point.y > position.y + size.y) - return false; - if (point.z > position.z + size.z) - return false; - - return true; - } - - public Rect3 intersection(Rect3 with) - { - Vector3 src_min = position; - Vector3 src_max = position + size; - Vector3 dst_min = with.position; - Vector3 dst_max = with.position + with.size; - - Vector3 min, max; - - if (src_min.x > dst_max.x || src_max.x < dst_min.x) - { - return new Rect3(); - } - else - { - min.x = (src_min.x > dst_min.x) ? src_min.x : dst_min.x; - max.x = (src_max.x < dst_max.x) ? src_max.x : dst_max.x; - } - - if (src_min.y > dst_max.y || src_max.y < dst_min.y) - { - return new Rect3(); - } - else - { - min.y = (src_min.y > dst_min.y) ? src_min.y : dst_min.y; - max.y = (src_max.y < dst_max.y) ? src_max.y : dst_max.y; - } - - if (src_min.z > dst_max.z || src_max.z < dst_min.z) - { - return new Rect3(); - } - else - { - min.z = (src_min.z > dst_min.z) ? src_min.z : dst_min.z; - max.z = (src_max.z < dst_max.z) ? 
src_max.z : dst_max.z; - } - - return new Rect3(min, max - min); - } - - public bool intersects(Rect3 with) - { - if (position.x >= (with.position.x + with.size.x)) - return false; - if ((position.x + size.x) <= with.position.x) - return false; - if (position.y >= (with.position.y + with.size.y)) - return false; - if ((position.y + size.y) <= with.position.y) - return false; - if (position.z >= (with.position.z + with.size.z)) - return false; - if ((position.z + size.z) <= with.position.z) - return false; - - return true; - } - - public bool intersects_plane(Plane plane) - { - Vector3[] points = - { - new Vector3(position.x, position.y, position.z), - new Vector3(position.x, position.y, position.z + size.z), - new Vector3(position.x, position.y + size.y, position.z), - new Vector3(position.x, position.y + size.y, position.z + size.z), - new Vector3(position.x + size.x, position.y, position.z), - new Vector3(position.x + size.x, position.y, position.z + size.z), - new Vector3(position.x + size.x, position.y + size.y, position.z), - new Vector3(position.x + size.x, position.y + size.y, position.z + size.z), - }; - - bool over = false; - bool under = false; - - for (int i = 0; i < 8; i++) - { - if (plane.distance_to(points[i]) > 0) - over = true; - else - under = true; - } - - return under && over; - } - - public bool intersects_segment(Vector3 from, Vector3 to) - { - float min = 0f; - float max = 1f; - - for (int i = 0; i < 3; i++) - { - float seg_from = from[i]; - float seg_to = to[i]; - float box_begin = position[i]; - float box_end = box_begin + size[i]; - float cmin, cmax; - - if (seg_from < seg_to) - { - if (seg_from > box_end || seg_to < box_begin) - return false; - - float length = seg_to - seg_from; - cmin = seg_from < box_begin ? (box_begin - seg_from) / length : 0f; - cmax = seg_to > box_end ? 
(box_end - seg_from) / length : 1f; - } - else - { - if (seg_to > box_end || seg_from < box_begin) - return false; - - float length = seg_to - seg_from; - cmin = seg_from > box_end ? (box_end - seg_from) / length : 0f; - cmax = seg_to < box_begin ? (box_begin - seg_from) / length : 1f; - } - - if (cmin > min) - { - min = cmin; - } - - if (cmax < max) - max = cmax; - if (max < min) - return false; - } - - return true; - } - - public Rect3 merge(Rect3 with) - { - Vector3 beg_1 = position; - Vector3 beg_2 = with.position; - Vector3 end_1 = new Vector3(size.x, size.y, size.z) + beg_1; - Vector3 end_2 = new Vector3(with.size.x, with.size.y, with.size.z) + beg_2; - - Vector3 min = new Vector3( - (beg_1.x < beg_2.x) ? beg_1.x : beg_2.x, - (beg_1.y < beg_2.y) ? beg_1.y : beg_2.y, - (beg_1.z < beg_2.z) ? beg_1.z : beg_2.z - ); - - Vector3 max = new Vector3( - (end_1.x > end_2.x) ? end_1.x : end_2.x, - (end_1.y > end_2.y) ? end_1.y : end_2.y, - (end_1.z > end_2.z) ? end_1.z : end_2.z - ); - - return new Rect3(min, max - min); - } - - public Rect3(Vector3 position, Vector3 size) - { - this.position = position; - this.size = size; - } - - public static bool operator ==(Rect3 left, Rect3 right) - { - return left.Equals(right); - } - - public static bool operator !=(Rect3 left, Rect3 right) - { - return !left.Equals(right); - } - - public override bool Equals(object obj) - { - if (obj is Rect3) - { - return Equals((Rect3)obj); - } - - return false; - } - - public bool Equals(Rect3 other) - { - return position == other.position && size == other.size; - } - - public override int GetHashCode() - { - return position.GetHashCode() ^ size.GetHashCode(); - } - - public override string ToString() - { - return String.Format("{0} - {1}", new object[] - { - this.position.ToString(), - this.size.ToString() - }); - } - - public string ToString(string format) - { - return String.Format("{0} - {1}", new object[] - { - this.position.ToString(format), - this.size.ToString(format) - }); - } - } 
-} +using System; + +// file: core/math/rect3.h +// commit: 7ad14e7a3e6f87ddc450f7e34621eb5200808451 +// file: core/math/rect3.cpp +// commit: bd282ff43f23fe845f29a3e25c8efc01bd65ffb0 +// file: core/variant_call.cpp +// commit: 5ad9be4c24e9d7dc5672fdc42cea896622fe5685 + +namespace Godot +{ + public struct Rect3 : IEquatable + { + private Vector3 position; + private Vector3 size; + + public Vector3 Position + { + get + { + return position; + } + } + + public Vector3 Size + { + get + { + return size; + } + } + + public Vector3 End + { + get + { + return position + size; + } + } + + public bool encloses(Rect3 with) + { + Vector3 src_min = position; + Vector3 src_max = position + size; + Vector3 dst_min = with.position; + Vector3 dst_max = with.position + with.size; + + return ((src_min.x <= dst_min.x) && + (src_max.x > dst_max.x) && + (src_min.y <= dst_min.y) && + (src_max.y > dst_max.y) && + (src_min.z <= dst_min.z) && + (src_max.z > dst_max.z)); + } + + public Rect3 expand(Vector3 to_point) + { + Vector3 begin = position; + Vector3 end = position + size; + + if (to_point.x < begin.x) + begin.x = to_point.x; + if (to_point.y < begin.y) + begin.y = to_point.y; + if (to_point.z < begin.z) + begin.z = to_point.z; + + if (to_point.x > end.x) + end.x = to_point.x; + if (to_point.y > end.y) + end.y = to_point.y; + if (to_point.z > end.z) + end.z = to_point.z; + + return new Rect3(begin, end - begin); + } + + public float get_area() + { + return size.x * size.y * size.z; + } + + public Vector3 get_endpoint(int idx) + { + switch (idx) + { + case 0: + return new Vector3(position.x, position.y, position.z); + case 1: + return new Vector3(position.x, position.y, position.z + size.z); + case 2: + return new Vector3(position.x, position.y + size.y, position.z); + case 3: + return new Vector3(position.x, position.y + size.y, position.z + size.z); + case 4: + return new Vector3(position.x + size.x, position.y, position.z); + case 5: + return new Vector3(position.x + size.x, 
position.y, position.z + size.z); + case 6: + return new Vector3(position.x + size.x, position.y + size.y, position.z); + case 7: + return new Vector3(position.x + size.x, position.y + size.y, position.z + size.z); + default: + throw new ArgumentOutOfRangeException(nameof(idx), String.Format("Index is {0}, but a value from 0 to 7 is expected.", idx)); + } + } + + public Vector3 get_longest_axis() + { + Vector3 axis = new Vector3(1f, 0f, 0f); + float max_size = size.x; + + if (size.y > max_size) + { + axis = new Vector3(0f, 1f, 0f); + max_size = size.y; + } + + if (size.z > max_size) + { + axis = new Vector3(0f, 0f, 1f); + max_size = size.z; + } + + return axis; + } + + public Vector3.Axis get_longest_axis_index() + { + Vector3.Axis axis = Vector3.Axis.X; + float max_size = size.x; + + if (size.y > max_size) + { + axis = Vector3.Axis.Y; + max_size = size.y; + } + + if (size.z > max_size) + { + axis = Vector3.Axis.Z; + max_size = size.z; + } + + return axis; + } + + public float get_longest_axis_size() + { + float max_size = size.x; + + if (size.y > max_size) + max_size = size.y; + + if (size.z > max_size) + max_size = size.z; + + return max_size; + } + + public Vector3 get_shortest_axis() + { + Vector3 axis = new Vector3(1f, 0f, 0f); + float max_size = size.x; + + if (size.y < max_size) + { + axis = new Vector3(0f, 1f, 0f); + max_size = size.y; + } + + if (size.z < max_size) + { + axis = new Vector3(0f, 0f, 1f); + max_size = size.z; + } + + return axis; + } + + public Vector3.Axis get_shortest_axis_index() + { + Vector3.Axis axis = Vector3.Axis.X; + float max_size = size.x; + + if (size.y < max_size) + { + axis = Vector3.Axis.Y; + max_size = size.y; + } + + if (size.z < max_size) + { + axis = Vector3.Axis.Z; + max_size = size.z; + } + + return axis; + } + + public float get_shortest_axis_size() + { + float max_size = size.x; + + if (size.y < max_size) + max_size = size.y; + + if (size.z < max_size) + max_size = size.z; + + return max_size; + } + + public Vector3 
get_support(Vector3 dir) + { + Vector3 half_extents = size * 0.5f; + Vector3 ofs = position + half_extents; + + return ofs + new Vector3( + (dir.x > 0f) ? -half_extents.x : half_extents.x, + (dir.y > 0f) ? -half_extents.y : half_extents.y, + (dir.z > 0f) ? -half_extents.z : half_extents.z); + } + + public Rect3 grow(float by) + { + Rect3 res = this; + + res.position.x -= by; + res.position.y -= by; + res.position.z -= by; + res.size.x += 2.0f * by; + res.size.y += 2.0f * by; + res.size.z += 2.0f * by; + + return res; + } + + public bool has_no_area() + { + return size.x <= 0f || size.y <= 0f || size.z <= 0f; + } + + public bool has_no_surface() + { + return size.x <= 0f && size.y <= 0f && size.z <= 0f; + } + + public bool has_point(Vector3 point) + { + if (point.x < position.x) + return false; + if (point.y < position.y) + return false; + if (point.z < position.z) + return false; + if (point.x > position.x + size.x) + return false; + if (point.y > position.y + size.y) + return false; + if (point.z > position.z + size.z) + return false; + + return true; + } + + public Rect3 intersection(Rect3 with) + { + Vector3 src_min = position; + Vector3 src_max = position + size; + Vector3 dst_min = with.position; + Vector3 dst_max = with.position + with.size; + + Vector3 min, max; + + if (src_min.x > dst_max.x || src_max.x < dst_min.x) + { + return new Rect3(); + } + else + { + min.x = (src_min.x > dst_min.x) ? src_min.x : dst_min.x; + max.x = (src_max.x < dst_max.x) ? src_max.x : dst_max.x; + } + + if (src_min.y > dst_max.y || src_max.y < dst_min.y) + { + return new Rect3(); + } + else + { + min.y = (src_min.y > dst_min.y) ? src_min.y : dst_min.y; + max.y = (src_max.y < dst_max.y) ? src_max.y : dst_max.y; + } + + if (src_min.z > dst_max.z || src_max.z < dst_min.z) + { + return new Rect3(); + } + else + { + min.z = (src_min.z > dst_min.z) ? src_min.z : dst_min.z; + max.z = (src_max.z < dst_max.z) ? 
src_max.z : dst_max.z; + } + + return new Rect3(min, max - min); + } + + public bool intersects(Rect3 with) + { + if (position.x >= (with.position.x + with.size.x)) + return false; + if ((position.x + size.x) <= with.position.x) + return false; + if (position.y >= (with.position.y + with.size.y)) + return false; + if ((position.y + size.y) <= with.position.y) + return false; + if (position.z >= (with.position.z + with.size.z)) + return false; + if ((position.z + size.z) <= with.position.z) + return false; + + return true; + } + + public bool intersects_plane(Plane plane) + { + Vector3[] points = + { + new Vector3(position.x, position.y, position.z), + new Vector3(position.x, position.y, position.z + size.z), + new Vector3(position.x, position.y + size.y, position.z), + new Vector3(position.x, position.y + size.y, position.z + size.z), + new Vector3(position.x + size.x, position.y, position.z), + new Vector3(position.x + size.x, position.y, position.z + size.z), + new Vector3(position.x + size.x, position.y + size.y, position.z), + new Vector3(position.x + size.x, position.y + size.y, position.z + size.z), + }; + + bool over = false; + bool under = false; + + for (int i = 0; i < 8; i++) + { + if (plane.distance_to(points[i]) > 0) + over = true; + else + under = true; + } + + return under && over; + } + + public bool intersects_segment(Vector3 from, Vector3 to) + { + float min = 0f; + float max = 1f; + + for (int i = 0; i < 3; i++) + { + float seg_from = from[i]; + float seg_to = to[i]; + float box_begin = position[i]; + float box_end = box_begin + size[i]; + float cmin, cmax; + + if (seg_from < seg_to) + { + if (seg_from > box_end || seg_to < box_begin) + return false; + + float length = seg_to - seg_from; + cmin = seg_from < box_begin ? (box_begin - seg_from) / length : 0f; + cmax = seg_to > box_end ? 
(box_end - seg_from) / length : 1f; + } + else + { + if (seg_to > box_end || seg_from < box_begin) + return false; + + float length = seg_to - seg_from; + cmin = seg_from > box_end ? (box_end - seg_from) / length : 0f; + cmax = seg_to < box_begin ? (box_begin - seg_from) / length : 1f; + } + + if (cmin > min) + { + min = cmin; + } + + if (cmax < max) + max = cmax; + if (max < min) + return false; + } + + return true; + } + + public Rect3 merge(Rect3 with) + { + Vector3 beg_1 = position; + Vector3 beg_2 = with.position; + Vector3 end_1 = new Vector3(size.x, size.y, size.z) + beg_1; + Vector3 end_2 = new Vector3(with.size.x, with.size.y, with.size.z) + beg_2; + + Vector3 min = new Vector3( + (beg_1.x < beg_2.x) ? beg_1.x : beg_2.x, + (beg_1.y < beg_2.y) ? beg_1.y : beg_2.y, + (beg_1.z < beg_2.z) ? beg_1.z : beg_2.z + ); + + Vector3 max = new Vector3( + (end_1.x > end_2.x) ? end_1.x : end_2.x, + (end_1.y > end_2.y) ? end_1.y : end_2.y, + (end_1.z > end_2.z) ? end_1.z : end_2.z + ); + + return new Rect3(min, max - min); + } + + public Rect3(Vector3 position, Vector3 size) + { + this.position = position; + this.size = size; + } + + public static bool operator ==(Rect3 left, Rect3 right) + { + return left.Equals(right); + } + + public static bool operator !=(Rect3 left, Rect3 right) + { + return !left.Equals(right); + } + + public override bool Equals(object obj) + { + if (obj is Rect3) + { + return Equals((Rect3)obj); + } + + return false; + } + + public bool Equals(Rect3 other) + { + return position == other.position && size == other.size; + } + + public override int GetHashCode() + { + return position.GetHashCode() ^ size.GetHashCode(); + } + + public override string ToString() + { + return String.Format("{0} - {1}", new object[] + { + this.position.ToString(), + this.size.ToString() + }); + } + + public string ToString(string format) + { + return String.Format("{0} - {1}", new object[] + { + this.position.ToString(format), + this.size.ToString(format) + }); + } + } 
+} diff --git a/modules/mono/glue/cs_files/ToolAttribute.cs b/modules/mono/glue/cs_files/ToolAttribute.cs index 0275982c7f..d8601b5b32 100644 --- a/modules/mono/glue/cs_files/ToolAttribute.cs +++ b/modules/mono/glue/cs_files/ToolAttribute.cs @@ -1,4 +1,4 @@ -using System; +using System; namespace Godot { diff --git a/platform/android/java/gradlew.bat b/platform/android/java/gradlew.bat index aec99730b4..8a0b282aa6 100644 --- a/platform/android/java/gradlew.bat +++ b/platform/android/java/gradlew.bat @@ -1,90 +1,90 @@ -@if "%DEBUG%" == "" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS= - -set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init - -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto init - -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. 
- -goto fail - -:init -@rem Get command-line arguments, handling Windowz variants - -if not "%OS%" == "Windows_NT" goto win9xME_args -if "%@eval[2+2]" == "4" goto 4NT_args - -:win9xME_args -@rem Slurp the command line arguments. -set CMD_LINE_ARGS= -set _SKIP=2 - -:win9xME_args_slurp -if "x%~1" == "x" goto execute - -set CMD_LINE_ARGS=%* -goto execute - -:4NT_args -@rem Get arguments from the 4NT Shell from JP Software -set CMD_LINE_ARGS=%$ - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% - -:end -@rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. 
+echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windowz variants + +if not "%OS%" == "Windows_NT" goto win9xME_args +if "%@eval[2+2]" == "4" goto 4NT_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* +goto execute + +:4NT_args +@rem Get arguments from the 4NT Shell from JP Software +set CMD_LINE_ARGS=%$ + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/thirdparty/etc2comp/AUTHORS b/thirdparty/etc2comp/AUTHORS index 32daca27fe..e78a7f4d21 100644 --- a/thirdparty/etc2comp/AUTHORS +++ b/thirdparty/etc2comp/AUTHORS @@ -1,7 +1,7 @@ -# This is the list of Etc2Comp authors for copyright purposes. -# -# This does not necessarily list everyone who has contributed code, since in -# some cases, their employer may be the copyright holder. 
To see the full list -# of contributors, see the revision history in source control. -Google Inc. -Blue Shift Inc. +# This is the list of Etc2Comp authors for copyright purposes. +# +# This does not necessarily list everyone who has contributed code, since in +# some cases, their employer may be the copyright holder. To see the full list +# of contributors, see the revision history in source control. +Google Inc. +Blue Shift Inc. diff --git a/thirdparty/etc2comp/LICENSE b/thirdparty/etc2comp/LICENSE index 75b52484ea..d645695673 100644 --- a/thirdparty/etc2comp/LICENSE +++ b/thirdparty/etc2comp/LICENSE @@ -1,202 +1,202 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/thirdparty/etc2comp/README.md b/thirdparty/etc2comp/README.md index 1c70ae9f4e..2f4363d042 100644 --- a/thirdparty/etc2comp/README.md +++ b/thirdparty/etc2comp/README.md @@ -1,197 +1,197 @@ -# Etc2Comp - Texture to ETC2 compressor - -Etc2Comp is a command line tool that converts textures (e.g. bitmaps) -into the [ETC2](https://en.wikipedia.org/wiki/Ericsson_Texture_Compression) -format. The tool is built with a focus on encoding performance -to reduce the amount of time required to compile asset heavy applications as -well as reduce overall application size. - -This repo provides source code that can be compiled into a binary. The -binary can then be used to convert textures to the ETC2 format. - -Important: This is not an official Google product. It is an experimental -library published as-is. Please see the CONTRIBUTORS.md file for information -about questions or issues. - -## Setup -This project uses [CMake](https://cmake.org/) to generate platform-specific -build files: - - Linux: make files - - OS X: Xcode workspace files - - Microsoft Windows: Visual Studio solution files - - Note: CMake supports other formats, but this doc only provides steps for - one of each platform for brevity. - -Refer to each platform's setup section to setup your environment and build -an Etc2Comp binary. Then skip to the usage section of this page for examples -of how to use the library. 
- -### Setup for OS X - build tested on this config: - OS X 10.9.5 i7 16GB RAM - Xcode 5.1.1 - cmake 3.2.3 - -Start by downloading and installing the following components if they are not -already installed on your development machine. - - *Xcode* version 5.1.1, or greater - - [CMake](https://cmake.org/download/) version 3.2.3, or greater - -To build the Etc2Comp binary: - 1. Open a *Terminal* window and navigate to the project directory. - 1. Run `mkdir build_xcode` - 1. Run `cd build_xcode` - 1. Run `cmake -G Xcode ../` - 1. Open *Xcode* and import the `build_xcode/EtcTest.xcodeproj` file. - 1. Open the Product menu and choose Build For -> Running. - 1. Once the build succeeds the binary located at `build_xcode/EtcTool/Debug/EtcTool` -can be executed. - -Optional -Xcode EtcTool ‘Run’ preferences -note: if the build_xcode/EtcTest.xcodeproj is manually deleted then some Xcode preferences -will need to be set by hand after cmake is run (these prefs are retained across -cmake updates if the .xcodeproj is not deleted/removed) - -1. Set the active scheme to ‘EtcTool’ -1. Edit the scheme -1. Select option ‘Run EtcTool’, then tab ‘Arguments’. -Add this launch argument: ‘-argfile ../../EtcTool/args.txt’ -1. Select tab ‘Options’ and set a custom working directory to: ‘$(SRCROOT)/Build_Xcode/EtcTool’ - -### SetUp for Windows - -1. Open a *Terminal* window and navigate to the project directory. -1. Run `mkdir build_vs` -1. Run `cd build_vs` -1. Run CMAKE, noting what build version you need, and pointing to the parent directory as the source root; - For VS 2013 : `cmake -G "Visual Studio 12 2013 Win64" ../` - For VS 2015 : `cmake -G "Visual Studio 14 2015 Win64" ../` - NOTE: To see what supported Visual Studio outputs there are, run `cmake -G` -1. open the 'EtcTest' solution -1. make the 'EtcTool' project the start up project -1. (optional) in the project properties, under 'Debugging ->command arguments' -add the argfile textfile thats included in the EtcTool directory. 
-example: -argfile C:\etc2\EtcTool\Args.txt - -### Setup For Linux -The Linux build was tested on this config: - Ubuntu desktop 14.04 - gcc/g++ 4.8 - cmake 2.8.12.2 - -1. Verify linux has cmake and C++-11 capable g++ installed -1. Open shell -1. Run `mkdir build_linux` -1. Run `cd build_linux` -1. Run `cmake ../` -1. Run `make` -1. navigate to the newly created EtcTool directory `cd EtcTool` -1. run the executable: `./EtcTool -argfile ../../EtcTool/args.txt` - -Skip to the Usage section for more information about using the -tool. - -## Usage - -### Command Line Usage -EtcTool can be run from the command line with the following usage: - etctool.exe source_image [options ...] -output encoded_image - -The encoder will use an array of RGBA floats read from the source_image to create -an ETC1 or ETC2 encoded image in encoded_image. The RGBA floats should be in the -range [0:1]. - -Options: - - -analyze - -argfile additional command line arguments read from a file - -blockAtHV encodes a single block that contains the - pixel specified by the H V coordinates - -compare compares source_image to comparison_image - -effort number between 0 and 100 to specify the encoding quality - (100 is the highest quality) - -errormetric specify the error metric, the options are - rgba, rgbx, rec709, numeric and normalxyz - -format ETC1, RGB8, SRGB8, RGBA8, SRGB8, RGB8A1, - SRGB8A1 or R11 - -help prints this message - -jobs or -j specifies the number of threads (default=1) - -normalizexyz normalize RGB to have a length of 1 - -verbose or -v shows status information during the encoding - process - -mipmaps or -m sets the maximum number of mipaps to generate (default=1) - -mipwrap or -w sets the mipmap filter wrap mode (default=clamp) - -* -analyze will run an analysis of the encoding and place it in folder -"analysis_folder" (e.g. ../analysis/kodim05). within the analysis_folder, a folder -will be created with a name of the current date/time (e.g. 20151204_153306). 
this -date/time folder is used to compare encodings of the same texture over time. -within the date/time folder is a text file with several encoding stats and a 2x png -image showing the encoding mode for each 4x4 block. - -* -argfile allows additional command line arguments to be placed in a text file - -* -blockAtHV selects the 4x4 pixel subset of the source image at position (H,V). -This is mainly used for debugging - -* -compare compares the source image to the created encoded image. The encoding -will dictate what error analysis is used in the comparison. - -* -effort uses an "amount" between 0 and 100 to determine how much additional effort -to apply during the encoding. - -* -errormetric selects the fitting algorithm used by the encoder. "rgba" calculates -RMS error using RGB components that are weighted by A. "rgbx" calculates RMS error -using RGBA components, where A is treated as an additional data channel, instead of -as alpha. "rec709" is similar to "rgba", except the RGB components are also weighted -according to Rec709. "numeric" calculates RMS error using unweighted RGBA components. -"normalize" calculates error based on dot product and vector length for RGB and RMS -error for A. - -* -help prints out the usage message - -* -jobs enables multi-threading to speed up image encoding - -* -normalizexyz normalizes the source RGB to have a length of 1. - -* -verbose shows information on the current encoding process. It will then display the -PSNR and time time it took to encode the image. - -* -mipmaps takes an argument that specifies how many mipmaps to generate from the -source image. The mipmaps are generated with a lanczos3 filter using edge clamping. -If the mipmaps option is not specified no mipmaps are created. - -* -mipwrap takes an argument that specifies the mipmap filter wrap mode. The options -are "x", "y" and "xy" which specify wrapping in x only, y only or x and y respectively. -The default options are clamping in both x and y. 
- -Note: Path names can use slashes or backslashes. The tool will convert the -slashes to the appropriate polarity for the current platform. - - -## API - -The library supports two different APIs - a C-like API that is not heavily -class-based and a class-based API. - -main() in EtcTool.cpp contains an example of both APIs. - -The Encode() method now returns an EncodingStatus that contains bit flags for -reporting various warnings and flags encountered when encoding. - - -## Copyright -Copyright 2015 Etc2Comp Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. +# Etc2Comp - Texture to ETC2 compressor + +Etc2Comp is a command line tool that converts textures (e.g. bitmaps) +into the [ETC2](https://en.wikipedia.org/wiki/Ericsson_Texture_Compression) +format. The tool is built with a focus on encoding performance +to reduce the amount of time required to compile asset heavy applications as +well as reduce overall application size. + +This repo provides source code that can be compiled into a binary. The +binary can then be used to convert textures to the ETC2 format. + +Important: This is not an official Google product. It is an experimental +library published as-is. Please see the CONTRIBUTORS.md file for information +about questions or issues. 
+ +## Setup +This project uses [CMake](https://cmake.org/) to generate platform-specific +build files: + - Linux: make files + - OS X: Xcode workspace files + - Microsoft Windows: Visual Studio solution files + - Note: CMake supports other formats, but this doc only provides steps for + one of each platform for brevity. + +Refer to each platform's setup section to setup your environment and build +an Etc2Comp binary. Then skip to the usage section of this page for examples +of how to use the library. + +### Setup for OS X + build tested on this config: + OS X 10.9.5 i7 16GB RAM + Xcode 5.1.1 + cmake 3.2.3 + +Start by downloading and installing the following components if they are not +already installed on your development machine. + - *Xcode* version 5.1.1, or greater + - [CMake](https://cmake.org/download/) version 3.2.3, or greater + +To build the Etc2Comp binary: + 1. Open a *Terminal* window and navigate to the project directory. + 1. Run `mkdir build_xcode` + 1. Run `cd build_xcode` + 1. Run `cmake -G Xcode ../` + 1. Open *Xcode* and import the `build_xcode/EtcTest.xcodeproj` file. + 1. Open the Product menu and choose Build For -> Running. + 1. Once the build succeeds the binary located at `build_xcode/EtcTool/Debug/EtcTool` +can be executed. + +Optional +Xcode EtcTool ‘Run’ preferences +note: if the build_xcode/EtcTest.xcodeproj is manually deleted then some Xcode preferences +will need to be set by hand after cmake is run (these prefs are retained across +cmake updates if the .xcodeproj is not deleted/removed) + +1. Set the active scheme to ‘EtcTool’ +1. Edit the scheme +1. Select option ‘Run EtcTool’, then tab ‘Arguments’. +Add this launch argument: ‘-argfile ../../EtcTool/args.txt’ +1. Select tab ‘Options’ and set a custom working directory to: ‘$(SRCROOT)/Build_Xcode/EtcTool’ + +### SetUp for Windows + +1. Open a *Terminal* window and navigate to the project directory. +1. Run `mkdir build_vs` +1. Run `cd build_vs` +1. 
Run CMAKE, noting what build version you need, and pointing to the parent directory as the source root; + For VS 2013 : `cmake -G "Visual Studio 12 2013 Win64" ../` + For VS 2015 : `cmake -G "Visual Studio 14 2015 Win64" ../` + NOTE: To see what supported Visual Studio outputs there are, run `cmake -G` +1. open the 'EtcTest' solution +1. make the 'EtcTool' project the start up project +1. (optional) in the project properties, under 'Debugging ->command arguments' +add the argfile text file that is included in the EtcTool directory. +example: -argfile C:\etc2\EtcTool\Args.txt + +### Setup For Linux +The Linux build was tested on this config: + Ubuntu desktop 14.04 + gcc/g++ 4.8 + cmake 2.8.12.2 + +1. Verify linux has cmake and C++-11 capable g++ installed +1. Open shell +1. Run `mkdir build_linux` +1. Run `cd build_linux` +1. Run `cmake ../` +1. Run `make` +1. navigate to the newly created EtcTool directory `cd EtcTool` +1. run the executable: `./EtcTool -argfile ../../EtcTool/args.txt` + +Skip to the Usage section for more information about using the +tool. + +## Usage + +### Command Line Usage +EtcTool can be run from the command line with the following usage: + etctool.exe source_image [options ...] -output encoded_image + +The encoder will use an array of RGBA floats read from the source_image to create +an ETC1 or ETC2 encoded image in encoded_image. The RGBA floats should be in the +range [0:1].
+ +Options: + + -analyze + -argfile additional command line arguments read from a file + -blockAtHV encodes a single block that contains the + pixel specified by the H V coordinates + -compare compares source_image to comparison_image + -effort number between 0 and 100 to specify the encoding quality + (100 is the highest quality) + -errormetric specify the error metric, the options are + rgba, rgbx, rec709, numeric and normalxyz + -format ETC1, RGB8, SRGB8, RGBA8, SRGBA8, RGB8A1, + SRGB8A1 or R11 + -help prints this message + -jobs or -j specifies the number of threads (default=1) + -normalizexyz normalize RGB to have a length of 1 + -verbose or -v shows status information during the encoding + process + -mipmaps or -m sets the maximum number of mipmaps to generate (default=1) + -mipwrap or -w sets the mipmap filter wrap mode (default=clamp) + +* -analyze will run an analysis of the encoding and place it in folder +"analysis_folder" (e.g. ../analysis/kodim05). within the analysis_folder, a folder +will be created with a name of the current date/time (e.g. 20151204_153306). this +date/time folder is used to compare encodings of the same texture over time. +within the date/time folder is a text file with several encoding stats and a 2x png +image showing the encoding mode for each 4x4 block. + +* -argfile allows additional command line arguments to be placed in a text file + +* -blockAtHV selects the 4x4 pixel subset of the source image at position (H,V). +This is mainly used for debugging + +* -compare compares the source image to the created encoded image. The encoding +will dictate what error analysis is used in the comparison. + +* -effort uses an "amount" between 0 and 100 to determine how much additional effort +to apply during the encoding. + +* -errormetric selects the fitting algorithm used by the encoder. "rgba" calculates +RMS error using RGB components that are weighted by A.
"rgbx" calculates RMS error +using RGBA components, where A is treated as an additional data channel, instead of +as alpha. "rec709" is similar to "rgba", except the RGB components are also weighted +according to Rec709. "numeric" calculates RMS error using unweighted RGBA components. +"normalxyz" calculates error based on dot product and vector length for RGB and RMS +error for A. + +* -help prints out the usage message + +* -jobs enables multi-threading to speed up image encoding + +* -normalizexyz normalizes the source RGB to have a length of 1. + +* -verbose shows information on the current encoding process. It will then display the +PSNR and the time it took to encode the image. + +* -mipmaps takes an argument that specifies how many mipmaps to generate from the +source image. The mipmaps are generated with a lanczos3 filter using edge clamping. +If the mipmaps option is not specified no mipmaps are created. + +* -mipwrap takes an argument that specifies the mipmap filter wrap mode. The options +are "x", "y" and "xy" which specify wrapping in x only, y only or x and y respectively. +The default options are clamping in both x and y. + +Note: Path names can use slashes or backslashes. The tool will convert the +slashes to the appropriate polarity for the current platform. + + +## API + +The library supports two different APIs - a C-like API that is not heavily +class-based and a class-based API. + +main() in EtcTool.cpp contains an example of both APIs. + +The Encode() method now returns an EncodingStatus that contains bit flags for +reporting various warnings and flags encountered when encoding. + + +## Copyright +Copyright 2015 Etc2Comp Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c index ac9dacf377..94f1d06513 100644 --- a/thirdparty/libtheora/x86_vc/mmxencfrag.c +++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c @@ -1,969 +1,969 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ - - ********************************************************************/ -#include -#include "x86enc.h" - -#if defined(OC_X86_ASM) - -unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride){ - ptrdiff_t ret; - __asm{ -#define SRC esi -#define REF edx -#define YSTRIDE ecx -#define YSTRIDE3 edi - mov YSTRIDE,_ystride - mov SRC,_src - mov REF,_ref - /*Load the first 4 rows of each block.*/ - movq mm0,[SRC] - movq mm1,[REF] - movq mm2,[SRC][YSTRIDE] - movq mm3,[REF][YSTRIDE] - lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] - movq mm4,[SRC+YSTRIDE*2] - movq mm5,[REF+YSTRIDE*2] - movq mm6,[SRC+YSTRIDE3] - movq mm7,[REF+YSTRIDE3] - /*Compute their SADs and add them in mm0*/ - psadbw mm0,mm1 - 
psadbw mm2,mm3 - lea SRC,[SRC+YSTRIDE*4] - paddw mm0,mm2 - lea REF,[REF+YSTRIDE*4] - /*Load the next 3 rows as registers become available.*/ - movq mm2,[SRC] - movq mm3,[REF] - psadbw mm4,mm5 - psadbw mm6,mm7 - paddw mm0,mm4 - movq mm5,[REF+YSTRIDE] - movq mm4,[SRC+YSTRIDE] - paddw mm0,mm6 - movq mm7,[REF+YSTRIDE*2] - movq mm6,[SRC+YSTRIDE*2] - /*Start adding their SADs to mm0*/ - psadbw mm2,mm3 - psadbw mm4,mm5 - paddw mm0,mm2 - psadbw mm6,mm7 - /*Load last row as registers become available.*/ - movq mm2,[SRC+YSTRIDE3] - movq mm3,[REF+YSTRIDE3] - /*And finish adding up their SADs.*/ - paddw mm0,mm4 - psadbw mm2,mm3 - paddw mm0,mm6 - paddw mm0,mm2 - movd [ret],mm0 -#undef SRC -#undef REF -#undef YSTRIDE -#undef YSTRIDE3 - } - return (unsigned)ret; -} - -unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh){ - /*Early termination is for suckers.*/ - return oc_enc_frag_sad_mmxext(_src,_ref,_ystride); -} - -#define OC_SAD2_LOOP __asm{ \ - /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \ - pavgb computes (mm0+mm1+1>>1). \ - The latter is exactly 1 too large when the low bit of two corresponding \ - bytes is only set in one of them. 
\ - Therefore we pxor the operands, pand to mask out the low bits, and psubb to \ - correct the output of pavgb.*/ \ - __asm movq mm6,mm0 \ - __asm lea REF1,[REF1+YSTRIDE*2] \ - __asm pxor mm0,mm1 \ - __asm pavgb mm6,mm1 \ - __asm lea REF2,[REF2+YSTRIDE*2] \ - __asm movq mm1,mm2 \ - __asm pand mm0,mm7 \ - __asm pavgb mm2,mm3 \ - __asm pxor mm1,mm3 \ - __asm movq mm3,[REF2+YSTRIDE] \ - __asm psubb mm6,mm0 \ - __asm movq mm0,[REF1] \ - __asm pand mm1,mm7 \ - __asm psadbw mm4,mm6 \ - __asm movd mm6,RET \ - __asm psubb mm2,mm1 \ - __asm movq mm1,[REF2] \ - __asm lea SRC,[SRC+YSTRIDE*2] \ - __asm psadbw mm5,mm2 \ - __asm movq mm2,[REF1+YSTRIDE] \ - __asm paddw mm5,mm4 \ - __asm movq mm4,[SRC] \ - __asm paddw mm6,mm5 \ - __asm movq mm5,[SRC+YSTRIDE] \ - __asm movd RET,mm6 \ -} - -/*Same as above, but does not pre-load the next two rows.*/ -#define OC_SAD2_TAIL __asm{ \ - __asm movq mm6,mm0 \ - __asm pavgb mm0,mm1 \ - __asm pxor mm6,mm1 \ - __asm movq mm1,mm2 \ - __asm pand mm6,mm7 \ - __asm pavgb mm2,mm3 \ - __asm pxor mm1,mm3 \ - __asm psubb mm0,mm6 \ - __asm pand mm1,mm7 \ - __asm psadbw mm4,mm0 \ - __asm psubb mm2,mm1 \ - __asm movd mm6,RET \ - __asm psadbw mm5,mm2 \ - __asm paddw mm5,mm4 \ - __asm paddw mm6,mm5 \ - __asm movd RET,mm6 \ -} - -unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh){ - ptrdiff_t ret; - __asm{ -#define REF1 ecx -#define REF2 edi -#define YSTRIDE esi -#define SRC edx -#define RET eax - mov YSTRIDE,_ystride - mov SRC,_src - mov REF1,_ref1 - mov REF2,_ref2 - movq mm0,[REF1] - movq mm1,[REF2] - movq mm2,[REF1+YSTRIDE] - movq mm3,[REF2+YSTRIDE] - xor RET,RET - movq mm4,[SRC] - pxor mm7,mm7 - pcmpeqb mm6,mm6 - movq mm5,[SRC+YSTRIDE] - psubb mm7,mm6 - OC_SAD2_LOOP - OC_SAD2_LOOP - OC_SAD2_LOOP - OC_SAD2_TAIL - mov [ret],RET -#undef REF1 -#undef REF2 -#undef YSTRIDE -#undef SRC -#undef RET - } - return (unsigned)ret; -} - -/*Load an 8x4 array of 
pixel values from %[src] and %[ref] and compute their - 16-bit difference in mm0...mm7.*/ -#define OC_LOAD_SUB_8x4(_off) __asm{ \ - __asm movd mm0,[_off+SRC] \ - __asm movd mm4,[_off+REF] \ - __asm movd mm1,[_off+SRC+SRC_YSTRIDE] \ - __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ - __asm movd mm5,[_off+REF+REF_YSTRIDE] \ - __asm lea REF,[REF+REF_YSTRIDE*2] \ - __asm movd mm2,[_off+SRC] \ - __asm movd mm7,[_off+REF] \ - __asm movd mm3,[_off+SRC+SRC_YSTRIDE] \ - __asm movd mm6,[_off+REF+REF_YSTRIDE] \ - __asm punpcklbw mm0,mm4 \ - __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ - __asm punpcklbw mm4,mm4 \ - __asm lea REF,[REF+REF_YSTRIDE*2] \ - __asm psubw mm0,mm4 \ - __asm movd mm4,[_off+SRC] \ - __asm movq [_off*2+BUF],mm0 \ - __asm movd mm0,[_off+REF] \ - __asm punpcklbw mm1,mm5 \ - __asm punpcklbw mm5,mm5 \ - __asm psubw mm1,mm5 \ - __asm movd mm5,[_off+SRC+SRC_YSTRIDE] \ - __asm punpcklbw mm2,mm7 \ - __asm punpcklbw mm7,mm7 \ - __asm psubw mm2,mm7 \ - __asm movd mm7,[_off+REF+REF_YSTRIDE] \ - __asm punpcklbw mm3,mm6 \ - __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ - __asm punpcklbw mm6,mm6 \ - __asm psubw mm3,mm6 \ - __asm movd mm6,[_off+SRC] \ - __asm punpcklbw mm4,mm0 \ - __asm lea REF,[REF+REF_YSTRIDE*2] \ - __asm punpcklbw mm0,mm0 \ - __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ - __asm psubw mm4,mm0 \ - __asm movd mm0,[_off+REF] \ - __asm punpcklbw mm5,mm7 \ - __asm neg SRC_YSTRIDE \ - __asm punpcklbw mm7,mm7 \ - __asm psubw mm5,mm7 \ - __asm movd mm7,[_off+SRC+SRC_YSTRIDE] \ - __asm punpcklbw mm6,mm0 \ - __asm lea REF,[REF+REF_YSTRIDE*2] \ - __asm punpcklbw mm0,mm0 \ - __asm neg REF_YSTRIDE \ - __asm psubw mm6,mm0 \ - __asm movd mm0,[_off+REF+REF_YSTRIDE] \ - __asm lea SRC,[SRC+SRC_YSTRIDE*8] \ - __asm punpcklbw mm7,mm0 \ - __asm neg SRC_YSTRIDE \ - __asm punpcklbw mm0,mm0 \ - __asm lea REF,[REF+REF_YSTRIDE*8] \ - __asm psubw mm7,mm0 \ - __asm neg REF_YSTRIDE \ - __asm movq mm0,[_off*2+BUF] \ -} - -/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/ -#define OC_LOAD_8x4(_off) 
__asm{ \ - __asm movd mm0,[_off+SRC] \ - __asm movd mm1,[_off+SRC+YSTRIDE] \ - __asm movd mm2,[_off+SRC+YSTRIDE*2] \ - __asm pxor mm7,mm7 \ - __asm movd mm3,[_off+SRC+YSTRIDE3] \ - __asm punpcklbw mm0,mm7 \ - __asm movd mm4,[_off+SRC4] \ - __asm punpcklbw mm1,mm7 \ - __asm movd mm5,[_off+SRC4+YSTRIDE] \ - __asm punpcklbw mm2,mm7 \ - __asm movd mm6,[_off+SRC4+YSTRIDE*2] \ - __asm punpcklbw mm3,mm7 \ - __asm movd mm7,[_off+SRC4+YSTRIDE3] \ - __asm punpcklbw mm4,mm4 \ - __asm punpcklbw mm5,mm5 \ - __asm psrlw mm4,8 \ - __asm psrlw mm5,8 \ - __asm punpcklbw mm6,mm6 \ - __asm punpcklbw mm7,mm7 \ - __asm psrlw mm6,8 \ - __asm psrlw mm7,8 \ -} - -/*Performs the first two stages of an 8-point 1-D Hadamard transform. - The transform is performed in place, except that outputs 0-3 are swapped with - outputs 4-7. - Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to - perform this stage in place with no temporary registers).*/ -#define OC_HADAMARD_AB_8x4 __asm{ \ - /*Stage A: \ - Outputs 0-3 are swapped with 4-7 here.*/ \ - __asm paddw mm5,mm1 \ - __asm paddw mm6,mm2 \ - __asm paddw mm1,mm1 \ - __asm paddw mm2,mm2 \ - __asm psubw mm1,mm5 \ - __asm psubw mm2,mm6 \ - __asm paddw mm7,mm3 \ - __asm paddw mm4,mm0 \ - __asm paddw mm3,mm3 \ - __asm paddw mm0,mm0 \ - __asm psubw mm3,mm7 \ - __asm psubw mm0,mm4 \ - /*Stage B:*/ \ - __asm paddw mm0,mm2 \ - __asm paddw mm1,mm3 \ - __asm paddw mm4,mm6 \ - __asm paddw mm5,mm7 \ - __asm paddw mm2,mm2 \ - __asm paddw mm3,mm3 \ - __asm paddw mm6,mm6 \ - __asm paddw mm7,mm7 \ - __asm psubw mm2,mm0 \ - __asm psubw mm3,mm1 \ - __asm psubw mm6,mm4 \ - __asm psubw mm7,mm5 \ -} - -/*Performs the last stage of an 8-point 1-D Hadamard transform in place. 
- Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in - place with no temporary registers).*/ -#define OC_HADAMARD_C_8x4 __asm{ \ - /*Stage C:*/ \ - __asm paddw mm0,mm1 \ - __asm paddw mm2,mm3 \ - __asm paddw mm4,mm5 \ - __asm paddw mm6,mm7 \ - __asm paddw mm1,mm1 \ - __asm paddw mm3,mm3 \ - __asm paddw mm5,mm5 \ - __asm paddw mm7,mm7 \ - __asm psubw mm1,mm0 \ - __asm psubw mm3,mm2 \ - __asm psubw mm5,mm4 \ - __asm psubw mm7,mm6 \ -} - -/*Performs an 8-point 1-D Hadamard transform. - The transform is performed in place, except that outputs 0-3 are swapped with - outputs 4-7. - Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform - in place with no temporary registers).*/ -#define OC_HADAMARD_8x4 __asm{ \ - OC_HADAMARD_AB_8x4 \ - OC_HADAMARD_C_8x4 \ -} - -/*Performs the first part of the final stage of the Hadamard transform and - summing of absolute values. - At the end of this part, mm1 will contain the DC coefficient of the - transform.*/ -#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \ - /*We use the fact that \ - (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \ - to merge the final butterfly with the abs and the first stage of \ - accumulation. \ - Thus we can avoid using pabsw, which is not available until SSSE3. \ - Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \ - implementation would be (3+3)*8+7=55 instructions (+4 for spilling \ - registers). \ - Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). 
\ - This implementation is only 26 (+4 for spilling registers).*/ \ - __asm movq [_r7+BUF],mm7 \ - __asm movq [_r6+BUF],mm6 \ - /*mm7={0x7FFF}x4 \ - mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \ - __asm pcmpeqb mm7,mm7 \ - __asm movq mm6,mm0 \ - __asm psrlw mm7,1 \ - __asm paddw mm6,mm1 \ - __asm pmaxsw mm0,mm1 \ - __asm paddsw mm6,mm7 \ - __asm psubw mm0,mm6 \ - /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \ - mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \ - __asm movq mm6,mm2 \ - __asm movq mm1,mm4 \ - __asm pmaxsw mm2,mm3 \ - __asm pmaxsw mm4,mm5 \ - __asm paddw mm6,mm3 \ - __asm paddw mm1,mm5 \ - __asm movq mm3,[_r7+BUF] \ -} - -/*Performs the second part of the final stage of the Hadamard transform and - summing of absolute values.*/ -#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \ - __asm paddsw mm6,mm7 \ - __asm movq mm5,[_r6+BUF] \ - __asm paddsw mm1,mm7 \ - __asm psubw mm2,mm6 \ - __asm psubw mm4,mm1 \ - /*mm7={1}x4 (needed for the horizontal add that follows) \ - mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \ - __asm movq mm6,mm3 \ - __asm pmaxsw mm3,mm5 \ - __asm paddw mm0,mm2 \ - __asm paddw mm6,mm5 \ - __asm paddw mm0,mm4 \ - __asm paddsw mm6,mm7 \ - __asm paddw mm0,mm3 \ - __asm psrlw mm7,14 \ - __asm psubw mm0,mm6 \ -} - -/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the - absolute value of each component, and accumulates everything into mm0. - This is the only portion of SATD which requires MMXEXT (we could use plain - MMX, but it takes 4 instructions and an extra register to work around the - lack of a pmaxsw, which is a pretty serious penalty).*/ -#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \ - OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ - OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ -} - -/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each - component, and accumulates everything into mm0. 
- Note that mm0 will have an extra 4 added to each column, and that after - removing this value, the remainder will be half the conventional value.*/ -#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \ - OC_HADAMARD_AB_8x4 \ - OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \ -} - -/*Performs two 4x4 transposes (mostly) in place. - On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7} - contains rows {a,b,c,d}. - On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and - {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/ -#define OC_TRANSPOSE_4x4x2(_off) __asm{ \ - /*First 4x4 transpose:*/ \ - __asm movq [0x10+_off+BUF],mm5 \ - /*mm0 = e3 e2 e1 e0 \ - mm1 = f3 f2 f1 f0 \ - mm2 = g3 g2 g1 g0 \ - mm3 = h3 h2 h1 h0*/ \ - __asm movq mm5,mm2 \ - __asm punpcklwd mm2,mm3 \ - __asm punpckhwd mm5,mm3 \ - __asm movq mm3,mm0 \ - __asm punpcklwd mm0,mm1 \ - __asm punpckhwd mm3,mm1 \ - /*mm0 = f1 e1 f0 e0 \ - mm3 = f3 e3 f2 e2 \ - mm2 = h1 g1 h0 g0 \ - mm5 = h3 g3 h2 g2*/ \ - __asm movq mm1,mm0 \ - __asm punpckldq mm0,mm2 \ - __asm punpckhdq mm1,mm2 \ - __asm movq mm2,mm3 \ - __asm punpckhdq mm3,mm5 \ - __asm movq [0x40+_off+BUF],mm0 \ - __asm punpckldq mm2,mm5 \ - /*mm0 = h0 g0 f0 e0 \ - mm1 = h1 g1 f1 e1 \ - mm2 = h2 g2 f2 e2 \ - mm3 = h3 g3 f3 e3*/ \ - __asm movq mm5,[0x10+_off+BUF] \ - /*Second 4x4 transpose:*/ \ - /*mm4 = a3 a2 a1 a0 \ - mm5 = b3 b2 b1 b0 \ - mm6 = c3 c2 c1 c0 \ - mm7 = d3 d2 d1 d0*/ \ - __asm movq mm0,mm6 \ - __asm punpcklwd mm6,mm7 \ - __asm movq [0x50+_off+BUF],mm1 \ - __asm punpckhwd mm0,mm7 \ - __asm movq mm7,mm4 \ - __asm punpcklwd mm4,mm5 \ - __asm movq [0x60+_off+BUF],mm2 \ - __asm punpckhwd mm7,mm5 \ - /*mm4 = b1 a1 b0 a0 \ - mm7 = b3 a3 b2 a2 \ - mm6 = d1 c1 d0 c0 \ - mm0 = d3 c3 d2 c2*/ \ - __asm movq mm5,mm4 \ - __asm punpckldq mm4,mm6 \ - __asm movq [0x70+_off+BUF],mm3 \ - __asm punpckhdq mm5,mm6 \ - __asm movq mm6,mm7 \ - __asm punpckhdq mm7,mm0 \ - __asm punpckldq mm6,mm0 \ - /*mm4 = d0 c0 b0 a0 \ - mm5 = d1 
c1 b1 a1 \ - mm6 = d2 c2 b2 a2 \ - mm7 = d3 c3 b3 a3*/ \ -} - -static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, - int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){ - OC_ALIGN8(ogg_int16_t buf[64]); - ogg_int16_t *bufp; - unsigned ret1; - unsigned ret2; - bufp=buf; - __asm{ -#define SRC esi -#define REF eax -#define SRC_YSTRIDE ecx -#define REF_YSTRIDE edx -#define BUF edi -#define RET eax -#define RET2 edx - mov SRC,_src - mov SRC_YSTRIDE,_src_ystride - mov REF,_ref - mov REF_YSTRIDE,_ref_ystride - mov BUF,bufp - OC_LOAD_SUB_8x4(0x00) - OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2(0x00) - /*Finish swapping out this 8x4 block to make room for the next one. - mm0...mm3 have been swapped out already.*/ - movq [0x00+BUF],mm4 - movq [0x10+BUF],mm5 - movq [0x20+BUF],mm6 - movq [0x30+BUF],mm7 - OC_LOAD_SUB_8x4(0x04) - OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2(0x08) - /*Here the first 4x4 block of output from the last transpose is the second - 4x4 block of input for the next transform. - We have cleverly arranged that it already be in the appropriate place, so - we only have to do half the loads.*/ - movq mm1,[0x10+BUF] - movq mm2,[0x20+BUF] - movq mm3,[0x30+BUF] - movq mm0,[0x00+BUF] - OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38) - /*Up to this point, everything fit in 16 bits (8 input + 1 for the - difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 - for the factor of two we dropped + 3 for the vertical accumulation). - Now we finally have to promote things to dwords. 
- We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long - latency of pmaddwd by starting the next series of loads now.*/ - mov RET2,_thresh - pmaddwd mm0,mm7 - movq mm1,[0x50+BUF] - movq mm5,[0x58+BUF] - movq mm4,mm0 - movq mm2,[0x60+BUF] - punpckhdq mm0,mm0 - movq mm6,[0x68+BUF] - paddd mm4,mm0 - movq mm3,[0x70+BUF] - movd RET,mm4 - movq mm7,[0x78+BUF] - /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4 - added to them, and a factor of two removed; correct the final sum here.*/ - lea RET,[RET+RET-32] - movq mm0,[0x40+BUF] - cmp RET,RET2 - movq mm4,[0x48+BUF] - jae at_end - OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) - pmaddwd mm0,mm7 - /*There isn't much to stick in here to hide the latency this time, but the - alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose - latency is even worse.*/ - sub RET,32 - movq mm4,mm0 - punpckhdq mm0,mm0 - paddd mm4,mm0 - movd RET2,mm4 - lea RET,[RET+RET2*2] - align 16 -at_end: - mov ret1,RET -#undef SRC -#undef REF -#undef SRC_YSTRIDE -#undef REF_YSTRIDE -#undef BUF -#undef RET -#undef RET2 - } - return ret1; -} - -unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh){ - return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh); -} - - -/*Our internal implementation of frag_copy2 takes an extra stride parameter so - we can share code with oc_enc_frag_satd2_thresh_mmxext().*/ -static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, - const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){ - __asm{ - /*Load the first 3 rows.*/ -#define DST_YSTRIDE edi -#define SRC_YSTRIDE esi -#define DST eax -#define SRC1 edx -#define SRC2 ecx - mov DST_YSTRIDE,_dst_ystride - mov SRC_YSTRIDE,_src_ystride - mov DST,_dst - mov SRC1,_src1 - mov SRC2,_src2 - movq mm0,[SRC1] - movq mm1,[SRC2] - movq mm2,[SRC1+SRC_YSTRIDE] - lea SRC1,[SRC1+SRC_YSTRIDE*2] - movq mm3,[SRC2+SRC_YSTRIDE] - lea 
SRC2,[SRC2+SRC_YSTRIDE*2] - pxor mm7,mm7 - movq mm4,[SRC1] - pcmpeqb mm6,mm6 - movq mm5,[SRC2] - /*mm7={1}x8.*/ - psubb mm7,mm6 - /*Start averaging mm0 and mm1 into mm6.*/ - movq mm6,mm0 - pxor mm0,mm1 - pavgb mm6,mm1 - /*mm1 is free, start averaging mm3 into mm2 using mm1.*/ - movq mm1,mm2 - pand mm0,mm7 - pavgb mm2,mm3 - pxor mm1,mm3 - /*mm3 is free.*/ - psubb mm6,mm0 - /*mm0 is free, start loading the next row.*/ - movq mm0,[SRC1+SRC_YSTRIDE] - /*Start averaging mm5 and mm4 using mm3.*/ - movq mm3,mm4 - /*mm6 [row 0] is done; write it out.*/ - movq [DST],mm6 - pand mm1,mm7 - pavgb mm4,mm5 - psubb mm2,mm1 - /*mm1 is free, continue loading the next row.*/ - movq mm1,[SRC2+SRC_YSTRIDE] - pxor mm3,mm5 - lea SRC1,[SRC1+SRC_YSTRIDE*2] - /*mm2 [row 1] is done; write it out.*/ - movq [DST+DST_YSTRIDE],mm2 - pand mm3,mm7 - /*Start loading the next row.*/ - movq mm2,[SRC1] - lea DST,[DST+DST_YSTRIDE*2] - psubb mm4,mm3 - lea SRC2,[SRC2+SRC_YSTRIDE*2] - /*mm4 [row 2] is done; write it out.*/ - movq [DST],mm4 - /*Continue loading the next row.*/ - movq mm3,[SRC2] - /*Start averaging mm0 and mm1 into mm6.*/ - movq mm6,mm0 - pxor mm0,mm1 - /*Start loading the next row.*/ - movq mm4,[SRC1+SRC_YSTRIDE] - pavgb mm6,mm1 - /*mm1 is free; start averaging mm3 into mm2 using mm1.*/ - movq mm1,mm2 - pand mm0,mm7 - /*Continue loading the next row.*/ - movq mm5,[SRC2+SRC_YSTRIDE] - pavgb mm2,mm3 - lea SRC1,[SRC1+SRC_YSTRIDE*2] - pxor mm1,mm3 - /*mm3 is free.*/ - psubb mm6,mm0 - /*mm0 is free, start loading the next row.*/ - movq mm0,[SRC1] - /*Start averaging mm5 into mm4 using mm3.*/ - movq mm3,mm4 - /*mm6 [row 3] is done; write it out.*/ - movq [DST+DST_YSTRIDE],mm6 - pand mm1,mm7 - lea SRC2,[SRC2+SRC_YSTRIDE*2] - pavgb mm4,mm5 - lea DST,[DST+DST_YSTRIDE*2] - psubb mm2,mm1 - /*mm1 is free; continue loading the next row.*/ - movq mm1,[SRC2] - pxor mm3,mm5 - /*mm2 [row 4] is done; write it out.*/ - movq [DST],mm2 - pand mm3,mm7 - /*Start loading the next row.*/ - movq 
mm2,[SRC1+SRC_YSTRIDE] - psubb mm4,mm3 - /*Start averaging mm0 and mm1 into mm6.*/ - movq mm6,mm0 - /*Continue loading the next row.*/ - movq mm3,[SRC2+SRC_YSTRIDE] - /*mm4 [row 5] is done; write it out.*/ - movq [DST+DST_YSTRIDE],mm4 - pxor mm0,mm1 - pavgb mm6,mm1 - /*mm4 is free; start averaging mm3 into mm2 using mm4.*/ - movq mm4,mm2 - pand mm0,mm7 - pavgb mm2,mm3 - pxor mm4,mm3 - lea DST,[DST+DST_YSTRIDE*2] - psubb mm6,mm0 - pand mm4,mm7 - /*mm6 [row 6] is done, write it out.*/ - movq [DST],mm6 - psubb mm2,mm4 - /*mm2 [row 7] is done, write it out.*/ - movq [DST+DST_YSTRIDE],mm2 -#undef SRC1 -#undef SRC2 -#undef SRC_YSTRIDE -#undef DST_YSTRIDE -#undef DST - } -} - -unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh){ - OC_ALIGN8(unsigned char ref[64]); - oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); - return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh); -} - -unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, - int _ystride){ - OC_ALIGN8(ogg_int16_t buf[64]); - ogg_int16_t *bufp; - unsigned ret1; - unsigned ret2; - bufp=buf; - __asm{ -#define SRC eax -#define SRC4 esi -#define BUF edi -#define RET eax -#define RET_WORD ax -#define RET2 ecx -#define YSTRIDE edx -#define YSTRIDE3 ecx - mov SRC,_src - mov BUF,bufp - mov YSTRIDE,_ystride - /* src4 = src+4*ystride */ - lea SRC4,[SRC+YSTRIDE*4] - /* ystride3 = 3*ystride */ - lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] - OC_LOAD_8x4(0x00) - OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2(0x00) - /*Finish swapping out this 8x4 block to make room for the next one. - mm0...mm3 have been swapped out already.*/ - movq [0x00+BUF],mm4 - movq [0x10+BUF],mm5 - movq [0x20+BUF],mm6 - movq [0x30+BUF],mm7 - OC_LOAD_8x4(0x04) - OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2(0x08) - /*Here the first 4x4 block of output from the last transpose is the second - 4x4 block of input for the next transform. 
- We have cleverly arranged that it already be in the appropriate place, so - we only have to do half the loads.*/ - movq mm1,[0x10+BUF] - movq mm2,[0x20+BUF] - movq mm3,[0x30+BUF] - movq mm0,[0x00+BUF] - /*We split out the stages here so we can save the DC coefficient in the - middle.*/ - OC_HADAMARD_AB_8x4 - OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) - movd RET,mm1 - OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) - /*Up to this point, everything fit in 16 bits (8 input + 1 for the - difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 - for the factor of two we dropped + 3 for the vertical accumulation). - Now we finally have to promote things to dwords. - We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long - latency of pmaddwd by starting the next series of loads now.*/ - pmaddwd mm0,mm7 - movq mm1,[0x50+BUF] - movq mm5,[0x58+BUF] - movq mm2,[0x60+BUF] - movq mm4,mm0 - movq mm6,[0x68+BUF] - punpckhdq mm0,mm0 - movq mm3,[0x70+BUF] - paddd mm4,mm0 - movq mm7,[0x78+BUF] - movd RET2,mm4 - movq mm0,[0x40+BUF] - movq mm4,[0x48+BUF] - OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) - pmaddwd mm0,mm7 - /*We assume that the DC coefficient is always positive (which is true, - because the input to the INTRA transform was not a difference).*/ - movzx RET,RET_WORD - add RET2,RET2 - sub RET2,RET - movq mm4,mm0 - punpckhdq mm0,mm0 - paddd mm4,mm0 - movd RET,mm4 - lea RET,[-64+RET2+RET*2] - mov [ret1],RET -#undef SRC -#undef SRC4 -#undef BUF -#undef RET -#undef RET_WORD -#undef RET2 -#undef YSTRIDE -#undef YSTRIDE3 - } - return ret1; -} - -void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64], - const unsigned char *_src, const unsigned char *_ref,int _ystride){ - int i; - __asm pxor mm7,mm7 - for(i=4;i-->0;){ - __asm{ -#define SRC edx -#define YSTRIDE esi -#define RESIDUE eax -#define REF ecx - mov YSTRIDE,_ystride - mov RESIDUE,_residue - mov SRC,_src - mov REF,_ref - /*mm0=[src]*/ - movq mm0,[SRC] - /*mm1=[ref]*/ - movq mm1,[REF] - /*mm4=[src+ystride]*/ - movq 
mm4,[SRC+YSTRIDE] - /*mm5=[ref+ystride]*/ - movq mm5,[REF+YSTRIDE] - /*Compute [src]-[ref].*/ - movq mm2,mm0 - punpcklbw mm0,mm7 - movq mm3,mm1 - punpckhbw mm2,mm7 - punpcklbw mm1,mm7 - punpckhbw mm3,mm7 - psubw mm0,mm1 - psubw mm2,mm3 - /*Compute [src+ystride]-[ref+ystride].*/ - movq mm1,mm4 - punpcklbw mm4,mm7 - movq mm3,mm5 - punpckhbw mm1,mm7 - lea SRC,[SRC+YSTRIDE*2] - punpcklbw mm5,mm7 - lea REF,[REF+YSTRIDE*2] - punpckhbw mm3,mm7 - psubw mm4,mm5 - psubw mm1,mm3 - /*Write the answer out.*/ - movq [RESIDUE+0x00],mm0 - movq [RESIDUE+0x08],mm2 - movq [RESIDUE+0x10],mm4 - movq [RESIDUE+0x18],mm1 - lea RESIDUE,[RESIDUE+0x20] - mov _residue,RESIDUE - mov _src,SRC - mov _ref,REF -#undef SRC -#undef YSTRIDE -#undef RESIDUE -#undef REF - } - } -} - -void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64], - const unsigned char *_src,int _ystride){ - __asm{ -#define YSTRIDE edx -#define YSTRIDE3 edi -#define RESIDUE ecx -#define SRC eax - mov YSTRIDE,_ystride - mov RESIDUE,_residue - mov SRC,_src - /*mm0=[src]*/ - movq mm0,[SRC] - /*mm1=[src+ystride]*/ - movq mm1,[SRC+YSTRIDE] - /*mm6={-1}x4*/ - pcmpeqw mm6,mm6 - /*mm2=[src+2*ystride]*/ - movq mm2,[SRC+YSTRIDE*2] - /*[ystride3]=3*[ystride]*/ - lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] - /*mm6={1}x4*/ - psllw mm6,15 - /*mm3=[src+3*ystride]*/ - movq mm3,[SRC+YSTRIDE3] - /*mm6={128}x4*/ - psrlw mm6,8 - /*mm7=0*/ - pxor mm7,mm7 - /*[src]=[src]+4*[ystride]*/ - lea SRC,[SRC+YSTRIDE*4] - /*Compute [src]-128 and [src+ystride]-128*/ - movq mm4,mm0 - punpcklbw mm0,mm7 - movq mm5,mm1 - punpckhbw mm4,mm7 - psubw mm0,mm6 - punpcklbw mm1,mm7 - psubw mm4,mm6 - punpckhbw mm5,mm7 - psubw mm1,mm6 - psubw mm5,mm6 - /*Write the answer out.*/ - movq [RESIDUE+0x00],mm0 - movq [RESIDUE+0x08],mm4 - movq [RESIDUE+0x10],mm1 - movq [RESIDUE+0x18],mm5 - /*mm0=[src+4*ystride]*/ - movq mm0,[SRC] - /*mm1=[src+5*ystride]*/ - movq mm1,[SRC+YSTRIDE] - /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/ - movq mm4,mm2 - punpcklbw mm2,mm7 - movq mm5,mm3 - 
punpckhbw mm4,mm7 - psubw mm2,mm6 - punpcklbw mm3,mm7 - psubw mm4,mm6 - punpckhbw mm5,mm7 - psubw mm3,mm6 - psubw mm5,mm6 - /*Write the answer out.*/ - movq [RESIDUE+0x20],mm2 - movq [RESIDUE+0x28],mm4 - movq [RESIDUE+0x30],mm3 - movq [RESIDUE+0x38],mm5 - /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ - movq mm2,[SRC+YSTRIDE*2] - movq mm3,[SRC+YSTRIDE3] - movq mm4,mm0 - punpcklbw mm0,mm7 - movq mm5,mm1 - punpckhbw mm4,mm7 - psubw mm0,mm6 - punpcklbw mm1,mm7 - psubw mm4,mm6 - punpckhbw mm5,mm7 - psubw mm1,mm6 - psubw mm5,mm6 - /*Write the answer out.*/ - movq [RESIDUE+0x40],mm0 - movq [RESIDUE+0x48],mm4 - movq [RESIDUE+0x50],mm1 - movq [RESIDUE+0x58],mm5 - /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ - movq mm4,mm2 - punpcklbw mm2,mm7 - movq mm5,mm3 - punpckhbw mm4,mm7 - psubw mm2,mm6 - punpcklbw mm3,mm7 - psubw mm4,mm6 - punpckhbw mm5,mm7 - psubw mm3,mm6 - psubw mm5,mm6 - /*Write the answer out.*/ - movq [RESIDUE+0x60],mm2 - movq [RESIDUE+0x68],mm4 - movq [RESIDUE+0x70],mm3 - movq [RESIDUE+0x78],mm5 -#undef YSTRIDE -#undef YSTRIDE3 -#undef RESIDUE -#undef SRC - } -} - -void oc_enc_frag_copy2_mmxext(unsigned char *_dst, - const unsigned char *_src1,const unsigned char *_src2,int _ystride){ - oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride); -} - -#endif +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ + + ********************************************************************/ +#include +#include "x86enc.h" + +#if defined(OC_X86_ASM) + +unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride){ + ptrdiff_t ret; + __asm{ +#define SRC esi +#define REF edx +#define YSTRIDE ecx +#define YSTRIDE3 edi + mov YSTRIDE,_ystride + mov SRC,_src + mov REF,_ref + /*Load the first 4 rows of each block.*/ + movq mm0,[SRC] + movq mm1,[REF] + movq mm2,[SRC][YSTRIDE] + movq mm3,[REF][YSTRIDE] + lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] + movq mm4,[SRC+YSTRIDE*2] + movq mm5,[REF+YSTRIDE*2] + movq mm6,[SRC+YSTRIDE3] + movq mm7,[REF+YSTRIDE3] + /*Compute their SADs and add them in mm0*/ + psadbw mm0,mm1 + psadbw mm2,mm3 + lea SRC,[SRC+YSTRIDE*4] + paddw mm0,mm2 + lea REF,[REF+YSTRIDE*4] + /*Load the next 3 rows as registers become available.*/ + movq mm2,[SRC] + movq mm3,[REF] + psadbw mm4,mm5 + psadbw mm6,mm7 + paddw mm0,mm4 + movq mm5,[REF+YSTRIDE] + movq mm4,[SRC+YSTRIDE] + paddw mm0,mm6 + movq mm7,[REF+YSTRIDE*2] + movq mm6,[SRC+YSTRIDE*2] + /*Start adding their SADs to mm0*/ + psadbw mm2,mm3 + psadbw mm4,mm5 + paddw mm0,mm2 + psadbw mm6,mm7 + /*Load last row as registers become available.*/ + movq mm2,[SRC+YSTRIDE3] + movq mm3,[REF+YSTRIDE3] + /*And finish adding up their SADs.*/ + paddw mm0,mm4 + psadbw mm2,mm3 + paddw mm0,mm6 + paddw mm0,mm2 + movd [ret],mm0 +#undef SRC +#undef REF +#undef YSTRIDE +#undef YSTRIDE3 + } + return (unsigned)ret; +} + +unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride,unsigned _thresh){ + /*Early termination is for suckers.*/ + return oc_enc_frag_sad_mmxext(_src,_ref,_ystride); +} + +#define 
OC_SAD2_LOOP __asm{ \ + /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \ + pavgb computes (mm0+mm1+1>>1). \ + The latter is exactly 1 too large when the low bit of two corresponding \ + bytes is only set in one of them. \ + Therefore we pxor the operands, pand to mask out the low bits, and psubb to \ + correct the output of pavgb.*/ \ + __asm movq mm6,mm0 \ + __asm lea REF1,[REF1+YSTRIDE*2] \ + __asm pxor mm0,mm1 \ + __asm pavgb mm6,mm1 \ + __asm lea REF2,[REF2+YSTRIDE*2] \ + __asm movq mm1,mm2 \ + __asm pand mm0,mm7 \ + __asm pavgb mm2,mm3 \ + __asm pxor mm1,mm3 \ + __asm movq mm3,[REF2+YSTRIDE] \ + __asm psubb mm6,mm0 \ + __asm movq mm0,[REF1] \ + __asm pand mm1,mm7 \ + __asm psadbw mm4,mm6 \ + __asm movd mm6,RET \ + __asm psubb mm2,mm1 \ + __asm movq mm1,[REF2] \ + __asm lea SRC,[SRC+YSTRIDE*2] \ + __asm psadbw mm5,mm2 \ + __asm movq mm2,[REF1+YSTRIDE] \ + __asm paddw mm5,mm4 \ + __asm movq mm4,[SRC] \ + __asm paddw mm6,mm5 \ + __asm movq mm5,[SRC+YSTRIDE] \ + __asm movd RET,mm6 \ +} + +/*Same as above, but does not pre-load the next two rows.*/ +#define OC_SAD2_TAIL __asm{ \ + __asm movq mm6,mm0 \ + __asm pavgb mm0,mm1 \ + __asm pxor mm6,mm1 \ + __asm movq mm1,mm2 \ + __asm pand mm6,mm7 \ + __asm pavgb mm2,mm3 \ + __asm pxor mm1,mm3 \ + __asm psubb mm0,mm6 \ + __asm pand mm1,mm7 \ + __asm psadbw mm4,mm0 \ + __asm psubb mm2,mm1 \ + __asm movd mm6,RET \ + __asm psadbw mm5,mm2 \ + __asm paddw mm5,mm4 \ + __asm paddw mm6,mm5 \ + __asm movd RET,mm6 \ +} + +unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, + unsigned _thresh){ + ptrdiff_t ret; + __asm{ +#define REF1 ecx +#define REF2 edi +#define YSTRIDE esi +#define SRC edx +#define RET eax + mov YSTRIDE,_ystride + mov SRC,_src + mov REF1,_ref1 + mov REF2,_ref2 + movq mm0,[REF1] + movq mm1,[REF2] + movq mm2,[REF1+YSTRIDE] + movq mm3,[REF2+YSTRIDE] + xor RET,RET + movq mm4,[SRC] + pxor mm7,mm7 + pcmpeqb 
mm6,mm6 + movq mm5,[SRC+YSTRIDE] + psubb mm7,mm6 + OC_SAD2_LOOP + OC_SAD2_LOOP + OC_SAD2_LOOP + OC_SAD2_TAIL + mov [ret],RET +#undef REF1 +#undef REF2 +#undef YSTRIDE +#undef SRC +#undef RET + } + return (unsigned)ret; +} + +/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their + 16-bit difference in mm0...mm7.*/ +#define OC_LOAD_SUB_8x4(_off) __asm{ \ + __asm movd mm0,[_off+SRC] \ + __asm movd mm4,[_off+REF] \ + __asm movd mm1,[_off+SRC+SRC_YSTRIDE] \ + __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ + __asm movd mm5,[_off+REF+REF_YSTRIDE] \ + __asm lea REF,[REF+REF_YSTRIDE*2] \ + __asm movd mm2,[_off+SRC] \ + __asm movd mm7,[_off+REF] \ + __asm movd mm3,[_off+SRC+SRC_YSTRIDE] \ + __asm movd mm6,[_off+REF+REF_YSTRIDE] \ + __asm punpcklbw mm0,mm4 \ + __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ + __asm punpcklbw mm4,mm4 \ + __asm lea REF,[REF+REF_YSTRIDE*2] \ + __asm psubw mm0,mm4 \ + __asm movd mm4,[_off+SRC] \ + __asm movq [_off*2+BUF],mm0 \ + __asm movd mm0,[_off+REF] \ + __asm punpcklbw mm1,mm5 \ + __asm punpcklbw mm5,mm5 \ + __asm psubw mm1,mm5 \ + __asm movd mm5,[_off+SRC+SRC_YSTRIDE] \ + __asm punpcklbw mm2,mm7 \ + __asm punpcklbw mm7,mm7 \ + __asm psubw mm2,mm7 \ + __asm movd mm7,[_off+REF+REF_YSTRIDE] \ + __asm punpcklbw mm3,mm6 \ + __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ + __asm punpcklbw mm6,mm6 \ + __asm psubw mm3,mm6 \ + __asm movd mm6,[_off+SRC] \ + __asm punpcklbw mm4,mm0 \ + __asm lea REF,[REF+REF_YSTRIDE*2] \ + __asm punpcklbw mm0,mm0 \ + __asm lea SRC,[SRC+SRC_YSTRIDE*2] \ + __asm psubw mm4,mm0 \ + __asm movd mm0,[_off+REF] \ + __asm punpcklbw mm5,mm7 \ + __asm neg SRC_YSTRIDE \ + __asm punpcklbw mm7,mm7 \ + __asm psubw mm5,mm7 \ + __asm movd mm7,[_off+SRC+SRC_YSTRIDE] \ + __asm punpcklbw mm6,mm0 \ + __asm lea REF,[REF+REF_YSTRIDE*2] \ + __asm punpcklbw mm0,mm0 \ + __asm neg REF_YSTRIDE \ + __asm psubw mm6,mm0 \ + __asm movd mm0,[_off+REF+REF_YSTRIDE] \ + __asm lea SRC,[SRC+SRC_YSTRIDE*8] \ + __asm punpcklbw mm7,mm0 \ + __asm neg SRC_YSTRIDE \ + 
__asm punpcklbw mm0,mm0 \ + __asm lea REF,[REF+REF_YSTRIDE*8] \ + __asm psubw mm7,mm0 \ + __asm neg REF_YSTRIDE \ + __asm movq mm0,[_off*2+BUF] \ +} + +/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/ +#define OC_LOAD_8x4(_off) __asm{ \ + __asm movd mm0,[_off+SRC] \ + __asm movd mm1,[_off+SRC+YSTRIDE] \ + __asm movd mm2,[_off+SRC+YSTRIDE*2] \ + __asm pxor mm7,mm7 \ + __asm movd mm3,[_off+SRC+YSTRIDE3] \ + __asm punpcklbw mm0,mm7 \ + __asm movd mm4,[_off+SRC4] \ + __asm punpcklbw mm1,mm7 \ + __asm movd mm5,[_off+SRC4+YSTRIDE] \ + __asm punpcklbw mm2,mm7 \ + __asm movd mm6,[_off+SRC4+YSTRIDE*2] \ + __asm punpcklbw mm3,mm7 \ + __asm movd mm7,[_off+SRC4+YSTRIDE3] \ + __asm punpcklbw mm4,mm4 \ + __asm punpcklbw mm5,mm5 \ + __asm psrlw mm4,8 \ + __asm psrlw mm5,8 \ + __asm punpcklbw mm6,mm6 \ + __asm punpcklbw mm7,mm7 \ + __asm psrlw mm6,8 \ + __asm psrlw mm7,8 \ +} + +/*Performs the first two stages of an 8-point 1-D Hadamard transform. + The transform is performed in place, except that outputs 0-3 are swapped with + outputs 4-7. + Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to + perform this stage in place with no temporary registers).*/ +#define OC_HADAMARD_AB_8x4 __asm{ \ + /*Stage A: \ + Outputs 0-3 are swapped with 4-7 here.*/ \ + __asm paddw mm5,mm1 \ + __asm paddw mm6,mm2 \ + __asm paddw mm1,mm1 \ + __asm paddw mm2,mm2 \ + __asm psubw mm1,mm5 \ + __asm psubw mm2,mm6 \ + __asm paddw mm7,mm3 \ + __asm paddw mm4,mm0 \ + __asm paddw mm3,mm3 \ + __asm paddw mm0,mm0 \ + __asm psubw mm3,mm7 \ + __asm psubw mm0,mm4 \ + /*Stage B:*/ \ + __asm paddw mm0,mm2 \ + __asm paddw mm1,mm3 \ + __asm paddw mm4,mm6 \ + __asm paddw mm5,mm7 \ + __asm paddw mm2,mm2 \ + __asm paddw mm3,mm3 \ + __asm paddw mm6,mm6 \ + __asm paddw mm7,mm7 \ + __asm psubw mm2,mm0 \ + __asm psubw mm3,mm1 \ + __asm psubw mm6,mm4 \ + __asm psubw mm7,mm5 \ +} + +/*Performs the last stage of an 8-point 1-D Hadamard transform in place. 
+ Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in + place with no temporary registers).*/ +#define OC_HADAMARD_C_8x4 __asm{ \ + /*Stage C:*/ \ + __asm paddw mm0,mm1 \ + __asm paddw mm2,mm3 \ + __asm paddw mm4,mm5 \ + __asm paddw mm6,mm7 \ + __asm paddw mm1,mm1 \ + __asm paddw mm3,mm3 \ + __asm paddw mm5,mm5 \ + __asm paddw mm7,mm7 \ + __asm psubw mm1,mm0 \ + __asm psubw mm3,mm2 \ + __asm psubw mm5,mm4 \ + __asm psubw mm7,mm6 \ +} + +/*Performs an 8-point 1-D Hadamard transform. + The transform is performed in place, except that outputs 0-3 are swapped with + outputs 4-7. + Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform + in place with no temporary registers).*/ +#define OC_HADAMARD_8x4 __asm{ \ + OC_HADAMARD_AB_8x4 \ + OC_HADAMARD_C_8x4 \ +} + +/*Performs the first part of the final stage of the Hadamard transform and + summing of absolute values. + At the end of this part, mm1 will contain the DC coefficient of the + transform.*/ +#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \ + /*We use the fact that \ + (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \ + to merge the final butterfly with the abs and the first stage of \ + accumulation. \ + Thus we can avoid using pabsw, which is not available until SSSE3. \ + Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \ + implementation would be (3+3)*8+7=55 instructions (+4 for spilling \ + registers). \ + Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). 
\ + This implementation is only 26 (+4 for spilling registers).*/ \ + __asm movq [_r7+BUF],mm7 \ + __asm movq [_r6+BUF],mm6 \ + /*mm7={0x7FFF}x4 \ + mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \ + __asm pcmpeqb mm7,mm7 \ + __asm movq mm6,mm0 \ + __asm psrlw mm7,1 \ + __asm paddw mm6,mm1 \ + __asm pmaxsw mm0,mm1 \ + __asm paddsw mm6,mm7 \ + __asm psubw mm0,mm6 \ + /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \ + mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \ + __asm movq mm6,mm2 \ + __asm movq mm1,mm4 \ + __asm pmaxsw mm2,mm3 \ + __asm pmaxsw mm4,mm5 \ + __asm paddw mm6,mm3 \ + __asm paddw mm1,mm5 \ + __asm movq mm3,[_r7+BUF] \ +} + +/*Performs the second part of the final stage of the Hadamard transform and + summing of absolute values.*/ +#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \ + __asm paddsw mm6,mm7 \ + __asm movq mm5,[_r6+BUF] \ + __asm paddsw mm1,mm7 \ + __asm psubw mm2,mm6 \ + __asm psubw mm4,mm1 \ + /*mm7={1}x4 (needed for the horizontal add that follows) \ + mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \ + __asm movq mm6,mm3 \ + __asm pmaxsw mm3,mm5 \ + __asm paddw mm0,mm2 \ + __asm paddw mm6,mm5 \ + __asm paddw mm0,mm4 \ + __asm paddsw mm6,mm7 \ + __asm paddw mm0,mm3 \ + __asm psrlw mm7,14 \ + __asm psubw mm0,mm6 \ +} + +/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the + absolute value of each component, and accumulates everything into mm0. + This is the only portion of SATD which requires MMXEXT (we could use plain + MMX, but it takes 4 instructions and an extra register to work around the + lack of a pmaxsw, which is a pretty serious penalty).*/ +#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \ + OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ + OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ +} + +/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each + component, and accumulates everything into mm0. 
+ Note that mm0 will have an extra 4 added to each column, and that after + removing this value, the remainder will be half the conventional value.*/ +#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \ + OC_HADAMARD_AB_8x4 \ + OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \ +} + +/*Performs two 4x4 transposes (mostly) in place. + On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7} + contains rows {a,b,c,d}. + On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and + {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/ +#define OC_TRANSPOSE_4x4x2(_off) __asm{ \ + /*First 4x4 transpose:*/ \ + __asm movq [0x10+_off+BUF],mm5 \ + /*mm0 = e3 e2 e1 e0 \ + mm1 = f3 f2 f1 f0 \ + mm2 = g3 g2 g1 g0 \ + mm3 = h3 h2 h1 h0*/ \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm2,mm3 \ + __asm punpckhwd mm5,mm3 \ + __asm movq mm3,mm0 \ + __asm punpcklwd mm0,mm1 \ + __asm punpckhwd mm3,mm1 \ + /*mm0 = f1 e1 f0 e0 \ + mm3 = f3 e3 f2 e2 \ + mm2 = h1 g1 h0 g0 \ + mm5 = h3 g3 h2 g2*/ \ + __asm movq mm1,mm0 \ + __asm punpckldq mm0,mm2 \ + __asm punpckhdq mm1,mm2 \ + __asm movq mm2,mm3 \ + __asm punpckhdq mm3,mm5 \ + __asm movq [0x40+_off+BUF],mm0 \ + __asm punpckldq mm2,mm5 \ + /*mm0 = h0 g0 f0 e0 \ + mm1 = h1 g1 f1 e1 \ + mm2 = h2 g2 f2 e2 \ + mm3 = h3 g3 f3 e3*/ \ + __asm movq mm5,[0x10+_off+BUF] \ + /*Second 4x4 transpose:*/ \ + /*mm4 = a3 a2 a1 a0 \ + mm5 = b3 b2 b1 b0 \ + mm6 = c3 c2 c1 c0 \ + mm7 = d3 d2 d1 d0*/ \ + __asm movq mm0,mm6 \ + __asm punpcklwd mm6,mm7 \ + __asm movq [0x50+_off+BUF],mm1 \ + __asm punpckhwd mm0,mm7 \ + __asm movq mm7,mm4 \ + __asm punpcklwd mm4,mm5 \ + __asm movq [0x60+_off+BUF],mm2 \ + __asm punpckhwd mm7,mm5 \ + /*mm4 = b1 a1 b0 a0 \ + mm7 = b3 a3 b2 a2 \ + mm6 = d1 c1 d0 c0 \ + mm0 = d3 c3 d2 c2*/ \ + __asm movq mm5,mm4 \ + __asm punpckldq mm4,mm6 \ + __asm movq [0x70+_off+BUF],mm3 \ + __asm punpckhdq mm5,mm6 \ + __asm movq mm6,mm7 \ + __asm punpckhdq mm7,mm0 \ + __asm punpckldq mm6,mm0 \ + /*mm4 = d0 c0 b0 a0 \ + mm5 = d1 
c1 b1 a1 \ + mm6 = d2 c2 b2 a2 \ + mm7 = d3 c3 b3 a3*/ \ +} + +static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, + int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){ + OC_ALIGN8(ogg_int16_t buf[64]); + ogg_int16_t *bufp; + unsigned ret1; + unsigned ret2; + bufp=buf; + __asm{ +#define SRC esi +#define REF eax +#define SRC_YSTRIDE ecx +#define REF_YSTRIDE edx +#define BUF edi +#define RET eax +#define RET2 edx + mov SRC,_src + mov SRC_YSTRIDE,_src_ystride + mov REF,_ref + mov REF_YSTRIDE,_ref_ystride + mov BUF,bufp + OC_LOAD_SUB_8x4(0x00) + OC_HADAMARD_8x4 + OC_TRANSPOSE_4x4x2(0x00) + /*Finish swapping out this 8x4 block to make room for the next one. + mm0...mm3 have been swapped out already.*/ + movq [0x00+BUF],mm4 + movq [0x10+BUF],mm5 + movq [0x20+BUF],mm6 + movq [0x30+BUF],mm7 + OC_LOAD_SUB_8x4(0x04) + OC_HADAMARD_8x4 + OC_TRANSPOSE_4x4x2(0x08) + /*Here the first 4x4 block of output from the last transpose is the second + 4x4 block of input for the next transform. + We have cleverly arranged that it already be in the appropriate place, so + we only have to do half the loads.*/ + movq mm1,[0x10+BUF] + movq mm2,[0x20+BUF] + movq mm3,[0x30+BUF] + movq mm0,[0x00+BUF] + OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38) + /*Up to this point, everything fit in 16 bits (8 input + 1 for the + difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 + for the factor of two we dropped + 3 for the vertical accumulation). + Now we finally have to promote things to dwords. 
+ We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long + latency of pmaddwd by starting the next series of loads now.*/ + mov RET2,_thresh + pmaddwd mm0,mm7 + movq mm1,[0x50+BUF] + movq mm5,[0x58+BUF] + movq mm4,mm0 + movq mm2,[0x60+BUF] + punpckhdq mm0,mm0 + movq mm6,[0x68+BUF] + paddd mm4,mm0 + movq mm3,[0x70+BUF] + movd RET,mm4 + movq mm7,[0x78+BUF] + /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4 + added to them, and a factor of two removed; correct the final sum here.*/ + lea RET,[RET+RET-32] + movq mm0,[0x40+BUF] + cmp RET,RET2 + movq mm4,[0x48+BUF] + jae at_end + OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) + pmaddwd mm0,mm7 + /*There isn't much to stick in here to hide the latency this time, but the + alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose + latency is even worse.*/ + sub RET,32 + movq mm4,mm0 + punpckhdq mm0,mm0 + paddd mm4,mm0 + movd RET2,mm4 + lea RET,[RET+RET2*2] + align 16 +at_end: + mov ret1,RET +#undef SRC +#undef REF +#undef SRC_YSTRIDE +#undef REF_YSTRIDE +#undef BUF +#undef RET +#undef RET2 + } + return ret1; +} + +unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref,int _ystride,unsigned _thresh){ + return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh); +} + + +/*Our internal implementation of frag_copy2 takes an extra stride parameter so + we can share code with oc_enc_frag_satd2_thresh_mmxext().*/ +static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, + const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){ + __asm{ + /*Load the first 3 rows.*/ +#define DST_YSTRIDE edi +#define SRC_YSTRIDE esi +#define DST eax +#define SRC1 edx +#define SRC2 ecx + mov DST_YSTRIDE,_dst_ystride + mov SRC_YSTRIDE,_src_ystride + mov DST,_dst + mov SRC1,_src1 + mov SRC2,_src2 + movq mm0,[SRC1] + movq mm1,[SRC2] + movq mm2,[SRC1+SRC_YSTRIDE] + lea SRC1,[SRC1+SRC_YSTRIDE*2] + movq mm3,[SRC2+SRC_YSTRIDE] + lea 
SRC2,[SRC2+SRC_YSTRIDE*2] + pxor mm7,mm7 + movq mm4,[SRC1] + pcmpeqb mm6,mm6 + movq mm5,[SRC2] + /*mm7={1}x8.*/ + psubb mm7,mm6 + /*Start averaging mm0 and mm1 into mm6.*/ + movq mm6,mm0 + pxor mm0,mm1 + pavgb mm6,mm1 + /*mm1 is free, start averaging mm3 into mm2 using mm1.*/ + movq mm1,mm2 + pand mm0,mm7 + pavgb mm2,mm3 + pxor mm1,mm3 + /*mm3 is free.*/ + psubb mm6,mm0 + /*mm0 is free, start loading the next row.*/ + movq mm0,[SRC1+SRC_YSTRIDE] + /*Start averaging mm5 and mm4 using mm3.*/ + movq mm3,mm4 + /*mm6 [row 0] is done; write it out.*/ + movq [DST],mm6 + pand mm1,mm7 + pavgb mm4,mm5 + psubb mm2,mm1 + /*mm1 is free, continue loading the next row.*/ + movq mm1,[SRC2+SRC_YSTRIDE] + pxor mm3,mm5 + lea SRC1,[SRC1+SRC_YSTRIDE*2] + /*mm2 [row 1] is done; write it out.*/ + movq [DST+DST_YSTRIDE],mm2 + pand mm3,mm7 + /*Start loading the next row.*/ + movq mm2,[SRC1] + lea DST,[DST+DST_YSTRIDE*2] + psubb mm4,mm3 + lea SRC2,[SRC2+SRC_YSTRIDE*2] + /*mm4 [row 2] is done; write it out.*/ + movq [DST],mm4 + /*Continue loading the next row.*/ + movq mm3,[SRC2] + /*Start averaging mm0 and mm1 into mm6.*/ + movq mm6,mm0 + pxor mm0,mm1 + /*Start loading the next row.*/ + movq mm4,[SRC1+SRC_YSTRIDE] + pavgb mm6,mm1 + /*mm1 is free; start averaging mm3 into mm2 using mm1.*/ + movq mm1,mm2 + pand mm0,mm7 + /*Continue loading the next row.*/ + movq mm5,[SRC2+SRC_YSTRIDE] + pavgb mm2,mm3 + lea SRC1,[SRC1+SRC_YSTRIDE*2] + pxor mm1,mm3 + /*mm3 is free.*/ + psubb mm6,mm0 + /*mm0 is free, start loading the next row.*/ + movq mm0,[SRC1] + /*Start averaging mm5 into mm4 using mm3.*/ + movq mm3,mm4 + /*mm6 [row 3] is done; write it out.*/ + movq [DST+DST_YSTRIDE],mm6 + pand mm1,mm7 + lea SRC2,[SRC2+SRC_YSTRIDE*2] + pavgb mm4,mm5 + lea DST,[DST+DST_YSTRIDE*2] + psubb mm2,mm1 + /*mm1 is free; continue loading the next row.*/ + movq mm1,[SRC2] + pxor mm3,mm5 + /*mm2 [row 4] is done; write it out.*/ + movq [DST],mm2 + pand mm3,mm7 + /*Start loading the next row.*/ + movq 
mm2,[SRC1+SRC_YSTRIDE] + psubb mm4,mm3 + /*Start averaging mm0 and mm1 into mm6.*/ + movq mm6,mm0 + /*Continue loading the next row.*/ + movq mm3,[SRC2+SRC_YSTRIDE] + /*mm4 [row 5] is done; write it out.*/ + movq [DST+DST_YSTRIDE],mm4 + pxor mm0,mm1 + pavgb mm6,mm1 + /*mm4 is free; start averaging mm3 into mm2 using mm4.*/ + movq mm4,mm2 + pand mm0,mm7 + pavgb mm2,mm3 + pxor mm4,mm3 + lea DST,[DST+DST_YSTRIDE*2] + psubb mm6,mm0 + pand mm4,mm7 + /*mm6 [row 6] is done, write it out.*/ + movq [DST],mm6 + psubb mm2,mm4 + /*mm2 [row 7] is done, write it out.*/ + movq [DST+DST_YSTRIDE],mm2 +#undef SRC1 +#undef SRC2 +#undef SRC_YSTRIDE +#undef DST_YSTRIDE +#undef DST + } +} + +unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, + unsigned _thresh){ + OC_ALIGN8(unsigned char ref[64]); + oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); + return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh); +} + +unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, + int _ystride){ + OC_ALIGN8(ogg_int16_t buf[64]); + ogg_int16_t *bufp; + unsigned ret1; + unsigned ret2; + bufp=buf; + __asm{ +#define SRC eax +#define SRC4 esi +#define BUF edi +#define RET eax +#define RET_WORD ax +#define RET2 ecx +#define YSTRIDE edx +#define YSTRIDE3 ecx + mov SRC,_src + mov BUF,bufp + mov YSTRIDE,_ystride + /* src4 = src+4*ystride */ + lea SRC4,[SRC+YSTRIDE*4] + /* ystride3 = 3*ystride */ + lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] + OC_LOAD_8x4(0x00) + OC_HADAMARD_8x4 + OC_TRANSPOSE_4x4x2(0x00) + /*Finish swapping out this 8x4 block to make room for the next one. + mm0...mm3 have been swapped out already.*/ + movq [0x00+BUF],mm4 + movq [0x10+BUF],mm5 + movq [0x20+BUF],mm6 + movq [0x30+BUF],mm7 + OC_LOAD_8x4(0x04) + OC_HADAMARD_8x4 + OC_TRANSPOSE_4x4x2(0x08) + /*Here the first 4x4 block of output from the last transpose is the second + 4x4 block of input for the next transform. 
+ We have cleverly arranged that it already be in the appropriate place, so + we only have to do half the loads.*/ + movq mm1,[0x10+BUF] + movq mm2,[0x20+BUF] + movq mm3,[0x30+BUF] + movq mm0,[0x00+BUF] + /*We split out the stages here so we can save the DC coefficient in the + middle.*/ + OC_HADAMARD_AB_8x4 + OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) + movd RET,mm1 + OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) + /*Up to this point, everything fit in 16 bits (8 input + 1 for the + difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 + for the factor of two we dropped + 3 for the vertical accumulation). + Now we finally have to promote things to dwords. + We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long + latency of pmaddwd by starting the next series of loads now.*/ + pmaddwd mm0,mm7 + movq mm1,[0x50+BUF] + movq mm5,[0x58+BUF] + movq mm2,[0x60+BUF] + movq mm4,mm0 + movq mm6,[0x68+BUF] + punpckhdq mm0,mm0 + movq mm3,[0x70+BUF] + paddd mm4,mm0 + movq mm7,[0x78+BUF] + movd RET2,mm4 + movq mm0,[0x40+BUF] + movq mm4,[0x48+BUF] + OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) + pmaddwd mm0,mm7 + /*We assume that the DC coefficient is always positive (which is true, + because the input to the INTRA transform was not a difference).*/ + movzx RET,RET_WORD + add RET2,RET2 + sub RET2,RET + movq mm4,mm0 + punpckhdq mm0,mm0 + paddd mm4,mm0 + movd RET,mm4 + lea RET,[-64+RET2+RET*2] + mov [ret1],RET +#undef SRC +#undef SRC4 +#undef BUF +#undef RET +#undef RET_WORD +#undef RET2 +#undef YSTRIDE +#undef YSTRIDE3 + } + return ret1; +} + +void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64], + const unsigned char *_src, const unsigned char *_ref,int _ystride){ + int i; + __asm pxor mm7,mm7 + for(i=4;i-->0;){ + __asm{ +#define SRC edx +#define YSTRIDE esi +#define RESIDUE eax +#define REF ecx + mov YSTRIDE,_ystride + mov RESIDUE,_residue + mov SRC,_src + mov REF,_ref + /*mm0=[src]*/ + movq mm0,[SRC] + /*mm1=[ref]*/ + movq mm1,[REF] + /*mm4=[src+ystride]*/ + movq 
mm4,[SRC+YSTRIDE] + /*mm5=[ref+ystride]*/ + movq mm5,[REF+YSTRIDE] + /*Compute [src]-[ref].*/ + movq mm2,mm0 + punpcklbw mm0,mm7 + movq mm3,mm1 + punpckhbw mm2,mm7 + punpcklbw mm1,mm7 + punpckhbw mm3,mm7 + psubw mm0,mm1 + psubw mm2,mm3 + /*Compute [src+ystride]-[ref+ystride].*/ + movq mm1,mm4 + punpcklbw mm4,mm7 + movq mm3,mm5 + punpckhbw mm1,mm7 + lea SRC,[SRC+YSTRIDE*2] + punpcklbw mm5,mm7 + lea REF,[REF+YSTRIDE*2] + punpckhbw mm3,mm7 + psubw mm4,mm5 + psubw mm1,mm3 + /*Write the answer out.*/ + movq [RESIDUE+0x00],mm0 + movq [RESIDUE+0x08],mm2 + movq [RESIDUE+0x10],mm4 + movq [RESIDUE+0x18],mm1 + lea RESIDUE,[RESIDUE+0x20] + mov _residue,RESIDUE + mov _src,SRC + mov _ref,REF +#undef SRC +#undef YSTRIDE +#undef RESIDUE +#undef REF + } + } +} + +void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64], + const unsigned char *_src,int _ystride){ + __asm{ +#define YSTRIDE edx +#define YSTRIDE3 edi +#define RESIDUE ecx +#define SRC eax + mov YSTRIDE,_ystride + mov RESIDUE,_residue + mov SRC,_src + /*mm0=[src]*/ + movq mm0,[SRC] + /*mm1=[src+ystride]*/ + movq mm1,[SRC+YSTRIDE] + /*mm6={-1}x4*/ + pcmpeqw mm6,mm6 + /*mm2=[src+2*ystride]*/ + movq mm2,[SRC+YSTRIDE*2] + /*[ystride3]=3*[ystride]*/ + lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] + /*mm6={1}x4*/ + psllw mm6,15 + /*mm3=[src+3*ystride]*/ + movq mm3,[SRC+YSTRIDE3] + /*mm6={128}x4*/ + psrlw mm6,8 + /*mm7=0*/ + pxor mm7,mm7 + /*[src]=[src]+4*[ystride]*/ + lea SRC,[SRC+YSTRIDE*4] + /*Compute [src]-128 and [src+ystride]-128*/ + movq mm4,mm0 + punpcklbw mm0,mm7 + movq mm5,mm1 + punpckhbw mm4,mm7 + psubw mm0,mm6 + punpcklbw mm1,mm7 + psubw mm4,mm6 + punpckhbw mm5,mm7 + psubw mm1,mm6 + psubw mm5,mm6 + /*Write the answer out.*/ + movq [RESIDUE+0x00],mm0 + movq [RESIDUE+0x08],mm4 + movq [RESIDUE+0x10],mm1 + movq [RESIDUE+0x18],mm5 + /*mm0=[src+4*ystride]*/ + movq mm0,[SRC] + /*mm1=[src+5*ystride]*/ + movq mm1,[SRC+YSTRIDE] + /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/ + movq mm4,mm2 + punpcklbw mm2,mm7 + movq mm5,mm3 + 
punpckhbw mm4,mm7 + psubw mm2,mm6 + punpcklbw mm3,mm7 + psubw mm4,mm6 + punpckhbw mm5,mm7 + psubw mm3,mm6 + psubw mm5,mm6 + /*Write the answer out.*/ + movq [RESIDUE+0x20],mm2 + movq [RESIDUE+0x28],mm4 + movq [RESIDUE+0x30],mm3 + movq [RESIDUE+0x38],mm5 + /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ + movq mm2,[SRC+YSTRIDE*2] + movq mm3,[SRC+YSTRIDE3] + movq mm4,mm0 + punpcklbw mm0,mm7 + movq mm5,mm1 + punpckhbw mm4,mm7 + psubw mm0,mm6 + punpcklbw mm1,mm7 + psubw mm4,mm6 + punpckhbw mm5,mm7 + psubw mm1,mm6 + psubw mm5,mm6 + /*Write the answer out.*/ + movq [RESIDUE+0x40],mm0 + movq [RESIDUE+0x48],mm4 + movq [RESIDUE+0x50],mm1 + movq [RESIDUE+0x58],mm5 + /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ + movq mm4,mm2 + punpcklbw mm2,mm7 + movq mm5,mm3 + punpckhbw mm4,mm7 + psubw mm2,mm6 + punpcklbw mm3,mm7 + psubw mm4,mm6 + punpckhbw mm5,mm7 + psubw mm3,mm6 + psubw mm5,mm6 + /*Write the answer out.*/ + movq [RESIDUE+0x60],mm2 + movq [RESIDUE+0x68],mm4 + movq [RESIDUE+0x70],mm3 + movq [RESIDUE+0x78],mm5 +#undef YSTRIDE +#undef YSTRIDE3 +#undef RESIDUE +#undef SRC + } +} + +void oc_enc_frag_copy2_mmxext(unsigned char *_dst, + const unsigned char *_src1,const unsigned char *_src2,int _ystride){ + oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride); +} + +#endif diff --git a/thirdparty/libtheora/x86_vc/mmxfdct.c b/thirdparty/libtheora/x86_vc/mmxfdct.c index dcf17c9fa7..d908ce2413 100644 --- a/thirdparty/libtheora/x86_vc/mmxfdct.c +++ b/thirdparty/libtheora/x86_vc/mmxfdct.c @@ -1,670 +1,670 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ********************************************************************/ - /*MMX fDCT implementation for x86_32*/ -/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ -#include "x86enc.h" - -#if defined(OC_X86_ASM) - -#define OC_FDCT_STAGE1_8x4 __asm{ \ - /*Stage 1:*/ \ - /*mm0=t7'=t0-t7*/ \ - __asm psubw mm0,mm7 \ - __asm paddw mm7,mm7 \ - /*mm1=t6'=t1-t6*/ \ - __asm psubw mm1, mm6 \ - __asm paddw mm6,mm6 \ - /*mm2=t5'=t2-t5*/ \ - __asm psubw mm2,mm5 \ - __asm paddw mm5,mm5 \ - /*mm3=t4'=t3-t4*/ \ - __asm psubw mm3,mm4 \ - __asm paddw mm4,mm4 \ - /*mm7=t0'=t0+t7*/ \ - __asm paddw mm7,mm0 \ - /*mm6=t1'=t1+t6*/ \ - __asm paddw mm6,mm1 \ - /*mm5=t2'=t2+t5*/ \ - __asm paddw mm5,mm2 \ - /*mm4=t3'=t3+t4*/ \ - __asm paddw mm4,mm3\ -} - -#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \ - /*Stage 2:*/ \ - /*mm7=t3''=t0'-t3'*/ \ - __asm psubw mm7,mm4 \ - __asm paddw mm4,mm4 \ - /*mm6=t2''=t1'-t2'*/ \ - __asm psubw mm6,mm5 \ - __asm movq [Y+_r6],mm7 \ - __asm paddw mm5,mm5 \ - /*mm1=t5''=t6'-t5'*/ \ - __asm psubw mm1,mm2 \ - __asm movq [Y+_r2],mm6 \ - /*mm4=t0''=t0'+t3'*/ \ - __asm paddw mm4,mm7 \ - __asm paddw mm2,mm2 \ - /*mm5=t1''=t1'+t2'*/ \ - __asm movq [Y+_r0],mm4 \ - __asm paddw mm5,mm6 \ - /*mm2=t6''=t6'+t5'*/ \ - __asm paddw mm2,mm1 \ - __asm movq [Y+_r4],mm5 \ - /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \ - /*mm4, mm5, mm6, mm7 are free.*/ \ - /*Stage 3:*/ \ - /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \ - __asm mov A,0x5A806A0A \ - __asm pcmpeqb mm6,mm6 \ - __asm movd mm7,A \ - __asm psrlw mm6,15 \ - __asm punpckldq mm7,mm7 \ - __asm paddw mm6,mm6 \ - /*mm0=0, m2={-1}x4 \ - mm5:mm4=t5''*27146+0xB500*/ \ - __asm movq mm4,mm1 \ - __asm movq mm5,mm1 \ - __asm punpcklwd mm4,mm6 \ - __asm movq [Y+_r3],mm2 \ - __asm pmaddwd mm4,mm7 \ - __asm movq [Y+_r7],mm0 \ - __asm punpckhwd mm5,mm6 \ - __asm pxor mm0,mm0 \ - __asm pmaddwd mm5,mm7 
\ - __asm pcmpeqb mm2,mm2 \ - /*mm2=t6'', mm1=t5''+(t5''!=0) \ - mm4=(t5''*27146+0xB500>>16)*/ \ - __asm pcmpeqw mm0,mm1 \ - __asm psrad mm4,16 \ - __asm psubw mm0,mm2 \ - __asm movq mm2, [Y+_r3] \ - __asm psrad mm5,16 \ - __asm paddw mm1,mm0 \ - __asm packssdw mm4,mm5 \ - /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \ - __asm paddw mm4,mm1 \ - __asm movq mm0, [Y+_r7] \ - __asm psraw mm4,1 \ - __asm movq mm1,mm3 \ - /*mm3=t4''=t4'+s*/ \ - __asm paddw mm3,mm4 \ - /*mm1=t5'''=t4'-s*/ \ - __asm psubw mm1,mm4 \ - /*mm1=0, mm3={-1}x4 \ - mm5:mm4=t6''*27146+0xB500*/ \ - __asm movq mm4,mm2 \ - __asm movq mm5,mm2 \ - __asm punpcklwd mm4,mm6 \ - __asm movq [Y+_r5],mm1 \ - __asm pmaddwd mm4,mm7 \ - __asm movq [Y+_r1],mm3 \ - __asm punpckhwd mm5,mm6 \ - __asm pxor mm1,mm1 \ - __asm pmaddwd mm5,mm7 \ - __asm pcmpeqb mm3,mm3 \ - /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \ - __asm psrad mm4,16 \ - __asm pcmpeqw mm1,mm2 \ - __asm psrad mm5,16 \ - __asm psubw mm1,mm3 \ - __asm packssdw mm4,mm5 \ - __asm paddw mm2,mm1 \ - /*mm1=t1'' \ - mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \ - __asm paddw mm4,mm2 \ - __asm movq mm1,[Y+_r4] \ - __asm psraw mm4,1 \ - __asm movq mm2,mm0 \ - /*mm7={54491-0x7FFF,0x7FFF}x2 \ - mm0=t7''=t7'+s*/ \ - __asm paddw mm0,mm4 \ - /*mm2=t6'''=t7'-s*/ \ - __asm psubw mm2,mm4 \ - /*Stage 4:*/ \ - /*mm0=0, mm2=t0'' \ - mm5:mm4=t1''*27146+0xB500*/ \ - __asm movq mm4,mm1 \ - __asm movq mm5,mm1 \ - __asm punpcklwd mm4,mm6 \ - __asm movq [Y+_r3],mm2 \ - __asm pmaddwd mm4,mm7 \ - __asm movq mm2,[Y+_r0] \ - __asm punpckhwd mm5,mm6 \ - __asm movq [Y+_r7],mm0 \ - __asm pmaddwd mm5,mm7 \ - __asm pxor mm0,mm0 \ - /*mm7={27146,0x4000>>1}x2 \ - mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \ - __asm psrad mm4,16 \ - __asm mov A,0x20006A0A \ - __asm pcmpeqw mm0,mm1 \ - __asm movd mm7,A \ - __asm psrad mm5,16 \ - __asm psubw mm0,mm3 \ - __asm packssdw mm4,mm5 \ - __asm paddw mm0,mm1 \ - __asm punpckldq mm7,mm7 \ - __asm paddw mm0,mm4 \ - 
/*mm6={0x00000E3D}x2 \ - mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \ - __asm movq mm4,mm2 \ - __asm movq mm5,mm2 \ - __asm punpcklwd mm4,mm6 \ - __asm mov A,0x0E3D \ - __asm pmaddwd mm4,mm7 \ - __asm punpckhwd mm5,mm6 \ - __asm movd mm6,A \ - __asm pmaddwd mm5,mm7 \ - __asm pxor mm1,mm1 \ - __asm punpckldq mm6,mm6 \ - __asm pcmpeqw mm1,mm2 \ - /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \ - __asm psrad mm4,16 \ - __asm psubw mm1,mm3 \ - __asm psrad mm5,16 \ - __asm paddw mm2,mm1 \ - __asm packssdw mm4,mm5 \ - __asm movq mm1,[Y+_r5] \ - __asm paddw mm4,mm2 \ - /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \ - The naive implementation could cause overflow, so we use \ - u=(r&s)+((r^s)>>1).*/ \ - __asm movq mm2,[Y+_r3] \ - __asm movq mm7,mm0 \ - __asm pxor mm0,mm4 \ - __asm pand mm7,mm4 \ - __asm psraw mm0,1 \ - __asm mov A,0x7FFF54DC \ - __asm paddw mm0,mm7 \ - __asm movd mm7,A \ - /*mm7={54491-0x7FFF,0x7FFF}x2 \ - mm4=_y[4]=v=r-u*/ \ - __asm psubw mm4,mm0 \ - __asm punpckldq mm7,mm7 \ - __asm movq [Y+_r4],mm4 \ - /*mm0=0, mm7={36410}x4 \ - mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \ - __asm movq mm4,mm1 \ - __asm movq mm5,mm1 \ - __asm punpcklwd mm4,mm1 \ - __asm mov A,0x8E3A8E3A \ - __asm pmaddwd mm4,mm7 \ - __asm movq [Y+_r0],mm0 \ - __asm punpckhwd mm5,mm1 \ - __asm pxor mm0,mm0 \ - __asm pmaddwd mm5,mm7 \ - __asm pcmpeqw mm1,mm0 \ - __asm movd mm7,A \ - __asm psubw mm1,mm3 \ - __asm punpckldq mm7,mm7 \ - __asm paddd mm4,mm6 \ - __asm paddd mm5,mm6 \ - /*mm0=0 \ - mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \ - __asm movq mm6,mm2 \ - __asm movq mm3,mm2 \ - __asm pmulhw mm6,mm7 \ - __asm paddw mm1,mm2 \ - __asm pmullw mm3,mm7 \ - __asm pxor mm0,mm0 \ - __asm paddw mm6,mm1 \ - __asm movq mm1,mm3 \ - __asm punpckhwd mm3,mm6 \ - __asm punpcklwd mm1,mm6 \ - /*mm3={-1}x4, mm6={1}x4 \ - mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \ - __asm paddd mm5,mm3 \ - __asm paddd mm4,mm1 \ - __asm psrad mm5,16 \ - __asm pxor mm6,mm6 \ - __asm psrad mm4,16 \ - __asm pcmpeqb 
mm3,mm3 \ - __asm packssdw mm4,mm5 \ - __asm psubw mm6,mm3 \ - /*mm1=t7'', mm7={26568,0x3400}x2 \ - mm2=s=t6'''-(36410*u>>16)*/ \ - __asm movq mm1,mm4 \ - __asm mov A,0x340067C8 \ - __asm pmulhw mm4,mm7 \ - __asm movd mm7,A \ - __asm movq [Y+_r5],mm1 \ - __asm punpckldq mm7,mm7 \ - __asm paddw mm4,mm1 \ - __asm movq mm1,[Y+_r7] \ - __asm psubw mm2,mm4 \ - /*mm6={0x00007B1B}x2 \ - mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \ - __asm movq mm4,mm2 \ - __asm movq mm5,mm2 \ - __asm punpcklwd mm4,mm6 \ - __asm pcmpeqw mm0,mm2 \ - __asm pmaddwd mm4,mm7 \ - __asm mov A,0x7B1B \ - __asm punpckhwd mm5,mm6 \ - __asm movd mm6,A \ - __asm pmaddwd mm5,mm7 \ - __asm psubw mm0,mm3 \ - __asm punpckldq mm6,mm6 \ - /*mm7={64277-0x7FFF,0x7FFF}x2 \ - mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \ - __asm psrad mm4,17 \ - __asm paddw mm2,mm0 \ - __asm psrad mm5,17 \ - __asm mov A,0x7FFF7B16 \ - __asm packssdw mm4,mm5 \ - __asm movd mm7,A \ - __asm paddw mm2,mm4 \ - __asm punpckldq mm7,mm7 \ - /*mm0=0, mm7={12785}x4 \ - mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \ - __asm movq mm4,mm1 \ - __asm movq mm5,mm1 \ - __asm movq [Y+_r3],mm2 \ - __asm punpcklwd mm4,mm1 \ - __asm movq mm2,[Y+_r1] \ - __asm pmaddwd mm4,mm7 \ - __asm mov A,0x31F131F1 \ - __asm punpckhwd mm5,mm1 \ - __asm pxor mm0,mm0 \ - __asm pmaddwd mm5,mm7 \ - __asm pcmpeqw mm1,mm0 \ - __asm movd mm7,A \ - __asm psubw mm1,mm3 \ - __asm punpckldq mm7,mm7 \ - __asm paddd mm4,mm6 \ - __asm paddd mm5,mm6 \ - /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \ - __asm movq mm6,mm2 \ - __asm movq mm3,mm2 \ - __asm pmulhw mm6,mm7 \ - __asm pmullw mm3,mm7 \ - __asm paddw mm6,mm1 \ - __asm movq mm1,mm3 \ - __asm punpckhwd mm3,mm6 \ - __asm punpcklwd mm1,mm6 \ - /*mm3={-1}x4, mm6={1}x4 \ - mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \ - __asm paddd mm5,mm3 \ - __asm paddd mm4,mm1 \ - __asm psrad mm5,16 \ - __asm pxor mm6,mm6 \ - __asm psrad mm4,16 \ - __asm pcmpeqb mm3,mm3 \ - __asm packssdw mm4,mm5 \ - __asm psubw mm6,mm3 \ - 
/*mm1=t3'', mm7={20539,0x3000}x2 \ - mm4=s=(12785*u>>16)-t4''*/ \ - __asm movq [Y+_r1],mm4 \ - __asm pmulhw mm4,mm7 \ - __asm mov A,0x3000503B \ - __asm movq mm1,[Y+_r6] \ - __asm movd mm7,A \ - __asm psubw mm4,mm2 \ - __asm punpckldq mm7,mm7 \ - /*mm6={0x00006CB7}x2 \ - mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \ - __asm movq mm5,mm4 \ - __asm movq mm2,mm4 \ - __asm punpcklwd mm4,mm6 \ - __asm pcmpeqw mm0,mm2 \ - __asm pmaddwd mm4,mm7 \ - __asm mov A,0x6CB7 \ - __asm punpckhwd mm5,mm6 \ - __asm movd mm6,A \ - __asm pmaddwd mm5,mm7 \ - __asm psubw mm0,mm3 \ - __asm punpckldq mm6,mm6 \ - /*mm7={60547-0x7FFF,0x7FFF}x2 \ - mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \ - __asm psrad mm4,20 \ - __asm paddw mm2,mm0 \ - __asm psrad mm5,20 \ - __asm mov A,0x7FFF6C84 \ - __asm packssdw mm4,mm5 \ - __asm movd mm7,A \ - __asm paddw mm2,mm4 \ - __asm punpckldq mm7,mm7 \ - /*mm0=0, mm7={25080}x4 \ - mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \ - __asm movq mm4,mm1 \ - __asm movq mm5,mm1 \ - __asm movq [Y+_r7],mm2 \ - __asm punpcklwd mm4,mm1 \ - __asm movq mm2,[Y+_r2] \ - __asm pmaddwd mm4,mm7 \ - __asm mov A,0x61F861F8 \ - __asm punpckhwd mm5,mm1 \ - __asm pxor mm0,mm0 \ - __asm pmaddwd mm5,mm7 \ - __asm movd mm7,A \ - __asm pcmpeqw mm1,mm0 \ - __asm psubw mm1,mm3 \ - __asm punpckldq mm7,mm7 \ - __asm paddd mm4,mm6 \ - __asm paddd mm5,mm6 \ - /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \ - __asm movq mm6,mm2 \ - __asm movq mm3,mm2 \ - __asm pmulhw mm6,mm7 \ - __asm pmullw mm3,mm7 \ - __asm paddw mm6,mm1 \ - __asm movq mm1,mm3 \ - __asm punpckhwd mm3,mm6 \ - __asm punpcklwd mm1,mm6 \ - /*mm1={-1}x4 \ - mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \ - __asm paddd mm5,mm3 \ - __asm paddd mm4,mm1 \ - __asm psrad mm5,16 \ - __asm mov A,0x28005460 \ - __asm psrad mm4,16 \ - __asm pcmpeqb mm1,mm1 \ - __asm packssdw mm4,mm5 \ - /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \ - mm4=s=(25080*u>>16)-t2''*/ \ - __asm movq mm6,mm4 \ - __asm pmulhw mm4,mm7 \ - __asm pxor mm5,mm5 \ - __asm movd 
mm7,A \ - __asm psubw mm5,mm1 \ - __asm punpckldq mm7,mm7 \ - __asm psubw mm4,mm2 \ - /*mm2=s+(s!=0) \ - mm4:mm3=s*21600+0x2800*/ \ - __asm movq mm3,mm4 \ - __asm movq mm2,mm4 \ - __asm punpckhwd mm4,mm5 \ - __asm pcmpeqw mm0,mm2 \ - __asm pmaddwd mm4,mm7 \ - __asm psubw mm0,mm1 \ - __asm punpcklwd mm3,mm5 \ - __asm paddw mm2,mm0 \ - __asm pmaddwd mm3,mm7 \ - /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \ - mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \ - __asm movq mm0,[Y+_r4] \ - __asm psrad mm4,18 \ - __asm movq mm5,[Y+_r5] \ - __asm psrad mm3,18 \ - __asm movq mm1,[Y+_r7] \ - __asm packssdw mm3,mm4 \ - __asm movq mm4,[Y+_r0] \ - __asm paddw mm3,mm2 \ -} - -/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7]. - On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and - {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/ -#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \ - /*First 4x4 transpose:*/ \ - /*mm0 = e3 e2 e1 e0 \ - mm5 = f3 f2 f1 f0 \ - mm3 = g3 g2 g1 g0 \ - mm1 = h3 h2 h1 h0*/ \ - __asm movq mm2,mm0 \ - __asm punpcklwd mm0,mm5 \ - __asm punpckhwd mm2,mm5 \ - __asm movq mm5,mm3 \ - __asm punpcklwd mm3,mm1 \ - __asm punpckhwd mm5,mm1 \ - /*mm0 = f1 e1 f0 e0 \ - mm2 = f3 e3 f2 e2 \ - mm3 = h1 g1 h0 g0 \ - mm5 = h3 g3 h2 g2*/ \ - __asm movq mm1,mm0 \ - __asm punpckldq mm0,mm3 \ - __asm movq [Y+_r4],mm0 \ - __asm punpckhdq mm1,mm3 \ - __asm movq mm0,[Y+_r1] \ - __asm movq mm3,mm2 \ - __asm punpckldq mm2,mm5 \ - __asm punpckhdq mm3,mm5 \ - __asm movq mm5,[Y+_r3] \ - /*_y[4] = h0 g0 f0 e0 \ - mm1 = h1 g1 f1 e1 \ - mm2 = h2 g2 f2 e2 \ - mm3 = h3 g3 f3 e3*/ \ - /*Second 4x4 transpose:*/ \ - /*mm4 = a3 a2 a1 a0 \ - mm0 = b3 b2 b1 b0 \ - mm6 = c3 c2 c1 c0 \ - mm5 = d3 d2 d1 d0*/ \ - __asm movq mm7,mm4 \ - __asm punpcklwd mm4,mm0 \ - __asm punpckhwd mm7,mm0 \ - __asm movq mm0,mm6 \ - __asm punpcklwd mm6,mm5 \ - __asm punpckhwd mm0,mm5 \ - /*mm4 = b1 a1 b0 a0 \ - mm7 = b3 a3 b2 a2 \ - mm6 = d1 c1 d0 c0 \ - 
mm0 = d3 c3 d2 c2*/ \ - __asm movq mm5,mm4 \ - __asm punpckldq mm4,mm6 \ - __asm punpckhdq mm5,mm6 \ - __asm movq mm6,mm7 \ - __asm punpckhdq mm7,mm0 \ - __asm punpckldq mm6,mm0 \ - /*mm4 = d0 c0 b0 a0 \ - mm5 = d1 c1 b1 a1 \ - mm6 = d2 c2 b2 a2 \ - mm7 = d3 c3 b3 a3*/ \ -} - -/*MMX implementation of the fDCT.*/ -void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ - ptrdiff_t a; - __asm{ -#define Y eax -#define A ecx -#define X edx - /*Add two extra bits of working precision to improve accuracy; any more and - we could overflow.*/ - /*We also add biases to correct for some systematic error that remains in - the full fDCT->iDCT round trip.*/ - mov X, _x - mov Y, _y - movq mm0,[0x00+X] - movq mm1,[0x10+X] - movq mm2,[0x20+X] - movq mm3,[0x30+X] - pcmpeqb mm4,mm4 - pxor mm7,mm7 - movq mm5,mm0 - psllw mm0,2 - pcmpeqw mm5,mm7 - movq mm7,[0x70+X] - psllw mm1,2 - psubw mm5,mm4 - psllw mm2,2 - mov A,1 - pslld mm5,16 - movd mm6,A - psllq mm5,16 - mov A,0x10001 - psllw mm3,2 - movd mm4,A - punpckhwd mm5,mm6 - psubw mm1,mm6 - movq mm6,[0x60+X] - paddw mm0,mm5 - movq mm5,[0x50+X] - paddw mm0,mm4 - movq mm4,[0x40+X] - /*We inline stage1 of the transform here so we can get better instruction - scheduling with the shifts.*/ - /*mm0=t7'=t0-t7*/ - psllw mm7,2 - psubw mm0,mm7 - psllw mm6,2 - paddw mm7,mm7 - /*mm1=t6'=t1-t6*/ - psllw mm5,2 - psubw mm1,mm6 - psllw mm4,2 - paddw mm6,mm6 - /*mm2=t5'=t2-t5*/ - psubw mm2,mm5 - paddw mm5,mm5 - /*mm3=t4'=t3-t4*/ - psubw mm3,mm4 - paddw mm4,mm4 - /*mm7=t0'=t0+t7*/ - paddw mm7,mm0 - /*mm6=t1'=t1+t6*/ - paddw mm6,mm1 - /*mm5=t2'=t2+t5*/ - paddw mm5,mm2 - /*mm4=t3'=t3+t4*/ - paddw mm4,mm3 - OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70) - OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70) - /*Swap out this 8x4 block for the next one.*/ - movq mm0,[0x08+X] - movq [0x30+Y],mm7 - movq mm7,[0x78+X] - movq [0x50+Y],mm1 - movq mm1,[0x18+X] - movq [0x20+Y],mm6 - movq mm6,[0x68+X] - movq [0x60+Y],mm2 - movq mm2,[0x28+X] - 
movq [0x10+Y],mm5 - movq mm5,[0x58+X] - movq [0x70+Y],mm3 - movq mm3,[0x38+X] - /*And increase its working precision, too.*/ - psllw mm0,2 - movq [0x00+Y],mm4 - psllw mm7,2 - movq mm4,[0x48+X] - /*We inline stage1 of the transform here so we can get better instruction - scheduling with the shifts.*/ - /*mm0=t7'=t0-t7*/ - psubw mm0,mm7 - psllw mm1,2 - paddw mm7,mm7 - psllw mm6,2 - /*mm1=t6'=t1-t6*/ - psubw mm1,mm6 - psllw mm2,2 - paddw mm6,mm6 - psllw mm5,2 - /*mm2=t5'=t2-t5*/ - psubw mm2,mm5 - psllw mm3,2 - paddw mm5,mm5 - psllw mm4,2 - /*mm3=t4'=t3-t4*/ - psubw mm3,mm4 - paddw mm4,mm4 - /*mm7=t0'=t0+t7*/ - paddw mm7,mm0 - /*mm6=t1'=t1+t6*/ - paddw mm6,mm1 - /*mm5=t2'=t2+t5*/ - paddw mm5,mm2 - /*mm4=t3'=t3+t4*/ - paddw mm4,mm3 - OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78) - OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78) - /*Here the first 4x4 block of output from the last transpose is the second - 4x4 block of input for the next transform. - We have cleverly arranged that it already be in the appropriate place, - so we only have to do half the stores and loads.*/ - movq mm0,[0x00+Y] - movq [0x58+Y],mm1 - movq mm1,[0x10+Y] - movq [0x68+Y],mm2 - movq mm2,[0x20+Y] - movq [0x78+Y],mm3 - movq mm3,[0x30+Y] - OC_FDCT_STAGE1_8x4 - OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38) - OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38) - /*mm0={-2}x4*/ - pcmpeqw mm0,mm0 - paddw mm0,mm0 - /*Round the results.*/ - psubw mm1,mm0 - psubw mm2,mm0 - psraw mm1,2 - psubw mm3,mm0 - movq [0x18+Y],mm1 - psraw mm2,2 - psubw mm4,mm0 - movq mm1,[0x08+Y] - psraw mm3,2 - psubw mm5,mm0 - psraw mm4,2 - psubw mm6,mm0 - psraw mm5,2 - psubw mm7,mm0 - psraw mm6,2 - psubw mm1,mm0 - psraw mm7,2 - movq mm0,[0x40+Y] - psraw mm1,2 - movq [0x30+Y],mm7 - movq mm7,[0x78+Y] - movq [0x08+Y],mm1 - movq mm1,[0x50+Y] - movq [0x20+Y],mm6 - movq mm6,[0x68+Y] - movq [0x28+Y],mm2 - movq mm2,[0x60+Y] - movq [0x10+Y],mm5 - movq mm5,[0x58+Y] - movq [0x38+Y],mm3 - movq mm3,[0x70+Y] - movq 
[0x00+Y],mm4 - movq mm4,[0x48+Y] - OC_FDCT_STAGE1_8x4 - OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78) - OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78) - /*mm0={-2}x4*/ - pcmpeqw mm0,mm0 - paddw mm0,mm0 - /*Round the results.*/ - psubw mm1,mm0 - psubw mm2,mm0 - psraw mm1,2 - psubw mm3,mm0 - movq [0x58+Y],mm1 - psraw mm2,2 - psubw mm4,mm0 - movq mm1,[0x48+Y] - psraw mm3,2 - psubw mm5,mm0 - movq [0x68+Y],mm2 - psraw mm4,2 - psubw mm6,mm0 - movq [0x78+Y],mm3 - psraw mm5,2 - psubw mm7,mm0 - movq [0x40+Y],mm4 - psraw mm6,2 - psubw mm1,mm0 - movq [0x50+Y],mm5 - psraw mm7,2 - movq [0x60+Y],mm6 - psraw mm1,2 - movq [0x70+Y],mm7 - movq [0x48+Y],mm1 -#undef Y -#undef A -#undef X - } -} - -#endif +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ********************************************************************/ + /*MMX fDCT implementation for x86_32*/ +/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ +#include "x86enc.h" + +#if defined(OC_X86_ASM) + +#define OC_FDCT_STAGE1_8x4 __asm{ \ + /*Stage 1:*/ \ + /*mm0=t7'=t0-t7*/ \ + __asm psubw mm0,mm7 \ + __asm paddw mm7,mm7 \ + /*mm1=t6'=t1-t6*/ \ + __asm psubw mm1, mm6 \ + __asm paddw mm6,mm6 \ + /*mm2=t5'=t2-t5*/ \ + __asm psubw mm2,mm5 \ + __asm paddw mm5,mm5 \ + /*mm3=t4'=t3-t4*/ \ + __asm psubw mm3,mm4 \ + __asm paddw mm4,mm4 \ + /*mm7=t0'=t0+t7*/ \ + __asm paddw mm7,mm0 \ + /*mm6=t1'=t1+t6*/ \ + __asm paddw mm6,mm1 \ + /*mm5=t2'=t2+t5*/ \ + __asm paddw mm5,mm2 \ + /*mm4=t3'=t3+t4*/ \ + __asm paddw mm4,mm3\ +} + +#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \ + /*Stage 2:*/ \ + /*mm7=t3''=t0'-t3'*/ \ + __asm psubw mm7,mm4 \ + __asm paddw mm4,mm4 \ + /*mm6=t2''=t1'-t2'*/ \ + __asm psubw mm6,mm5 \ + __asm movq [Y+_r6],mm7 \ + __asm paddw mm5,mm5 \ + /*mm1=t5''=t6'-t5'*/ \ + __asm psubw mm1,mm2 \ + __asm movq [Y+_r2],mm6 \ + /*mm4=t0''=t0'+t3'*/ \ + __asm paddw mm4,mm7 \ + __asm paddw mm2,mm2 \ + /*mm5=t1''=t1'+t2'*/ \ + __asm movq [Y+_r0],mm4 \ + __asm paddw mm5,mm6 \ + /*mm2=t6''=t6'+t5'*/ \ + __asm paddw mm2,mm1 \ + __asm movq [Y+_r4],mm5 \ + /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \ + /*mm4, mm5, mm6, mm7 are free.*/ \ + /*Stage 3:*/ \ + /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \ + __asm mov A,0x5A806A0A \ + __asm pcmpeqb mm6,mm6 \ + __asm movd mm7,A \ + __asm psrlw mm6,15 \ + __asm punpckldq mm7,mm7 \ + __asm paddw mm6,mm6 \ + /*mm0=0, m2={-1}x4 \ + mm5:mm4=t5''*27146+0xB500*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm punpcklwd mm4,mm6 \ + __asm movq [Y+_r3],mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm movq [Y+_r7],mm0 \ + __asm punpckhwd mm5,mm6 \ + __asm pxor mm0,mm0 \ + __asm pmaddwd mm5,mm7 
\ + __asm pcmpeqb mm2,mm2 \ + /*mm2=t6'', mm1=t5''+(t5''!=0) \ + mm4=(t5''*27146+0xB500>>16)*/ \ + __asm pcmpeqw mm0,mm1 \ + __asm psrad mm4,16 \ + __asm psubw mm0,mm2 \ + __asm movq mm2, [Y+_r3] \ + __asm psrad mm5,16 \ + __asm paddw mm1,mm0 \ + __asm packssdw mm4,mm5 \ + /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \ + __asm paddw mm4,mm1 \ + __asm movq mm0, [Y+_r7] \ + __asm psraw mm4,1 \ + __asm movq mm1,mm3 \ + /*mm3=t4''=t4'+s*/ \ + __asm paddw mm3,mm4 \ + /*mm1=t5'''=t4'-s*/ \ + __asm psubw mm1,mm4 \ + /*mm1=0, mm3={-1}x4 \ + mm5:mm4=t6''*27146+0xB500*/ \ + __asm movq mm4,mm2 \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm4,mm6 \ + __asm movq [Y+_r5],mm1 \ + __asm pmaddwd mm4,mm7 \ + __asm movq [Y+_r1],mm3 \ + __asm punpckhwd mm5,mm6 \ + __asm pxor mm1,mm1 \ + __asm pmaddwd mm5,mm7 \ + __asm pcmpeqb mm3,mm3 \ + /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \ + __asm psrad mm4,16 \ + __asm pcmpeqw mm1,mm2 \ + __asm psrad mm5,16 \ + __asm psubw mm1,mm3 \ + __asm packssdw mm4,mm5 \ + __asm paddw mm2,mm1 \ + /*mm1=t1'' \ + mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \ + __asm paddw mm4,mm2 \ + __asm movq mm1,[Y+_r4] \ + __asm psraw mm4,1 \ + __asm movq mm2,mm0 \ + /*mm7={54491-0x7FFF,0x7FFF}x2 \ + mm0=t7''=t7'+s*/ \ + __asm paddw mm0,mm4 \ + /*mm2=t6'''=t7'-s*/ \ + __asm psubw mm2,mm4 \ + /*Stage 4:*/ \ + /*mm0=0, mm2=t0'' \ + mm5:mm4=t1''*27146+0xB500*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm punpcklwd mm4,mm6 \ + __asm movq [Y+_r3],mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm movq mm2,[Y+_r0] \ + __asm punpckhwd mm5,mm6 \ + __asm movq [Y+_r7],mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm pxor mm0,mm0 \ + /*mm7={27146,0x4000>>1}x2 \ + mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \ + __asm psrad mm4,16 \ + __asm mov A,0x20006A0A \ + __asm pcmpeqw mm0,mm1 \ + __asm movd mm7,A \ + __asm psrad mm5,16 \ + __asm psubw mm0,mm3 \ + __asm packssdw mm4,mm5 \ + __asm paddw mm0,mm1 \ + __asm punpckldq mm7,mm7 \ + __asm paddw mm0,mm4 \ + 
/*mm6={0x00000E3D}x2 \ + mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \ + __asm movq mm4,mm2 \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm4,mm6 \ + __asm mov A,0x0E3D \ + __asm pmaddwd mm4,mm7 \ + __asm punpckhwd mm5,mm6 \ + __asm movd mm6,A \ + __asm pmaddwd mm5,mm7 \ + __asm pxor mm1,mm1 \ + __asm punpckldq mm6,mm6 \ + __asm pcmpeqw mm1,mm2 \ + /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \ + __asm psrad mm4,16 \ + __asm psubw mm1,mm3 \ + __asm psrad mm5,16 \ + __asm paddw mm2,mm1 \ + __asm packssdw mm4,mm5 \ + __asm movq mm1,[Y+_r5] \ + __asm paddw mm4,mm2 \ + /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \ + The naive implementation could cause overflow, so we use \ + u=(r&s)+((r^s)>>1).*/ \ + __asm movq mm2,[Y+_r3] \ + __asm movq mm7,mm0 \ + __asm pxor mm0,mm4 \ + __asm pand mm7,mm4 \ + __asm psraw mm0,1 \ + __asm mov A,0x7FFF54DC \ + __asm paddw mm0,mm7 \ + __asm movd mm7,A \ + /*mm7={54491-0x7FFF,0x7FFF}x2 \ + mm4=_y[4]=v=r-u*/ \ + __asm psubw mm4,mm0 \ + __asm punpckldq mm7,mm7 \ + __asm movq [Y+_r4],mm4 \ + /*mm0=0, mm7={36410}x4 \ + mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm punpcklwd mm4,mm1 \ + __asm mov A,0x8E3A8E3A \ + __asm pmaddwd mm4,mm7 \ + __asm movq [Y+_r0],mm0 \ + __asm punpckhwd mm5,mm1 \ + __asm pxor mm0,mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm pcmpeqw mm1,mm0 \ + __asm movd mm7,A \ + __asm psubw mm1,mm3 \ + __asm punpckldq mm7,mm7 \ + __asm paddd mm4,mm6 \ + __asm paddd mm5,mm6 \ + /*mm0=0 \ + mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \ + __asm movq mm6,mm2 \ + __asm movq mm3,mm2 \ + __asm pmulhw mm6,mm7 \ + __asm paddw mm1,mm2 \ + __asm pmullw mm3,mm7 \ + __asm pxor mm0,mm0 \ + __asm paddw mm6,mm1 \ + __asm movq mm1,mm3 \ + __asm punpckhwd mm3,mm6 \ + __asm punpcklwd mm1,mm6 \ + /*mm3={-1}x4, mm6={1}x4 \ + mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \ + __asm paddd mm5,mm3 \ + __asm paddd mm4,mm1 \ + __asm psrad mm5,16 \ + __asm pxor mm6,mm6 \ + __asm psrad mm4,16 \ + __asm pcmpeqb 
mm3,mm3 \ + __asm packssdw mm4,mm5 \ + __asm psubw mm6,mm3 \ + /*mm1=t7'', mm7={26568,0x3400}x2 \ + mm2=s=t6'''-(36410*u>>16)*/ \ + __asm movq mm1,mm4 \ + __asm mov A,0x340067C8 \ + __asm pmulhw mm4,mm7 \ + __asm movd mm7,A \ + __asm movq [Y+_r5],mm1 \ + __asm punpckldq mm7,mm7 \ + __asm paddw mm4,mm1 \ + __asm movq mm1,[Y+_r7] \ + __asm psubw mm2,mm4 \ + /*mm6={0x00007B1B}x2 \ + mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \ + __asm movq mm4,mm2 \ + __asm movq mm5,mm2 \ + __asm punpcklwd mm4,mm6 \ + __asm pcmpeqw mm0,mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm mov A,0x7B1B \ + __asm punpckhwd mm5,mm6 \ + __asm movd mm6,A \ + __asm pmaddwd mm5,mm7 \ + __asm psubw mm0,mm3 \ + __asm punpckldq mm6,mm6 \ + /*mm7={64277-0x7FFF,0x7FFF}x2 \ + mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \ + __asm psrad mm4,17 \ + __asm paddw mm2,mm0 \ + __asm psrad mm5,17 \ + __asm mov A,0x7FFF7B16 \ + __asm packssdw mm4,mm5 \ + __asm movd mm7,A \ + __asm paddw mm2,mm4 \ + __asm punpckldq mm7,mm7 \ + /*mm0=0, mm7={12785}x4 \ + mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm movq [Y+_r3],mm2 \ + __asm punpcklwd mm4,mm1 \ + __asm movq mm2,[Y+_r1] \ + __asm pmaddwd mm4,mm7 \ + __asm mov A,0x31F131F1 \ + __asm punpckhwd mm5,mm1 \ + __asm pxor mm0,mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm pcmpeqw mm1,mm0 \ + __asm movd mm7,A \ + __asm psubw mm1,mm3 \ + __asm punpckldq mm7,mm7 \ + __asm paddd mm4,mm6 \ + __asm paddd mm5,mm6 \ + /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \ + __asm movq mm6,mm2 \ + __asm movq mm3,mm2 \ + __asm pmulhw mm6,mm7 \ + __asm pmullw mm3,mm7 \ + __asm paddw mm6,mm1 \ + __asm movq mm1,mm3 \ + __asm punpckhwd mm3,mm6 \ + __asm punpcklwd mm1,mm6 \ + /*mm3={-1}x4, mm6={1}x4 \ + mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \ + __asm paddd mm5,mm3 \ + __asm paddd mm4,mm1 \ + __asm psrad mm5,16 \ + __asm pxor mm6,mm6 \ + __asm psrad mm4,16 \ + __asm pcmpeqb mm3,mm3 \ + __asm packssdw mm4,mm5 \ + __asm psubw mm6,mm3 \ + 
/*mm1=t3'', mm7={20539,0x3000}x2 \ + mm4=s=(12785*u>>16)-t4''*/ \ + __asm movq [Y+_r1],mm4 \ + __asm pmulhw mm4,mm7 \ + __asm mov A,0x3000503B \ + __asm movq mm1,[Y+_r6] \ + __asm movd mm7,A \ + __asm psubw mm4,mm2 \ + __asm punpckldq mm7,mm7 \ + /*mm6={0x00006CB7}x2 \ + mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \ + __asm movq mm5,mm4 \ + __asm movq mm2,mm4 \ + __asm punpcklwd mm4,mm6 \ + __asm pcmpeqw mm0,mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm mov A,0x6CB7 \ + __asm punpckhwd mm5,mm6 \ + __asm movd mm6,A \ + __asm pmaddwd mm5,mm7 \ + __asm psubw mm0,mm3 \ + __asm punpckldq mm6,mm6 \ + /*mm7={60547-0x7FFF,0x7FFF}x2 \ + mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \ + __asm psrad mm4,20 \ + __asm paddw mm2,mm0 \ + __asm psrad mm5,20 \ + __asm mov A,0x7FFF6C84 \ + __asm packssdw mm4,mm5 \ + __asm movd mm7,A \ + __asm paddw mm2,mm4 \ + __asm punpckldq mm7,mm7 \ + /*mm0=0, mm7={25080}x4 \ + mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \ + __asm movq mm4,mm1 \ + __asm movq mm5,mm1 \ + __asm movq [Y+_r7],mm2 \ + __asm punpcklwd mm4,mm1 \ + __asm movq mm2,[Y+_r2] \ + __asm pmaddwd mm4,mm7 \ + __asm mov A,0x61F861F8 \ + __asm punpckhwd mm5,mm1 \ + __asm pxor mm0,mm0 \ + __asm pmaddwd mm5,mm7 \ + __asm movd mm7,A \ + __asm pcmpeqw mm1,mm0 \ + __asm psubw mm1,mm3 \ + __asm punpckldq mm7,mm7 \ + __asm paddd mm4,mm6 \ + __asm paddd mm5,mm6 \ + /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \ + __asm movq mm6,mm2 \ + __asm movq mm3,mm2 \ + __asm pmulhw mm6,mm7 \ + __asm pmullw mm3,mm7 \ + __asm paddw mm6,mm1 \ + __asm movq mm1,mm3 \ + __asm punpckhwd mm3,mm6 \ + __asm punpcklwd mm1,mm6 \ + /*mm1={-1}x4 \ + mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \ + __asm paddd mm5,mm3 \ + __asm paddd mm4,mm1 \ + __asm psrad mm5,16 \ + __asm mov A,0x28005460 \ + __asm psrad mm4,16 \ + __asm pcmpeqb mm1,mm1 \ + __asm packssdw mm4,mm5 \ + /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \ + mm4=s=(25080*u>>16)-t2''*/ \ + __asm movq mm6,mm4 \ + __asm pmulhw mm4,mm7 \ + __asm pxor mm5,mm5 \ + __asm movd 
mm7,A \ + __asm psubw mm5,mm1 \ + __asm punpckldq mm7,mm7 \ + __asm psubw mm4,mm2 \ + /*mm2=s+(s!=0) \ + mm4:mm3=s*21600+0x2800*/ \ + __asm movq mm3,mm4 \ + __asm movq mm2,mm4 \ + __asm punpckhwd mm4,mm5 \ + __asm pcmpeqw mm0,mm2 \ + __asm pmaddwd mm4,mm7 \ + __asm psubw mm0,mm1 \ + __asm punpcklwd mm3,mm5 \ + __asm paddw mm2,mm0 \ + __asm pmaddwd mm3,mm7 \ + /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \ + mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \ + __asm movq mm0,[Y+_r4] \ + __asm psrad mm4,18 \ + __asm movq mm5,[Y+_r5] \ + __asm psrad mm3,18 \ + __asm movq mm1,[Y+_r7] \ + __asm packssdw mm3,mm4 \ + __asm movq mm4,[Y+_r0] \ + __asm paddw mm3,mm2 \ +} + +/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7]. + On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and + {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/ +#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \ + /*First 4x4 transpose:*/ \ + /*mm0 = e3 e2 e1 e0 \ + mm5 = f3 f2 f1 f0 \ + mm3 = g3 g2 g1 g0 \ + mm1 = h3 h2 h1 h0*/ \ + __asm movq mm2,mm0 \ + __asm punpcklwd mm0,mm5 \ + __asm punpckhwd mm2,mm5 \ + __asm movq mm5,mm3 \ + __asm punpcklwd mm3,mm1 \ + __asm punpckhwd mm5,mm1 \ + /*mm0 = f1 e1 f0 e0 \ + mm2 = f3 e3 f2 e2 \ + mm3 = h1 g1 h0 g0 \ + mm5 = h3 g3 h2 g2*/ \ + __asm movq mm1,mm0 \ + __asm punpckldq mm0,mm3 \ + __asm movq [Y+_r4],mm0 \ + __asm punpckhdq mm1,mm3 \ + __asm movq mm0,[Y+_r1] \ + __asm movq mm3,mm2 \ + __asm punpckldq mm2,mm5 \ + __asm punpckhdq mm3,mm5 \ + __asm movq mm5,[Y+_r3] \ + /*_y[4] = h0 g0 f0 e0 \ + mm1 = h1 g1 f1 e1 \ + mm2 = h2 g2 f2 e2 \ + mm3 = h3 g3 f3 e3*/ \ + /*Second 4x4 transpose:*/ \ + /*mm4 = a3 a2 a1 a0 \ + mm0 = b3 b2 b1 b0 \ + mm6 = c3 c2 c1 c0 \ + mm5 = d3 d2 d1 d0*/ \ + __asm movq mm7,mm4 \ + __asm punpcklwd mm4,mm0 \ + __asm punpckhwd mm7,mm0 \ + __asm movq mm0,mm6 \ + __asm punpcklwd mm6,mm5 \ + __asm punpckhwd mm0,mm5 \ + /*mm4 = b1 a1 b0 a0 \ + mm7 = b3 a3 b2 a2 \ + mm6 = d1 c1 d0 c0 \ + 
mm0 = d3 c3 d2 c2*/ \ + __asm movq mm5,mm4 \ + __asm punpckldq mm4,mm6 \ + __asm punpckhdq mm5,mm6 \ + __asm movq mm6,mm7 \ + __asm punpckhdq mm7,mm0 \ + __asm punpckldq mm6,mm0 \ + /*mm4 = d0 c0 b0 a0 \ + mm5 = d1 c1 b1 a1 \ + mm6 = d2 c2 b2 a2 \ + mm7 = d3 c3 b3 a3*/ \ +} + +/*MMX implementation of the fDCT.*/ +void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ + ptrdiff_t a; + __asm{ +#define Y eax +#define A ecx +#define X edx + /*Add two extra bits of working precision to improve accuracy; any more and + we could overflow.*/ + /*We also add biases to correct for some systematic error that remains in + the full fDCT->iDCT round trip.*/ + mov X, _x + mov Y, _y + movq mm0,[0x00+X] + movq mm1,[0x10+X] + movq mm2,[0x20+X] + movq mm3,[0x30+X] + pcmpeqb mm4,mm4 + pxor mm7,mm7 + movq mm5,mm0 + psllw mm0,2 + pcmpeqw mm5,mm7 + movq mm7,[0x70+X] + psllw mm1,2 + psubw mm5,mm4 + psllw mm2,2 + mov A,1 + pslld mm5,16 + movd mm6,A + psllq mm5,16 + mov A,0x10001 + psllw mm3,2 + movd mm4,A + punpckhwd mm5,mm6 + psubw mm1,mm6 + movq mm6,[0x60+X] + paddw mm0,mm5 + movq mm5,[0x50+X] + paddw mm0,mm4 + movq mm4,[0x40+X] + /*We inline stage1 of the transform here so we can get better instruction + scheduling with the shifts.*/ + /*mm0=t7'=t0-t7*/ + psllw mm7,2 + psubw mm0,mm7 + psllw mm6,2 + paddw mm7,mm7 + /*mm1=t6'=t1-t6*/ + psllw mm5,2 + psubw mm1,mm6 + psllw mm4,2 + paddw mm6,mm6 + /*mm2=t5'=t2-t5*/ + psubw mm2,mm5 + paddw mm5,mm5 + /*mm3=t4'=t3-t4*/ + psubw mm3,mm4 + paddw mm4,mm4 + /*mm7=t0'=t0+t7*/ + paddw mm7,mm0 + /*mm6=t1'=t1+t6*/ + paddw mm6,mm1 + /*mm5=t2'=t2+t5*/ + paddw mm5,mm2 + /*mm4=t3'=t3+t4*/ + paddw mm4,mm3 + OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70) + OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70) + /*Swap out this 8x4 block for the next one.*/ + movq mm0,[0x08+X] + movq [0x30+Y],mm7 + movq mm7,[0x78+X] + movq [0x50+Y],mm1 + movq mm1,[0x18+X] + movq [0x20+Y],mm6 + movq mm6,[0x68+X] + movq [0x60+Y],mm2 + movq mm2,[0x28+X] + 
movq [0x10+Y],mm5 + movq mm5,[0x58+X] + movq [0x70+Y],mm3 + movq mm3,[0x38+X] + /*And increase its working precision, too.*/ + psllw mm0,2 + movq [0x00+Y],mm4 + psllw mm7,2 + movq mm4,[0x48+X] + /*We inline stage1 of the transform here so we can get better instruction + scheduling with the shifts.*/ + /*mm0=t7'=t0-t7*/ + psubw mm0,mm7 + psllw mm1,2 + paddw mm7,mm7 + psllw mm6,2 + /*mm1=t6'=t1-t6*/ + psubw mm1,mm6 + psllw mm2,2 + paddw mm6,mm6 + psllw mm5,2 + /*mm2=t5'=t2-t5*/ + psubw mm2,mm5 + psllw mm3,2 + paddw mm5,mm5 + psllw mm4,2 + /*mm3=t4'=t3-t4*/ + psubw mm3,mm4 + paddw mm4,mm4 + /*mm7=t0'=t0+t7*/ + paddw mm7,mm0 + /*mm6=t1'=t1+t6*/ + paddw mm6,mm1 + /*mm5=t2'=t2+t5*/ + paddw mm5,mm2 + /*mm4=t3'=t3+t4*/ + paddw mm4,mm3 + OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78) + OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78) + /*Here the first 4x4 block of output from the last transpose is the second + 4x4 block of input for the next transform. + We have cleverly arranged that it already be in the appropriate place, + so we only have to do half the stores and loads.*/ + movq mm0,[0x00+Y] + movq [0x58+Y],mm1 + movq mm1,[0x10+Y] + movq [0x68+Y],mm2 + movq mm2,[0x20+Y] + movq [0x78+Y],mm3 + movq mm3,[0x30+Y] + OC_FDCT_STAGE1_8x4 + OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38) + OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38) + /*mm0={-2}x4*/ + pcmpeqw mm0,mm0 + paddw mm0,mm0 + /*Round the results.*/ + psubw mm1,mm0 + psubw mm2,mm0 + psraw mm1,2 + psubw mm3,mm0 + movq [0x18+Y],mm1 + psraw mm2,2 + psubw mm4,mm0 + movq mm1,[0x08+Y] + psraw mm3,2 + psubw mm5,mm0 + psraw mm4,2 + psubw mm6,mm0 + psraw mm5,2 + psubw mm7,mm0 + psraw mm6,2 + psubw mm1,mm0 + psraw mm7,2 + movq mm0,[0x40+Y] + psraw mm1,2 + movq [0x30+Y],mm7 + movq mm7,[0x78+Y] + movq [0x08+Y],mm1 + movq mm1,[0x50+Y] + movq [0x20+Y],mm6 + movq mm6,[0x68+Y] + movq [0x28+Y],mm2 + movq mm2,[0x60+Y] + movq [0x10+Y],mm5 + movq mm5,[0x58+Y] + movq [0x38+Y],mm3 + movq mm3,[0x70+Y] + movq 
[0x00+Y],mm4 + movq mm4,[0x48+Y] + OC_FDCT_STAGE1_8x4 + OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78) + OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78) + /*mm0={-2}x4*/ + pcmpeqw mm0,mm0 + paddw mm0,mm0 + /*Round the results.*/ + psubw mm1,mm0 + psubw mm2,mm0 + psraw mm1,2 + psubw mm3,mm0 + movq [0x58+Y],mm1 + psraw mm2,2 + psubw mm4,mm0 + movq mm1,[0x48+Y] + psraw mm3,2 + psubw mm5,mm0 + movq [0x68+Y],mm2 + psraw mm4,2 + psubw mm6,mm0 + movq [0x78+Y],mm3 + psraw mm5,2 + psubw mm7,mm0 + movq [0x40+Y],mm4 + psraw mm6,2 + psubw mm1,mm0 + movq [0x50+Y],mm5 + psraw mm7,2 + movq [0x60+Y],mm6 + psraw mm1,2 + movq [0x70+Y],mm7 + movq [0x48+Y],mm1 +#undef Y +#undef A +#undef X + } +} + +#endif diff --git a/thirdparty/nanosvg/LICENSE.txt b/thirdparty/nanosvg/LICENSE.txt index 6fde401cb2..f896f2eb0f 100644 --- a/thirdparty/nanosvg/LICENSE.txt +++ b/thirdparty/nanosvg/LICENSE.txt @@ -1,18 +1,18 @@ -Copyright (c) 2013-14 Mikko Mononen memon@inside.org - -This software is provided 'as-is', without any express or implied -warranty. In no event will the authors be held liable for any damages -arising from the use of this software. - -Permission is granted to anyone to use this software for any purpose, -including commercial applications, and to alter it and redistribute it -freely, subject to the following restrictions: - -1. The origin of this software must not be misrepresented; you must not -claim that you wrote the original software. If you use this software -in a product, an acknowledgment in the product documentation would be -appreciated but is not required. -2. Altered source versions must be plainly marked as such, and must not be -misrepresented as being the original software. -3. This notice may not be removed or altered from any source distribution. - +Copyright (c) 2013-14 Mikko Mononen memon@inside.org + +This software is provided 'as-is', without any express or implied +warranty. 
In no event will the authors be held liable for any damages +arising from the use of this software. + +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not +claim that you wrote the original software. If you use this software +in a product, an acknowledgment in the product documentation would be +appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be +misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +