Implements asin, acos, atan, and atan2.
Also includes some tests in a new test file.
One possible problem here is that I'm not sure how to test what Microsoft's atan and atan2 outputs are in boundary cases like atan2(1, 0). I've made the test suites adhere with the calculator program I've been using (Qalculate, which I assume is using libc's atan2).
From: Petrichor Park ppark@codeweavers.com
Also includes some tests in a new test file.
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=55154 --- Makefile.am | 1 + libs/vkd3d-shader/hlsl.y | 75 +++++++++++++++++++++++++++++ tests/hlsl/inverse-trig.shader_test | 30 ++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 tests/hlsl/inverse-trig.shader_test
diff --git a/Makefile.am b/Makefile.am index 744e46127..39294833c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -109,6 +109,7 @@ vkd3d_shader_tests = \ tests/hlsl/initializer-struct.shader_test \ tests/hlsl/intrinsic-override.shader_test \ tests/hlsl/invalid.shader_test \ + tests/hlsl/inverse-trig.shader_test \ tests/hlsl/is-front-face.shader_test \ tests/hlsl/ldexp.shader_test \ tests/hlsl/length.shader_test \ diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index fb6d485ea..d2a045adf 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -2525,6 +2525,73 @@ static bool intrinsic_abs(struct hlsl_ctx *ctx, return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_ABS, params->args[0], loc); }
+/* + Based on Microsoft's D3D compiler's output. + Approximate `acos(|x|) / sqrt(1 - |x|))` using a cubic, then correct + by multiplying sqrt(1 - |x|) and flipping across pi if x<0. + acos has a slope of -infinity around 1, + which polynomial approximations can't do very well. + 1/sqrt(1-x) has a slope of infinity around 1, + so multiplying the two ends up with a reasonably flat slope + that a polynomial can approximate. +*/ +static bool write_acos_or_asin(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc, bool asin_mode) +{ + struct hlsl_ir_function_decl *func; + struct hlsl_type *type; + char *body; + + static const char template[] = + "%s %s(%s x)\n" + "{\n" + " %s abs_arg = abs(x);\n" + " %s correction = sqrt(1.0 - abs_arg);\n" + " %s result = correction * (\n" + " (3.14159265 / 2.0)\n" + " - (0.2127403136003234 * abs_arg)\n" + " + (0.07612092595257536 * abs_arg * abs_arg)\n" + " - (0.01996337677405357 * abs_arg * abs_arg * abs_arg)\n" + " );\n" + /* Piecewise, (x >= 0) ? result : pi - result. */ + " %s mask = %s(x >= 0.0);\n" + " result = mask * result\n" + " + (1.0 - mask) * (3.14159265 - result);\n" + " return %s;" + "}"; + static const char fn_name_acos[] = "acos"; + static const char fn_name_asin[] = "asin"; + static const char return_stmt_acos[] = "result"; + static const char return_stmt_asin[] = "-result + (3.14159265 / 2.0)"; + + const char *fn_name = asin_mode + ? fn_name_asin + : fn_name_acos; + + if (!(type = elementwise_intrinsic_get_common_type(ctx, params, loc))) + return false; + type = hlsl_get_numeric_type(ctx, type->class, HLSL_TYPE_FLOAT, type->dimx, type->dimy); + + if (!(body = hlsl_sprintf_alloc(ctx, template, + type->name, fn_name, type->name, + type->name, type->name, type->name, + type->name, type->name, + (asin_mode ? return_stmt_asin : return_stmt_acos)))) + return false; + func = hlsl_compile_internal_function(ctx, fn_name, body); + vkd3d_free(body); + if (!func) + return false; + + return add_user_call(ctx, func, params, loc); +} + +static bool intrinsic_acos(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + return write_acos_or_asin(ctx, params, loc, false); +} + static bool intrinsic_all(struct hlsl_ctx *ctx, const struct parse_initializer *params, const struct vkd3d_shader_location *loc) { @@ -2602,6 +2669,12 @@ static bool intrinsic_any(struct hlsl_ctx *ctx, return false; }
+static bool intrinsic_asin(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + return write_acos_or_asin(ctx, params, loc, true); +} + /* Find the type corresponding to the given source type, with the same * dimensions but a different base type. */ static struct hlsl_type *convert_numeric_type(const struct hlsl_ctx *ctx, @@ -3655,9 +3728,11 @@ intrinsic_functions[] = /* Note: these entries should be kept in alphabetical order. */ {"D3DCOLORtoUBYTE4", 1, true, intrinsic_d3dcolor_to_ubyte4}, {"abs", 1, true, intrinsic_abs}, + {"acos", 1, true, intrinsic_acos}, {"all", 1, true, intrinsic_all}, {"any", 1, true, intrinsic_any}, {"asfloat", 1, true, intrinsic_asfloat}, + {"asin", 1, true, intrinsic_asin}, {"asuint", -1, true, intrinsic_asuint}, {"clamp", 3, true, intrinsic_clamp}, {"clip", 1, true, intrinsic_clip}, diff --git a/tests/hlsl/inverse-trig.shader_test b/tests/hlsl/inverse-trig.shader_test new file mode 100644 index 000000000..7be7c5e7d --- /dev/null +++ b/tests/hlsl/inverse-trig.shader_test @@ -0,0 +1,30 @@ +[pixel shader] +uniform float4 a; + +float4 main() : sv_target +{ + // Add 10 to avoid epsilon problems near 0 + // Note that this means -8.5odd means -pi/2 + return float4(asin(a.x), acos(a.y), 0.0, 0) + 10.0; +} + +[test] +uniform 0 float4 -1.0 -1.0 -1.0 0.0 +draw quad +probe all rgba (8.429203673, 13.141592654, 10.0, 10.0) 256 + +uniform 0 float4 -0.5 -0.5 -0.5 0.0 +draw quad +probe all rgba (9.476401224, 12.09439510, 10.0, 10.0) 256 + +uniform 0 float4 0.0 0.0 0.0 0.0 +draw quad +probe all rgba (10.0, 11.570796327, 10.0, 10.0) 256 + +uniform 0 float4 0.5 0.5 0.5 0.0 +draw quad +probe all rgba (10.52359878, 11.04719755, 10.0, 10.0) 256 + +uniform 0 float4 1.0 1.0 1.0 0.0 +draw quad +probe all rgba (11.570796327, 10.0, 10.0, 10.0) 256
From: Petrichor Park ppark@codeweavers.com
--- libs/vkd3d-shader/hlsl.y | 53 +++++++++++++++++++++++++++++ tests/hlsl/inverse-trig.shader_test | 10 +++--- 2 files changed, 58 insertions(+), 5 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index d2a045adf..74a93e8bd 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -2675,6 +2675,58 @@ static bool intrinsic_asin(struct hlsl_ctx *ctx, return write_acos_or_asin(ctx, params, loc, true); }
+/* + Between -1 and 1, approximates atan with a polynomial of degree 9. + Outside of that range the polynomial goes off to infinity, so instead + calculate the approximation on 1/x and shift it around +- pi/2. +*/ +static bool intrinsic_atan(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + struct hlsl_ir_function_decl *func; + struct hlsl_type *type; + char *body; + + static const char template[] = + "%s atan(%s x)\n" + "{\n" + " %s abs_arg = abs(x);\n" + " %s input = (abs_arg < 1.0) ? x : 1.0 / x;\n" + /* HLSL pow(x, y) always returns NaN if x<0. So write out the powers longhand. */ + " %s x2 = input * input;\n" + " %s x3 = input * x2;\n" + " %s x5 = x3 * x2;\n" + " %s x7 = x5 * x2;\n" + " %s x9 = x7 * x2;\n" + " %s poly_approx = \n" + " 0.0202650518398948 * x9\n" + " + -0.0840073996650127 * x7\n" + " + 0.1794166847125958 * x5\n" + " + -0.3301321091300554 * x3\n" + " + 0.9998559356400260 * input;\n" + " return (abs_arg < 1.0)\n" + " ? poly_approx\n" + " : (((x < 0.0 ? -1 : 1) * 3.14159265 / 2.0) - poly_approx);\n" + "}"; + + if (!(type = elementwise_intrinsic_get_common_type(ctx, params, loc))) + return false; + type = hlsl_get_numeric_type(ctx, type->class, HLSL_TYPE_FLOAT, type->dimx, type->dimy); + + + if (!(body = hlsl_sprintf_alloc(ctx, template, type->name, type->name, + type->name, type->name, + type->name, type->name, type->name, type->name, type->name, + type->name))) + return false; + func = hlsl_compile_internal_function(ctx, "atan", body); + vkd3d_free(body); + if (!func) + return false; + + return add_user_call(ctx, func, params, loc); +} + /* Find the type corresponding to the given source type, with the same * dimensions but a different base type. */ static struct hlsl_type *convert_numeric_type(const struct hlsl_ctx *ctx, @@ -3734,6 +3786,7 @@ intrinsic_functions[] = {"asfloat", 1, true, intrinsic_asfloat}, {"asin", 1, true, intrinsic_asin}, {"asuint", -1, true, intrinsic_asuint}, + {"atan", 1, true, intrinsic_atan}, {"clamp", 3, true, intrinsic_clamp}, {"clip", 1, true, intrinsic_clip}, {"cos", 1, true, intrinsic_cos}, diff --git a/tests/hlsl/inverse-trig.shader_test b/tests/hlsl/inverse-trig.shader_test index 7be7c5e7d..8820ec002 100644 --- a/tests/hlsl/inverse-trig.shader_test +++ b/tests/hlsl/inverse-trig.shader_test @@ -5,17 +5,17 @@ float4 main() : sv_target { // Add 10 to avoid epsilon problems near 0 // Note that this means -8.5odd means -pi/2 - return float4(asin(a.x), acos(a.y), 0.0, 0) + 10.0; + return float4(asin(a.x), acos(a.y), atan(a.z), 0) + 10.0; }
[test] uniform 0 float4 -1.0 -1.0 -1.0 0.0 draw quad -probe all rgba (8.429203673, 13.141592654, 10.0, 10.0) 256 +probe all rgba (8.429203673, 13.141592654, 9.214601837, 10.0) 256
uniform 0 float4 -0.5 -0.5 -0.5 0.0 draw quad -probe all rgba (9.476401224, 12.09439510, 10.0, 10.0) 256 +probe all rgba (9.476401224, 12.09439510, 9.536352391, 10.0) 256
uniform 0 float4 0.0 0.0 0.0 0.0 draw quad @@ -23,8 +23,8 @@ probe all rgba (10.0, 11.570796327, 10.0, 10.0) 256
uniform 0 float4 0.5 0.5 0.5 0.0 draw quad -probe all rgba (10.52359878, 11.04719755, 10.0, 10.0) 256 +probe all rgba (10.52359878, 11.04719755, 10.46364761, 10.0) 256
uniform 0 float4 1.0 1.0 1.0 0.0 draw quad -probe all rgba (11.570796327, 10.0, 10.0, 10.0) 256 +probe all rgba (11.570796327, 10.0, 10.7853981634, 10.0) 256
From: Petrichor Park ppark@codeweavers.com
--- libs/vkd3d-shader/hlsl.y | 37 +++++++++++++++++++++++++ tests/hlsl/inverse-trig.shader_test | 43 +++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+)
diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index 74a93e8bd..319451949 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -2727,6 +2727,42 @@ static bool intrinsic_atan(struct hlsl_ctx *ctx, return add_user_call(ctx, func, params, loc); }
+static bool intrinsic_atan2(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + struct hlsl_ir_function_decl *func; + struct hlsl_type *type; + char *body; + + static const char template[] = + "%s atan2(%s y, %s x)\n" + "{\n" + " %s pi = %s(3.1415926);\n" + " %s offset = x >= 0.0\n" + " ? 0.0\n" + " : y >= 0.0\n" + " ? pi\n" + " : -pi;\n" + " return atan(y/x) + offset;\n" + "}"; + + if (!(type = elementwise_intrinsic_get_common_type(ctx, params, loc))) + return false; + type = hlsl_get_numeric_type(ctx, type->class, HLSL_TYPE_FLOAT, type->dimx, type->dimy); + + if (!(body = hlsl_sprintf_alloc(ctx, template, + type->name, type->name, type->name, + type->name, type->name, type->name))) + return false; + func = hlsl_compile_internal_function(ctx, "atan2", body); + vkd3d_free(body); + if (!func) + return false; + + return add_user_call(ctx, func, params, loc); +} + + /* Find the type corresponding to the given source type, with the same * dimensions but a different base type. */ static struct hlsl_type *convert_numeric_type(const struct hlsl_ctx *ctx, @@ -3787,6 +3823,7 @@ intrinsic_functions[] = {"asin", 1, true, intrinsic_asin}, {"asuint", -1, true, intrinsic_asuint}, {"atan", 1, true, intrinsic_atan}, + {"atan2", 2, true, intrinsic_atan2}, {"clamp", 3, true, intrinsic_clamp}, {"clip", 1, true, intrinsic_clip}, {"cos", 1, true, intrinsic_cos}, diff --git a/tests/hlsl/inverse-trig.shader_test b/tests/hlsl/inverse-trig.shader_test index 8820ec002..493019984 100644 --- a/tests/hlsl/inverse-trig.shader_test +++ b/tests/hlsl/inverse-trig.shader_test @@ -28,3 +28,46 @@ probe all rgba (10.52359878, 11.04719755, 10.46364761, 10.0) 256 uniform 0 float4 1.0 1.0 1.0 0.0 draw quad probe all rgba (11.570796327, 10.0, 10.7853981634, 10.0) 256 + + +[pixel shader] +uniform float4 a; + +float4 main() : sv_target +{ + // Return things in terms of pi to make the numbers in the + // test cases more understandable. + // Also, because the argument order is (y,x) the numbers are + // passed in "backwards" here, so they're the right way in the + // test cases. + return float4(atan2(a.x, a.y), 0.0, 0.0, 0.0) / 3.14159265; +} + +[test] +uniform 0 float4 1.0 1.0 0.0 0.0 +draw quad +probe all rgba (0.25, 0.0, 0.0, 0.0) 256 + +uniform 0 float4 5.0 -5.0 0.0 0.0 +draw quad +probe all rgba (0.75, 0.0, 0.0, 0.0) 256 + +uniform 0 float4 -3.0 -3.0 0.0 0.0 +draw quad +probe all rgba (-0.75, 0.0, 0.0, 0.0) 256 + +uniform 0 float4 1.0 0.0 0.0 0.0 +draw quad +probe all rgba (0.5, 0.0, 0.0, 0.0) 256 + +uniform 0 float4 -1.0 0.0 0.0 0.0 +draw quad +probe all rgba (-0.5, 0.0, 0.0, 0.0) 256 + +uniform 0 float4 0.0 1.0 0.0 0.0 +draw quad +probe all rgba (0.0, 0.0, 0.0, 0.0) 256 + +uniform 0 float4 0.0 -1.0 0.0 0.0 +draw quad +probe all rgba (1.0, 0.0, 0.0, 0.0) 256
Giovanni Mascellani (@giomasce) commented about libs/vkd3d-shader/hlsl.y:
const struct parse_initializer *params, const struct vkd3d_shader_location *loc, bool asin_mode)
+{
- struct hlsl_ir_function_decl *func;
- struct hlsl_type *type;
- char *body;
- static const char template[] =
"%s %s(%s x)\n"
"{\n"
" %s abs_arg = abs(x);\n"
" %s correction = sqrt(1.0 - abs_arg);\n"
" %s result = correction * (\n"
" (3.14159265 / 2.0)\n"
" - (0.2127403136003234 * abs_arg)\n"
" + (0.07612092595257536 * abs_arg * abs_arg)\n"
" - (0.01996337677405357 * abs_arg * abs_arg * abs_arg)\n"
Notice that native, at least in my tests, is evaluating the polynomial using [Horner's method](https://en.wikipedia.org/wiki/Horner%27s_method), which is probably more efficient (it takes only three multiplications instead of six in this case, if I'm not mistaken). That would amount to something like `(((-0.01996337677405357f * abs_arg + 0.07612092595257536f) * abs_arg - 0.2127403136003234f) * abs_arg + 1.570796325f`.
Notice that floating point numbers do not need so many significant digits ([see this nice tool](https://evanw.github.io/float-toy/)), and that native coefficients are slightly different from yours, not sure why. Where do your coefficients come from? Your tests have a rather large error margin set: while this is not a problem in its own, maybe using the same coefficients as native you could get that smaller.
Giovanni Mascellani (@giomasce) commented about libs/vkd3d-shader/hlsl.y:
" %s mask = %s(x >= 0.0);\n"
" result = mask * result\n"
" + (1.0 - mask) * (3.14159265 - result);\n"
" return %s;"
"}";
- static const char fn_name_acos[] = "acos";
- static const char fn_name_asin[] = "asin";
- static const char return_stmt_acos[] = "result";
- static const char return_stmt_asin[] = "-result + (3.14159265 / 2.0)";
- const char *fn_name = asin_mode
? fn_name_asin
: fn_name_acos;
- if (!(type = elementwise_intrinsic_get_common_type(ctx, params, loc)))
return false;
There's no need to call `elementwise_intrinsic_get_common_type()` when there is just one parameter.
Giovanni Mascellani (@giomasce) commented about libs/vkd3d-shader/hlsl.y:
return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_ABS, params->args[0], loc);
}
+/*
- Based on Microsoft's D3D compiler's output.
- Approximate `acos(|x|) / sqrt(1 - |x|))` using a cubic, then correct
- by multiplying sqrt(1 - |x|) and flipping across pi if x<0.
- acos has a slope of -infinity around 1,
- which polynomial approximations can't do very well.
- 1/sqrt(1-x) has a slope of infinity around 1,
- so multiplying the two ends up with a reasonably flat slope
- that a polynomial can approximate.
+*/
This comment doesn't look very useful to me: virtually everything in the HLSL compiler is based on the Microsoft compiler, since the point of our HLSL compiler is to be a drop-in replacement of native. Given that, there is little point in justifying the numerical algorithm: we're just replicating what the native compiler does, but we're not really interested in justifying this kind of detail.
I reviewed only the first commit for the moment. Some suggestions may apply to the other two too, but let's begin making the first one correct and then look at the others.
On Mon Sep 25 11:23:49 2023 +0000, Giovanni Mascellani wrote:
Notice that native, at least in my tests, is evaluating the polynomial using [Horner's method](https://en.wikipedia.org/wiki/Horner%27s_method), which is probably more efficient (it takes only three multiplications instead of six in this case, if I'm not mistaken). That would amount to something like `(((-0.01996337677405357f * abs_arg + 0.07612092595257536f) * abs_arg - 0.2127403136003234f) * abs_arg + 1.570796325f`. Notice that floating point numbers do not need so many significant digits ([see this nice tool](https://evanw.github.io/float-toy/)), and that native coefficients are slightly different from yours, not sure why. Where do your coefficients come from? Your tests have a rather large error margin set: while this is not a problem in its own, maybe using the same coefficients as native you could get that smaller.
Evan Tang wrote the algorithms for all these. He solved for the coefficients using Wolfram Alpha because he felt a little worried about using Microsoft's output directly. Apparently, Evan's numbers are slightly less accurate overall, but don't yield a discontinuity at X=0.
It doesn't look like he has an account here but we could pull him into the discussion on Matrix?
It doesn't look like he has an account here
That would be @etang-cw :)
On Mon Sep 25 15:41:25 2023 +0000, Petrichor Park wrote:
Evan Tang wrote the algorithms for all these. He solved for the coefficients using Wolfram Alpha because he felt a little worried about using Microsoft's output directly. Apparently, Evan's numbers are slightly less accurate overall, but don't yield a discontinuity at X=0. It doesn't look like he has an account here but we could pull him into the discussion on Matrix?
You don't have to change the coefficients, just accumulate in the opposite order like `((coeff_x3 * x + coeff_x2) * x + coeff_x1) * x + coeff_x0`
All modern GPUs have multiply-add instructions, so that will compile to three madds, which should be nice and fast.
On Mon Sep 25 16:45:04 2023 +0000, Evan Tang wrote:
You don't have to change the coefficients, just accumulate in the opposite order like `((coeff_x3 * x + coeff_x2) * x + coeff_x1) * x + coeff_x0` All modern GPUs have multiply-add instructions, so that will compile to three madds, which should be nice and fast.
And while the atan function is slightly less accurate to avoid the discontinuity, acos is slightly more accurate on average: https://www.desmos.com/calculator/3ncrqzazxa
On Mon Sep 25 16:55:32 2023 +0000, Evan Tang wrote:
And while the atan function is slightly less accurate to avoid the discontinuity, acos is slightly more accurate on average: https://www.desmos.com/calculator/3ncrqzazxa
Anyways, if you think copying coefficients is legally OK and preferable, go for it. I just thought it might be better to have numbers where we can give reasoning for why we chose them.
On Mon Sep 25 11:23:50 2023 +0000, Giovanni Mascellani wrote:
There's no need to call `elementwise_intrinsic_get_common_type()` when there is just one parameter.
True, though there's not much harm in it either.
Zebediah Figura (@zfigura) commented about libs/vkd3d-shader/hlsl.y:
- struct hlsl_type *type;
- char *body;
- static const char template[] =
"%s %s(%s x)\n"
"{\n"
" %s abs_arg = abs(x);\n"
" %s correction = sqrt(1.0 - abs_arg);\n"
" %s result = correction * (\n"
" (3.14159265 / 2.0)\n"
" - (0.2127403136003234 * abs_arg)\n"
" + (0.07612092595257536 * abs_arg * abs_arg)\n"
" - (0.01996337677405357 * abs_arg * abs_arg * abs_arg)\n"
" );\n"
/* Piecewise, (x >= 0) ? result : pi - result. */
" %s mask = %s(x >= 0.0);\n"
HLSL has implicit casts, so you can drop this. I think there's another instance in atan2().