 
            On my Nvidia GeForce GTX 1050 Ti `ddxddy.shader_test` doesn't pass because of considerably different numeric results.
As Giovanni pointed out, this is because my GPU uses the fine derivative and not the coarse derivative to implement ddx() and ddy().
For this reason, the result for ddx|ddy() is quantized so that the test passes whether the GPU uses coarse or fine derivatives.
Additionally, tests for both ddx_coarse|ddy_coarse() and ddx_fine|ddy_fine() are added; these expect a more precise result.
-- v2: vkd3d-shader/hlsl: Fine derivates support. vkd3d-shader/hlsl: Coarse derivates support. tests: Quantize ddx() and ddy() test. tests: Make ddx() and ddy() test behave correctly for shader models < 4. tests: Test coarse and fine derivates.
 
            From: Francisco Casas fcasas@codeweavers.com
--- tests/ddxddy.shader_test | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+)
diff --git a/tests/ddxddy.shader_test b/tests/ddxddy.shader_test index 6efb5ab6f..af09803e6 100644 --- a/tests/ddxddy.shader_test +++ b/tests/ddxddy.shader_test @@ -24,3 +24,43 @@ probe (10, 11) rgba (-0.420000076, -0.164999843, 0.104999900, 0.0) 8 probe (11, 11) rgba (-0.574999928, -0.164999843, 0.104999900, 0.0) 8 probe (12, 10) rgba (-0.874999881, -0.205000162, 0.124999881, 0.0) 8 probe (150, 150) rgba (-7.52500916, -1.56500244, 1.50500488, 0.0) 40 + + +[require] +shader model >= 5.0 + + +[pixel shader todo] +float4 main(float4 pos : sv_position) : sv_target +{ + pos /= 10.0; + float nonlinear = pos.x * pos.y - pos.x * (pos.x + 0.5); + return float4(nonlinear, ddx_coarse(nonlinear), ddy_coarse(nonlinear), 0.0); +} + +[test] +todo draw quad +probe (10, 10) rgba (-0.524999976, -0.164999843, 0.104999900, 0.0) 16 +probe (11, 10) rgba (-0.689999819, -0.164999843, 0.104999900, 0.0) 16 +probe (10, 11) rgba (-0.420000076, -0.164999843, 0.104999900, 0.0) 16 +probe (11, 11) rgba (-0.574999928, -0.164999843, 0.104999900, 0.0) 16 +probe (12, 10) rgba (-0.874999881, -0.205000162, 0.124999881, 0.0) 24 +probe (150, 150) rgba (-7.52500916, -1.56500244, 1.50500488, 0.0) 40 + + +[pixel shader todo] +float4 main(float4 pos : sv_position) : sv_target +{ + pos /= 10.0; + float nonlinear = pos.x * pos.y - pos.x * (pos.x + 0.5); + return float4(nonlinear, ddx_fine(nonlinear), ddy_fine(nonlinear), 0.0); +} + +[test] +todo draw quad +probe (10, 10) rgba (-0.524999976, -0.164999843, 0.104999900, 0.0) 16 +probe (11, 10) rgba (-0.689999819, -0.164999843, 0.114999890, 0.0) 16 +probe (10, 11) rgba (-0.420000076, -0.154999852, 0.104999900, 0.0) 16 +probe (11, 11) rgba (-0.574999928, -0.154999852, 0.114999890, 0.0) 16 +probe (12, 10) rgba (-0.874999881, -0.205000162, 0.124999881, 0.0) 24 +probe (150, 150) rgba (-7.52500916, -1.56500244, 1.50500488, 0.0) 40
 
            From: Francisco Casas fcasas@codeweavers.com
--- tests/ddxddy.shader_test | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/tests/ddxddy.shader_test b/tests/ddxddy.shader_test index af09803e6..1ce36751c 100644 --- a/tests/ddxddy.shader_test +++ b/tests/ddxddy.shader_test @@ -1,16 +1,26 @@ +[require] +shader model >= 3.0 + [pixel shader] float4 main(float4 pos : sv_position) : sv_target { return float4(ddx(pos.x), ddy(pos.y), 0, 0); }
+ [test] draw quad probe all rgba (1.0, 1.0, 0.0, 0.0)
+ [pixel shader] float4 main(float4 pos : sv_position) : sv_target { + // Shader models < 4 don't add 0.5 to sv_position, so this adjustment is required to get the + // same outputs. + pos.x = floor(pos.x) + 0.5; + pos.y = floor(pos.y) + 0.5; + pos /= 10.0; float nonlinear = pos.x * pos.y - pos.x * (pos.x + 0.5); return float4(nonlinear, ddx(nonlinear), ddy(nonlinear), 0.0);
 
            From: Francisco Casas fcasas@codeweavers.com
--- tests/ddxddy.shader_test | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/tests/ddxddy.shader_test b/tests/ddxddy.shader_test index 1ce36751c..a181185ef 100644 --- a/tests/ddxddy.shader_test +++ b/tests/ddxddy.shader_test @@ -23,17 +23,21 @@ float4 main(float4 pos : sv_position) : sv_target
pos /= 10.0; float nonlinear = pos.x * pos.y - pos.x * (pos.x + 0.5); - return float4(nonlinear, ddx(nonlinear), ddy(nonlinear), 0.0); + float4 res = float4(nonlinear, ddx(nonlinear), ddy(nonlinear), 0.0); + + // Each device may use either the coarse or the fine derivate, so use quantization. + return round(30 * res); }
[test] draw quad -probe (10, 10) rgba (-0.524999976, -0.164999843, 0.104999900, 0.0) 8 -probe (11, 10) rgba (-0.689999819, -0.164999843, 0.104999900, 0.0) 8 -probe (10, 11) rgba (-0.420000076, -0.164999843, 0.104999900, 0.0) 8 -probe (11, 11) rgba (-0.574999928, -0.164999843, 0.104999900, 0.0) 8 -probe (12, 10) rgba (-0.874999881, -0.205000162, 0.124999881, 0.0) 8 -probe (150, 150) rgba (-7.52500916, -1.56500244, 1.50500488, 0.0) 40 +probe (10, 10) rgba (-16.0, -5.0, 3.0, 0.0) +probe (11, 10) rgba (-21.0, -5.0, 3.0, 0.0) +probe (10, 11) rgba (-13.0, -5.0, 3.0, 0.0) +probe (11, 11) rgba (-17.0, -5.0, 3.0, 0.0) +probe (12, 10) rgba (-26.0, -6.0, 4.0, 0.0) +probe (16, 16) rgba (-25.0, -7.0, 5.0, 0.0) +probe (150, 150) rgba (-226.0, -47.0, 45.0, 0.0)
[require]
 
            From: Francisco Casas fcasas@codeweavers.com
--- libs/vkd3d-shader/hlsl.c | 2 ++ libs/vkd3d-shader/hlsl.h | 2 ++ libs/vkd3d-shader/hlsl.y | 24 ++++++++++++++++++++++++ libs/vkd3d-shader/tpf.c | 10 ++++++++++ tests/ddxddy.shader_test | 4 ++-- 5 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl.c b/libs/vkd3d-shader/hlsl.c index 152ec6275..8b9b41125 100644 --- a/libs/vkd3d-shader/hlsl.c +++ b/libs/vkd3d-shader/hlsl.c @@ -2340,7 +2340,9 @@ const char *debug_hlsl_expr_op(enum hlsl_ir_expr_op op) [HLSL_OP1_COS] = "cos", [HLSL_OP1_COS_REDUCED] = "cos_reduced", [HLSL_OP1_DSX] = "dsx", + [HLSL_OP1_DSX_COARSE] = "dsx_coarse", [HLSL_OP1_DSY] = "dsy", + [HLSL_OP1_DSY_COARSE] = "dsy_coarse", [HLSL_OP1_EXP2] = "exp2", [HLSL_OP1_FRACT] = "fract", [HLSL_OP1_LOG2] = "log2", diff --git a/libs/vkd3d-shader/hlsl.h b/libs/vkd3d-shader/hlsl.h index 7d02448e0..fe002dbc5 100644 --- a/libs/vkd3d-shader/hlsl.h +++ b/libs/vkd3d-shader/hlsl.h @@ -502,7 +502,9 @@ enum hlsl_ir_expr_op HLSL_OP1_COS, HLSL_OP1_COS_REDUCED, /* Reduced range [-pi, pi] */ HLSL_OP1_DSX, + HLSL_OP1_DSX_COARSE, HLSL_OP1_DSY, + HLSL_OP1_DSY_COARSE, HLSL_OP1_EXP2, HLSL_OP1_FLOOR, HLSL_OP1_FRACT, diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index cf483d82c..74f63e4e2 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -2612,6 +2612,17 @@ static bool intrinsic_ddx(struct hlsl_ctx *ctx, return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_DSX, arg, loc); }
+static bool intrinsic_ddx_coarse(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + struct hlsl_ir_node *arg; + + if (!(arg = intrinsic_float_convert_arg(ctx, params, params->args[0], loc))) + return false; + + return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_DSX_COARSE, arg, loc); +} + static bool intrinsic_ddy(struct hlsl_ctx *ctx, const struct parse_initializer *params, const struct vkd3d_shader_location *loc) { @@ -2623,6 +2634,17 @@ static bool intrinsic_ddy(struct hlsl_ctx *ctx, return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_DSY, arg, loc); }
+static bool intrinsic_ddy_coarse(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + struct hlsl_ir_node *arg; + + if (!(arg = intrinsic_float_convert_arg(ctx, params, params->args[0], loc))) + return false; + + return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_DSY_COARSE, arg, loc); +} + static bool intrinsic_distance(struct hlsl_ctx *ctx, const struct parse_initializer *params, const struct vkd3d_shader_location *loc) { @@ -3485,7 +3507,9 @@ intrinsic_functions[] = {"cos", 1, true, intrinsic_cos}, {"cross", 2, true, intrinsic_cross}, {"ddx", 1, true, intrinsic_ddx}, + {"ddx_coarse", 1, true, intrinsic_ddx_coarse}, {"ddy", 1, true, intrinsic_ddy}, + {"ddy_coarse", 1, true, intrinsic_ddy_coarse}, {"distance", 2, true, intrinsic_distance}, {"dot", 2, true, intrinsic_dot}, {"exp", 1, true, intrinsic_exp}, diff --git a/libs/vkd3d-shader/tpf.c b/libs/vkd3d-shader/tpf.c index 60948d649..7ffae550b 100644 --- a/libs/vkd3d-shader/tpf.c +++ b/libs/vkd3d-shader/tpf.c @@ -4364,11 +4364,21 @@ static void write_sm4_expr(struct hlsl_ctx *ctx, write_sm4_unary_op(buffer, VKD3D_SM4_OP_DERIV_RTX, &expr->node, arg1, 0); break;
+ case HLSL_OP1_DSX_COARSE: + assert(type_is_float(dst_type)); + write_sm4_unary_op(buffer, VKD3D_SM5_OP_DERIV_RTX_COARSE, &expr->node, arg1, 0); + break; + case HLSL_OP1_DSY: assert(type_is_float(dst_type)); write_sm4_unary_op(buffer, VKD3D_SM4_OP_DERIV_RTY, &expr->node, arg1, 0); break;
+ case HLSL_OP1_DSY_COARSE: + assert(type_is_float(dst_type)); + write_sm4_unary_op(buffer, VKD3D_SM5_OP_DERIV_RTY_COARSE, &expr->node, arg1, 0); + break; + case HLSL_OP1_EXP2: assert(type_is_float(dst_type)); write_sm4_unary_op(buffer, VKD3D_SM4_OP_EXP, &expr->node, arg1, 0); diff --git a/tests/ddxddy.shader_test b/tests/ddxddy.shader_test index a181185ef..f6490f095 100644 --- a/tests/ddxddy.shader_test +++ b/tests/ddxddy.shader_test @@ -44,7 +44,7 @@ probe (150, 150) rgba (-226.0, -47.0, 45.0, 0.0) shader model >= 5.0
-[pixel shader todo] +[pixel shader] float4 main(float4 pos : sv_position) : sv_target { pos /= 10.0; @@ -53,7 +53,7 @@ float4 main(float4 pos : sv_position) : sv_target }
[test] -todo draw quad +draw quad probe (10, 10) rgba (-0.524999976, -0.164999843, 0.104999900, 0.0) 16 probe (11, 10) rgba (-0.689999819, -0.164999843, 0.104999900, 0.0) 16 probe (10, 11) rgba (-0.420000076, -0.164999843, 0.104999900, 0.0) 16
 
            From: Francisco Casas fcasas@codeweavers.com
--- libs/vkd3d-shader/hlsl.c | 2 ++ libs/vkd3d-shader/hlsl.h | 2 ++ libs/vkd3d-shader/hlsl.y | 24 ++++++++++++++++++++++++ libs/vkd3d-shader/tpf.c | 10 ++++++++++ tests/ddxddy.shader_test | 4 ++-- 5 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl.c b/libs/vkd3d-shader/hlsl.c index 8b9b41125..da38435f7 100644 --- a/libs/vkd3d-shader/hlsl.c +++ b/libs/vkd3d-shader/hlsl.c @@ -2341,8 +2341,10 @@ const char *debug_hlsl_expr_op(enum hlsl_ir_expr_op op) [HLSL_OP1_COS_REDUCED] = "cos_reduced", [HLSL_OP1_DSX] = "dsx", [HLSL_OP1_DSX_COARSE] = "dsx_coarse", + [HLSL_OP1_DSX_FINE] = "dsx_fine", [HLSL_OP1_DSY] = "dsy", [HLSL_OP1_DSY_COARSE] = "dsy_coarse", + [HLSL_OP1_DSY_FINE] = "dsy_fine", [HLSL_OP1_EXP2] = "exp2", [HLSL_OP1_FRACT] = "fract", [HLSL_OP1_LOG2] = "log2", diff --git a/libs/vkd3d-shader/hlsl.h b/libs/vkd3d-shader/hlsl.h index fe002dbc5..f7f764128 100644 --- a/libs/vkd3d-shader/hlsl.h +++ b/libs/vkd3d-shader/hlsl.h @@ -503,8 +503,10 @@ enum hlsl_ir_expr_op HLSL_OP1_COS_REDUCED, /* Reduced range [-pi, pi] */ HLSL_OP1_DSX, HLSL_OP1_DSX_COARSE, + HLSL_OP1_DSX_FINE, HLSL_OP1_DSY, HLSL_OP1_DSY_COARSE, + HLSL_OP1_DSY_FINE, HLSL_OP1_EXP2, HLSL_OP1_FLOOR, HLSL_OP1_FRACT, diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index 74f63e4e2..6d1aa8a1d 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -2623,6 +2623,17 @@ static bool intrinsic_ddx_coarse(struct hlsl_ctx *ctx, return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_DSX_COARSE, arg, loc); }
+static bool intrinsic_ddx_fine(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + struct hlsl_ir_node *arg; + + if (!(arg = intrinsic_float_convert_arg(ctx, params, params->args[0], loc))) + return false; + + return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_DSX_FINE, arg, loc); +} + static bool intrinsic_ddy(struct hlsl_ctx *ctx, const struct parse_initializer *params, const struct vkd3d_shader_location *loc) { @@ -2645,6 +2656,17 @@ static bool intrinsic_ddy_coarse(struct hlsl_ctx *ctx, return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_DSY_COARSE, arg, loc); }
+static bool intrinsic_ddy_fine(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + struct hlsl_ir_node *arg; + + if (!(arg = intrinsic_float_convert_arg(ctx, params, params->args[0], loc))) + return false; + + return !!add_unary_arithmetic_expr(ctx, params->instrs, HLSL_OP1_DSY_FINE, arg, loc); +} + static bool intrinsic_distance(struct hlsl_ctx *ctx, const struct parse_initializer *params, const struct vkd3d_shader_location *loc) { @@ -3508,8 +3530,10 @@ intrinsic_functions[] = {"cross", 2, true, intrinsic_cross}, {"ddx", 1, true, intrinsic_ddx}, {"ddx_coarse", 1, true, intrinsic_ddx_coarse}, + {"ddx_fine", 1, true, intrinsic_ddx_fine}, {"ddy", 1, true, intrinsic_ddy}, {"ddy_coarse", 1, true, intrinsic_ddy_coarse}, + {"ddy_fine", 1, true, intrinsic_ddy_fine}, {"distance", 2, true, intrinsic_distance}, {"dot", 2, true, intrinsic_dot}, {"exp", 1, true, intrinsic_exp}, diff --git a/libs/vkd3d-shader/tpf.c b/libs/vkd3d-shader/tpf.c index 7ffae550b..11edaf860 100644 --- a/libs/vkd3d-shader/tpf.c +++ b/libs/vkd3d-shader/tpf.c @@ -4369,6 +4369,11 @@ static void write_sm4_expr(struct hlsl_ctx *ctx, write_sm4_unary_op(buffer, VKD3D_SM5_OP_DERIV_RTX_COARSE, &expr->node, arg1, 0); break;
+ case HLSL_OP1_DSX_FINE: + assert(type_is_float(dst_type)); + write_sm4_unary_op(buffer, VKD3D_SM5_OP_DERIV_RTX_FINE, &expr->node, arg1, 0); + break; + case HLSL_OP1_DSY: assert(type_is_float(dst_type)); write_sm4_unary_op(buffer, VKD3D_SM4_OP_DERIV_RTY, &expr->node, arg1, 0); @@ -4379,6 +4384,11 @@ static void write_sm4_expr(struct hlsl_ctx *ctx, write_sm4_unary_op(buffer, VKD3D_SM5_OP_DERIV_RTY_COARSE, &expr->node, arg1, 0); break;
+ case HLSL_OP1_DSY_FINE: + assert(type_is_float(dst_type)); + write_sm4_unary_op(buffer, VKD3D_SM5_OP_DERIV_RTY_FINE, &expr->node, arg1, 0); + break; + case HLSL_OP1_EXP2: assert(type_is_float(dst_type)); write_sm4_unary_op(buffer, VKD3D_SM4_OP_EXP, &expr->node, arg1, 0); diff --git a/tests/ddxddy.shader_test b/tests/ddxddy.shader_test index f6490f095..ea772ef37 100644 --- a/tests/ddxddy.shader_test +++ b/tests/ddxddy.shader_test @@ -62,7 +62,7 @@ probe (12, 10) rgba (-0.874999881, -0.205000162, 0.124999881, 0.0) 24 probe (150, 150) rgba (-7.52500916, -1.56500244, 1.50500488, 0.0) 40
-[pixel shader todo] +[pixel shader] float4 main(float4 pos : sv_position) : sv_target { pos /= 10.0; @@ -71,7 +71,7 @@ float4 main(float4 pos : sv_position) : sv_target }
[test] -todo draw quad +draw quad probe (10, 10) rgba (-0.524999976, -0.164999843, 0.104999900, 0.0) 16 probe (11, 10) rgba (-0.689999819, -0.164999843, 0.114999890, 0.0) 16 probe (10, 11) rgba (-0.420000076, -0.154999852, 0.104999900, 0.0) 16
 
            On Mon Jun 19 20:58:58 2023 +0000, Francisco Casas wrote:
changed this line in [version 2 of the diff](/wine/vkd3d/-/merge_requests/224/diffs?diff_id=52578&start_sha=bdb4c92820b51c0072d755a8098fc21e4b89ee8b#5a2a0560f2d42cc35cd4e168080202208e5d100a_3_2)
Correction, the profile ps_2_1 doesn't exist. I confused it with ps_2_x, which the native compiler doesn't seem to support unless we pass ps_4_0_level_9_1 or ps_4_0_level_9_3, but those don't support sv_position either, so shader model >= 3 would do.
I had to make a change to the test too, because ps_3_0 doesn't add 0.5 to sv_position, whereas ps_4_0 and ps_5_0 do.
 
            On Mon Jun 19 21:05:39 2023 +0000, Giovanni Mascellani wrote:
Do you already have implementations for the coarse and fine derivatives? They should be rather simple to add. If you can add them to the MR we can actually check that the test values for coarse and fine derivatives are accepted on different devices, to minimize the probability we have to edit them again in the future.
I added patches for supporting coarse and fine derivatives. You were right, the Nvidia GeForce GTX 1050 Ti required a few more ULPs for these tests to pass.
 
            On Mon Jun 19 21:03:09 2023 +0000, Francisco Casas wrote:
Correction, the profile ps_2_1 doesn't exist. I confused it with ps_2_x, which the native compiler doesn't seem to support unless we pass ps_4_0_level_9_1 or ps_4_0_level_9_3, but those don't support sv_position either, so shader model >= 3 would do. I had to make a change to the test too, because ps_3_0 doesn't add 0.5 to sv_position, whereas ps_4_0 and ps_5_0 do.
Right. It's not actually called 2.1. The actual profile is two different profiles, ps_2_a and ps_2_b, which both compile to a version token of 2.1 in the bytecode. I don't know what the difference between a and b is.
 
            Great! Only one last favor: I'd like llvmpipe to pass tests too. This needs two simple changes: * Despite its name, the coarse derivative can be fine, and in the case of llvmpipe it seems to be. So please quantize it like you do for the unqualified derivative. * llvmpipe is apparently even less precise than NVIDIA. So please raise the ULP limits like in https://gitlab.winehq.org/giomasce/vkd3d/-/commit/e78c53aab6be5cd647de25f47a....
That commit passes tests in all the implementations I have access to (radv, NVIDIA and Intel). If it is compiled in your mesa binaries, you can test on llvmpipe using `VKD3D_VULKAN_DEVICE=1` or something.
Also, we normally use the imperative form in commit subjects: "Support fine derivatives" rather than "Fine derivatives support". Thanks!
 
            Right. It's not actually called 2.1. The actual profile is two different profiles, ps_2_a and ps_2_b, which both compile to a version token of 2.1 in the bytecode. I don't know what the difference between a and b is.
IIRC ps_2_a was the NVIDIA variant and ps_2_b the ATI variant at the time, and they had slightly different features and limits. I'm not sure the differences are particularly well documented anywhere, but you may be able to find something in the GPU vendor documentation from back then.
 
            On Tue Jun 20 12:13:35 2023 +0000, Henri Verbeet wrote:
Right. It's not actually called 2.1. The actual profile is two
different profiles, ps_2_a and ps_2_b, which both compile to a version token of 2.1 in the bytecode. I don't know what the difference between a and b is. IIRC ps_2_a was the NVIDIA variant and ps_2_b the ATI variant at the time, and they had slightly different features and limits. I'm not sure the differences are particularly well documented anywhere, but you may be able to find something in the GPU vendor documentation from back then.
Odd, given they seem to emit the same bytecode. I guess those features must have been checked at compile time.




