This allocates these structs and arrays of structures on the stack instead of the heap (which is expensive and takes time!)
These structures and arrays of structures are small, and this is all super duper hot path code.
We need to forceinline these conversion functions in the compiler as otherwise the alloca would invalidate their inlining.
Signed-off-by: Joshua Ashton joshua@froggi.es --- dlls/winevulkan/make_vulkan | 142 ++---------------------------------- dlls/winevulkan/vulkan.c | 13 +--- 2 files changed, 8 insertions(+), 147 deletions(-)
diff --git a/dlls/winevulkan/make_vulkan b/dlls/winevulkan/make_vulkan index 3593410041..954d484933 100755 --- a/dlls/winevulkan/make_vulkan +++ b/dlls/winevulkan/make_vulkan @@ -666,13 +666,6 @@ class VkFunction(object):
body += p.copy(Direction.OUTPUT)
- # Perform any required cleanups. Most of these are for array functions. - for p in self.params: - if not p.needs_free(): - continue - - body += p.free() - # Finally return the result. if self.type != "void": body += " return result;\n" @@ -1084,9 +1077,6 @@ class VkMember(object): else: conversions.append(ConversionFunction(False, False, direction, struct))
- if self.needs_free(): - conversions.append(FreeFunction(self.is_dynamic_array(), struct)) - return conversions
def is_const(self): @@ -1152,17 +1142,6 @@ class VkMember(object): struct = self.type_info["data"] return struct.needs_conversion()
- def needs_free(self): - if not self.needs_conversion(): - return False - - if self.is_dynamic_array(): - return True - - # TODO: some non-pointer structs and optional pointer structs may need freeing, - # though none of this type have been encountered yet. - return False - def needs_struct_extensions_conversion(self): if not self.is_struct(): return False @@ -1248,11 +1227,6 @@ class VkParam(object): if self._direction in [Direction.INPUT_OUTPUT, Direction.OUTPUT]: self.output_conv = ConversionFunction(False, self.is_dynamic_array(), Direction.OUTPUT, self.struct)
- # Dynamic arrays, but also some normal structs (e.g. VkCommandBufferBeginInfo) need memory - # allocation and thus some cleanup. - if self.is_dynamic_array() or self.struct.needs_free(): - self.free_func = FreeFunction(self.is_dynamic_array(), self.struct) - def _set_direction(self): """ Internal helper function to set parameter direction (input/output/input_output). """
@@ -1380,20 +1354,6 @@ class VkParam(object): def format_string(self): return self.format_str
- def free(self): - if self.is_dynamic_array(): - if self.struct.returnedonly: - # For returnedonly, counts is stored in a pointer. - return " free_{0}_array({1}_host, *{2});\n".format(self.type, self.name, self.dyn_array_len) - else: - return " free_{0}_array({1}_host, {2});\n".format(self.type, self.name, self.dyn_array_len) - else: - # We are operating on a single structure. Some structs (very rare) contain dynamic members, - # which would need freeing. - if self.struct.needs_free(): - return " free_{0}(&{1}_host);\n".format(self.type, self.name) - return "" - def get_conversions(self): """ Get a list of conversions required for this parameter if any. Parameters which are structures may require conversion between win32 @@ -1430,8 +1390,6 @@ class VkParam(object): conversions.append(self.input_conv) if self.output_conv is not None: conversions.append(self.output_conv) - if self.free_func is not None: - conversions.append(self.free_func)
return conversions
@@ -1479,9 +1437,6 @@ class VkParam(object):
return False
- def needs_free(self): - return self.free_func is not None - def needs_input_conversion(self): return self.input_conv is not None
@@ -1694,17 +1649,6 @@ class VkStruct(Sequence): return True return False
- def needs_free(self): - """ Check if any struct member needs some memory freeing.""" - - for m in self.members: - if m.needs_free(): - return True - - continue - - return False - def needs_struct_extensions_conversion(self): """ Checks if structure extensions in pNext chain need conversion. """ ret = False @@ -1750,7 +1694,10 @@ class ConversionFunction(object): return_type = "{0}_host".format(self.type)
# Generate function prototype. - body = "static inline {0} *{1}(".format(return_type, self.name) + # We *must* forceinline otherwise our alloca will be + # garbage when this function returns. + # Regular inline won't cut it, alloca invalidates regular inlining. + body = "static WINEVULKAN_FORCEINLINE {0} *{1}(".format(return_type, self.name) body += ", ".join(p for p in params) body += ")\n{\n"
@@ -1758,7 +1705,7 @@ class ConversionFunction(object): body += " unsigned int i;\n\n" body += " if (!in) return NULL;\n\n"
- body += " out = heap_alloc(count * sizeof(*out));\n" + body += " out = WINEVULKAN_ALLOCA(count * sizeof(*out));\n"
body += " for (i = 0; i < count; i++)\n" body += " {\n" @@ -1781,7 +1728,7 @@ class ConversionFunction(object): else: params = ["const {0} *in".format(self.type), "{0}_host *out".format(self.type)]
- body = "static inline void {0}(".format(self.name) + body = "static WINEVULKAN_FORCEINLINE void {0}(".format(self.name)
# Generate parameter list body += ", ".join(p for p in params) @@ -1814,7 +1761,7 @@ class ConversionFunction(object): params = ["const {0} *in".format(self.type), "{0} *out_host".format(self.type), "uint32_t count"]
# Generate function prototype. - body = "static inline void {0}(".format(self.name) + body = "static WINEVULKAN_FORCEINLINE void {0}(".format(self.name) body += ", ".join(p for p in params) body += ")\n{\n" body += " unsigned int i;\n\n" @@ -1856,81 +1803,6 @@ class ConversionFunction(object): else: return self._generate_conversion_func()
- -class FreeFunction(object): - def __init__(self, dyn_array, struct): - self.dyn_array = dyn_array - self.struct = struct - self.type = struct.name - - if dyn_array: - self.name = "free_{0}_array".format(self.type) - else: - self.name = "free_{0}".format(self.type) - - def __eq__(self, other): - return self.name == other.name - - def _generate_array_free_func(self): - """ Helper function for cleaning up temporary buffers required for array conversions. """ - - # Generate function prototype. - body = "static inline void {0}({1}_host *in, uint32_t count)\n{{\n".format(self.name, self.type) - - # E.g. VkGraphicsPipelineCreateInfo_host needs freeing for pStages. - if self.struct.needs_free(): - body += " unsigned int i;\n\n" - body += " if (!in) return;\n\n" - body += " for (i = 0; i < count; i++)\n" - body += " {\n" - - for m in self.struct: - if m.needs_conversion() and m.is_dynamic_array(): - if m.is_const(): - # Add a cast to ignore const on conversion structs we allocated ourselves. - body += " free_{0}_array(({0}_host *)in[i].{1}, in[i].{2});\n".format(m.type, m.name, m.dyn_array_len) - else: - body += " free_{0}_array(in[i].{1}, in[i].{2});\n".format(m.type, m.name, m.dyn_array_len) - elif m.needs_conversion(): - LOGGER.error("Unhandled conversion for {0}".format(m.name)) - body += " }\n" - else: - body += " if (!in) return;\n\n" - - body += " heap_free(in);\n" - - body += "}\n\n" - return body - - def _generate_free_func(self): - # E.g. VkCommandBufferBeginInfo.pInheritanceInfo needs freeing. - if not self.struct.needs_free(): - return "" - - # Generate function prototype. - body = "static inline void {0}({1}_host *in)\n{{\n".format(self.name, self.type) - - for m in self.struct: - if m.needs_conversion() and m.is_dynamic_array(): - count = m.dyn_array_len if isinstance(m.dyn_array_len, int) else "in->{0}".format(m.dyn_array_len) - if m.is_const(): - # Add a cast to ignore const on conversion structs we allocated ourselves. - body += " free_{0}_array(({0}_host *)in->{1}, {2});\n".format(m.type, m.name, count) - else: - body += " free_{0}_array(in->{1}, {2});\n".format(m.type, m.name, count) - - body += "}\n\n" - return body - - def definition(self): - if self.dyn_array: - return self._generate_array_free_func() - else: - # Some structures need freeing too if they contain dynamic arrays. - # E.g. VkCommandBufferBeginInfo - return self._generate_free_func() - - class StructChainConversionFunction(object): def __init__(self, direction, struct): self.direction = direction diff --git a/dlls/winevulkan/vulkan.c b/dlls/winevulkan/vulkan.c index 59472bcef8..1cc1b4f61f 100644 --- a/dlls/winevulkan/vulkan.c +++ b/dlls/winevulkan/vulkan.c @@ -520,23 +520,12 @@ void WINAPI wine_vkCmdExecuteCommands(VkCommandBuffer buffer, uint32_t count, if (!buffers || !count) return;
- /* Unfortunately we need a temporary buffer as our command buffers are wrapped. - * This call is called often and if a performance concern, we may want to use - * alloca as we shouldn't need much memory and it needs to be cleaned up after - * the call anyway. - */ - if (!(tmp_buffers = heap_alloc(count * sizeof(*tmp_buffers)))) - { - ERR("Failed to allocate memory for temporary command buffers\n"); - return; - } + tmp_buffers = WINEVULKAN_ALLOCA(count * sizeof(*tmp_buffers));
for (i = 0; i < count; i++) tmp_buffers[i] = buffers[i]->command_buffer;
buffer->device->funcs.p_vkCmdExecuteCommands(buffer->command_buffer, count, tmp_buffers); - - heap_free(tmp_buffers); }
VkResult WINAPI wine_vkCreateDevice(VkPhysicalDevice phys_dev,
Hi,
While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=66438
Your paranoid android.
=== debiant (build log) ===
error: patch failed: dlls/winevulkan/vulkan_private.h:34 error: patch failed: dlls/winevulkan/make_vulkan:666 error: patch failed: dlls/winevulkan/vulkan.c:520 Task: Patch failed to apply
=== debiant (build log) ===
error: patch failed: dlls/winevulkan/vulkan_private.h:34 error: patch failed: dlls/winevulkan/make_vulkan:666 error: patch failed: dlls/winevulkan/vulkan.c:520 Task: Patch failed to apply
On Thu, Mar 5, 2020 at 11:03 AM Joshua Ashton joshua@froggi.es wrote:
This allocates these structs and arrays of structures on the stack instead of the heap (which is expensive and takes time!)
These structures and arrays of structures are small, and this is all super duper hot path code.
Would it make sense to keep the heap allocation as a slow-path if the assumption about the allocation size is wrong? For example, if count is less than 32 use alloca(), otherwise use the heap to avoid a stack overflow.
On 3/5/20 12:25 PM, Andrew Wesie wrote:
On Thu, Mar 5, 2020 at 11:03 AM Joshua Ashton joshua@froggi.es wrote:
This allocates these structs and arrays of structures on the stack instead of the heap (which is expensive and takes time!)
These structures and arrays of structures are small, and this is all super duper hot path code.
Would it make sense to keep the heap allocation as a slow-path if the assumption about the allocation size is wrong? For example, if count is less than 32 use alloca(), otherwise use the heap to avoid a stack overflow.
Agreed, I think this is a case where it would be good to get some performance numbers to help tune our expectations for what the high-mark for would be for normal size/usage.
For the wine_vkCmdExecuteCommands() implementation I'm not too concerned with any limit over 64-bytes (which I would consider small for some of the other structures I'll list below), since according to pahole(1) VkCommandBuffer_T takes up 20-bytes, and off-memory I've never seen a single call to vkCmdExecuteCommands supply a count of more than three command buffers.
Here's a list of the different _host conversion structs and their respective sizes:
VkMemoryHeap_host 12 VkBufferDeviceAddressInfoKHR_host 16 VkBufferMemoryRequirementsInfo2_host 16 VkCommandBufferBeginInfo_host 16 VkDeviceMemoryOpaqueCaptureAddressInfoKHR_host 16 VkImageMemoryRequirementsInfo2_host 16 VkImageSparseMemoryRequirementsInfo2_host 16 VkPerformanceMarkerInfoINTEL_host 16 VkPipelineInfoKHR_host 16 VkSparseBufferMemoryBindInfo_host 16 VkSparseImageMemoryBindInfo_host 16 VkSparseImageOpaqueMemoryBindInfo_host 16 VkAccelerationStructureMemoryRequirementsInfoNV_host 20 VkAcquireProfilingLockInfoKHR_host 20 VkDescriptorImageInfo_host 20 VkMemoryAllocateInfo_host 20 VkMemoryRequirements_host 20 VkPipelineExecutableInfoKHR_host 20 VkBufferCopy_host 24 VkCommandBufferAllocateInfo_host 24 VkDescriptorBufferInfo_host 24 VkDescriptorSetAllocateInfo_host 24 VkPerformanceOverrideInfoINTEL_host 24 VkSemaphoreSignalInfoKHR_host 24 VkAccelerationStructureInfoNV_host 28 VkConditionalRenderingBeginInfoEXT_host 28 VkMemoryRequirements2_host 28 VkMemoryRequirements2KHR_host 28 VkBindBufferMemoryInfo_host 32 VkBindImageMemoryInfo_host 32 VkGeometryAABBNV_host 32 VkImageFormatProperties_host 32 VkMappedMemoryRange_host 32 VkPipelineShaderStageCreateInfo_host 32 VkBufferCreateInfo_host 36 VkSparseMemoryBind_host 36 VkBindAccelerationStructureMemoryInfoNV_host 40 VkBufferViewCreateInfo_host 40 VkCommandBufferInheritanceInfo_host 40 VkFramebufferCreateInfo_host 40 VkImageFormatProperties2_host 40 VkSubresourceLayout_host 40 VkAccelerationStructureCreateInfoNV_host 44 VkAcquireNextImageInfoKHR_host 44 VkCopyDescriptorSet_host 44 VkWriteDescriptorSet_host 44 VkBindSparseInfo_host 48 VkBufferMemoryBarrier_host 48 VkDescriptorUpdateTemplateCreateInfo_host 48 VkRenderPassBeginInfo_host 48 VkRayTracingPipelineCreateInfoNV_host 52 VkBufferImageCopy_host 56 VkSparseImageMemoryBind_host 56 VkImageMemoryBarrier_host 60 VkComputePipelineCreateInfo_host 64 VkImageViewCreateInfo_host 64 VkGeometryTrianglesNV_host 80 VkSwapchainCreateInfoKHR_host 84 VkGraphicsPipelineCreateInfo_host 88 VkGeometryDataNV_host 112 VkGeometryNV_host 128 VkPhysicalDeviceMemoryProperties_host 456 VkPhysicalDeviceMemoryProperties2_host 464 VkPhysicalDeviceLimits_host 488 VkPhysicalDeviceProperties_host 800 VkPhysicalDeviceProperties2_host 808
My only concern here would be with the VkPhysicalDevice* structures, namely VkPhysicalDeviceMemoryProperties2_host since it's usage in VK_EXT_memory_budget gives it the potential to be used in a hot-path (or at the very least lukewarm-path) for an application.
How about setting the threshold that Andrew is suggesting to 4k bytes and seeing if that strikes the right balance of perf/safety?
Thanks,
Liam Middlebrook
----------------------------------------------------------------------------------- This email message is for the sole use of the intended recipient(s) and may contain confidential information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please contact the sender by reply email and destroy all copies of the original message. -----------------------------------------------------------------------------------
On Thu, Mar 5, 2020 at 9:26 PM Andrew Wesie awesie@gmail.com wrote:
On Thu, Mar 5, 2020 at 11:03 AM Joshua Ashton joshua@froggi.es wrote:
This allocates these structs and arrays of structures on the stack instead of the heap (which is expensive and takes time!)
These structures and arrays of structures are small, and this is all super duper hot path code.
Would it make sense to keep the heap allocation as a slow-path if the assumption about the allocation size is wrong? For example, if count is less than 32 use alloca(), otherwise use the heap to avoid a stack overflow.
Or even use a fixed-size stack array in the "small" case to avoid alloca() entirely.
Am 05.03.20 um 23:25 schrieb Andrew Wesie:
Would it make sense to keep the heap allocation as a slow-path if the assumption about the allocation size is wrong? For example, if count is less than 32 use alloca(), otherwise use the heap to avoid a stack overflow.
Similar suggestion: If we keep the heap codepath anyway and have a max alloca size, would it make sense to have a fixed size local buffer instead of alloca for the fast case and heap_alloc if something huge comes along?