From 8b7dcd7a4479fb67210a648b1dbf76b61b4a7bfb Mon Sep 17 00:00:00 2001 From: Raboneko <119771935+raboneko@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:40:57 -0700 Subject: [PATCH] feat: Add AMD Anti-lag backport to Mesa 25.2.0 (#6041) (#6042) * chore: Bump mesa release number * chore: Update bazzite.patch * chore: Add some missing commits * chore: Explicitly enable anti-lag layer in spec * chore: Add anti-lag files to spec --------- (cherry picked from commit 7dcb88942c1f05d11d9aedaf297a32b6c7a21092) Signed-off-by: Kyle Gospodnetich Co-authored-by: Kyle Gospodnetich --- anda/lib/mesa/bazzite.patch | 2264 ++++++++++++++++++++++++++++++++++- anda/lib/mesa/mesa.spec | 6 +- 2 files changed, 2249 insertions(+), 21 deletions(-) diff --git a/anda/lib/mesa/bazzite.patch b/anda/lib/mesa/bazzite.patch index a0d329361f..e05c18db0c 100644 --- a/anda/lib/mesa/bazzite.patch +++ b/anda/lib/mesa/bazzite.patch @@ -1,16 +1,7 @@ -From cc3cc28e7b1e76d3640be7a497271475fdcfc550 Mon Sep 17 00:00:00 2001 -From: Antheas Kapenekakis -Date: Sat, 15 Mar 2025 16:39:08 +0100 -Subject: [PATCH 1/8] [BEGIN] SteamOS Changes - --- -2.50.1 - - From 21b062a757a202dcb737d40442b6145c34bb1e48 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Fri, 14 Jan 2022 15:58:45 +0100 -Subject: [PATCH 2/8] STEAMOS: radv: min image count override for FH5 +Subject: [PATCH 01/11] STEAMOS: radv: min image count override for FH5 Otherwise in combination with the vblank time reservation in gamescope the game could get stuck in low power states. 
@@ -39,8 +30,8 @@ index b82e8d4da4d..c8d059571ad 100644 From e837814b4f33e48eaf6a79975cb738da39ed0fd2 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Thu, 22 Feb 2024 22:32:45 +0100 -Subject: [PATCH 3/8] STEAMOS: Dynamic swapchain override for gamescope limiter - for DRI3 only +Subject: [PATCH 02/11] STEAMOS: Dynamic swapchain override for gamescope + limiter for DRI3 only The original patch (from Bas) contained WSI VK support too but it's been removed because the Gamescope WSI layer already handles that. @@ -146,7 +137,7 @@ index 26f138d1b83..3f0f3f66fac 100644 From 354cf8783e49b082c97982f2e5be305ad6e4ab50 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Sat, 15 Mar 2025 16:39:25 +0100 -Subject: [PATCH 4/8] [BEGIN] SteamOS Backports +Subject: [PATCH 03/11] [BEGIN] SteamOS Backports -- 2.50.1 @@ -155,7 +146,7 @@ Subject: [PATCH 4/8] [BEGIN] SteamOS Backports From c5a4eab20075dfa2f2bdfb87e55ecec262ef00f6 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Sat, 15 Mar 2025 16:39:33 +0100 -Subject: [PATCH 5/8] [BEGIN] Our Mesa backports +Subject: [PATCH 04/11] [BEGIN] Our Mesa backports -- 2.50.1 @@ -164,7 +155,7 @@ Subject: [PATCH 5/8] [BEGIN] Our Mesa backports From 221b11df6d9cd7b66c8502fa51d8d72cfc377e5e Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Mon, 24 Mar 2025 19:50:51 +0100 -Subject: [PATCH 6/8] Revert "winsys/amdgpu: use VM_ALWAYS_VALID for all VRAM +Subject: [PATCH 05/11] Revert "winsys/amdgpu: use VM_ALWAYS_VALID for all VRAM and GTT allocations" This reverts commit 8c91624614c1f939974fe0d2d1a3baf83335cecb. 
@@ -194,19 +185,2254 @@ index d5646e9660b..a51348b44a8 100644 2.50.1 -From 21c90507cdbb7c2ca23b5d59421b28ac8081051f Mon Sep 17 00:00:00 2001 +From cf8c0d66ed49f99d0d259c28fe72174d58c06de7 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= +Date: Mon, 24 Mar 2025 21:25:29 +0100 +Subject: [PATCH 06/11] vulkan: implement VK_AMD_anti_lag as implicit vulkan + layer + +VkLayer_MESA_anti_lag is a lightweight implicit layer which provides +an open-source implementation of the VK_AMD_anti_lag vulkan extension. + +The algorithm used by this layer is very simplistic and only aims to +minimize the delay between calls to vkQueueSubmit or vkQueueSubmit2 +and the begin of the execution of the submission. + +In order to build VkLayer_MESA_anti_lag, pass -Dlayers=anti-lag to meson. +It is possible to either install the layer or to use + + VK_ADD_IMPLICIT_LAYER_PATH=/share/vulkan/implicit_layer.d/ + +for testing purposes. +(Keep in mind that you have to adjust the library_path in the json file in that case.) 
+ +Part-of: +--- + meson.build | 1 + + meson.options | 2 +- + .../anti-lag-layer/VkLayer_MESA_anti_lag.json | 26 + + src/vulkan/anti-lag-layer/anti_lag_layer.c | 590 ++++++++++++ + src/vulkan/anti-lag-layer/anti_lag_layer.h | 111 +++ + .../anti-lag-layer/anti_lag_layer_interface.c | 899 ++++++++++++++++++ + src/vulkan/anti-lag-layer/meson.build | 26 + + src/vulkan/anti-lag-layer/ringbuffer.h | 58 ++ + src/vulkan/meson.build | 3 + + 9 files changed, 1715 insertions(+), 1 deletion(-) + create mode 100644 src/vulkan/anti-lag-layer/VkLayer_MESA_anti_lag.json + create mode 100644 src/vulkan/anti-lag-layer/anti_lag_layer.c + create mode 100644 src/vulkan/anti-lag-layer/anti_lag_layer.h + create mode 100644 src/vulkan/anti-lag-layer/anti_lag_layer_interface.c + create mode 100644 src/vulkan/anti-lag-layer/meson.build + create mode 100644 src/vulkan/anti-lag-layer/ringbuffer.h + +diff --git a/meson.build b/meson.build +index 427cfde435c..c6c6457abae 100644 +--- a/meson.build ++++ b/meson.build +@@ -95,6 +95,7 @@ with_vulkan_overlay_layer = get_option('vulkan-layers').contains('overlay') + with_vulkan_device_select_layer = get_option('vulkan-layers').contains('device-select') + with_vulkan_screenshot_layer = get_option('vulkan-layers').contains('screenshot') + with_vulkan_vram_report_limit_layer = get_option('vulkan-layers').contains('vram-report-limit') ++with_vulkan_anti_lag_layer = get_option('vulkan-layers').contains('anti-lag') + with_tools = get_option('tools') + if with_tools.contains('all') + with_tools = [ +diff --git a/meson.options b/meson.options +index c3c02c4c94f..cd0e56cc429 100644 +--- a/meson.options ++++ b/meson.options +@@ -299,7 +299,7 @@ option( + type : 'array', + value : [], + choices : [ +- 'device-select', 'intel-nullhw', 'overlay', 'screenshot', ++ 'device-select', 'intel-nullhw', 'overlay', 'screenshot', 'anti-lag', + 'vram-report-limit', + ], + description : 'List of vulkan layers to build' +diff --git 
a/src/vulkan/anti-lag-layer/VkLayer_MESA_anti_lag.json b/src/vulkan/anti-lag-layer/VkLayer_MESA_anti_lag.json +new file mode 100644 +index 00000000000..4e2ab794c9e +--- /dev/null ++++ b/src/vulkan/anti-lag-layer/VkLayer_MESA_anti_lag.json +@@ -0,0 +1,26 @@ ++{ ++ "file_format_version": "1.2.1", ++ "layer": { ++ "name": "VK_LAYER_MESA_anti_lag", ++ "type": "GLOBAL", ++ "library_path": "libVkLayer_MESA_anti_lag.so", ++ "api_version": "1.4.303", ++ "implementation_version": "1", ++ "description": "Open-source implementation of the VK_AMD_anti_lag extension.", ++ "functions": { ++ "vkNegotiateLoaderLayerInterfaceVersion": "anti_lag_NegotiateLoaderLayerInterfaceVersion" ++ }, ++ "device_extensions": [ ++ { ++ "name": "VK_AMD_anti_lag", ++ "spec_version": "1", ++ "entrypoints": [ ++ "vkAntiLagUpdateAMD" ++ ] ++ } ++ ], ++ "disable_environment": { ++ "DISABLE_LAYER_MESA_ANTI_LAG": "1" ++ } ++ } ++} +\ No newline at end of file +diff --git a/src/vulkan/anti-lag-layer/anti_lag_layer.c b/src/vulkan/anti-lag-layer/anti_lag_layer.c +new file mode 100644 +index 00000000000..6c21e074024 +--- /dev/null ++++ b/src/vulkan/anti-lag-layer/anti_lag_layer.c +@@ -0,0 +1,590 @@ ++/* ++ * Copyright © 2025 Valve Corporation ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "anti_lag_layer.h" ++#include ++#include "util/os_time.h" ++#include "util/simple_mtx.h" ++#include "vulkan/vulkan_core.h" ++#include "ringbuffer.h" ++#include "vk_alloc.h" ++#include "vk_util.h" ++ ++static bool ++evaluate_frame(device_context *ctx, frame *frame, bool force_wait) ++{ ++ if (frame->state != FRAME_PRESENT) { ++ /* This frame is not finished yet. */ ++ assert(!force_wait); ++ return false; ++ } ++ ++ int query_flags = VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT; ++ const uint32_t frame_idx = ringbuffer_index(ctx->frames, frame); ++ ++ /* Before we commit to completing a frame, all submits on all queues must have completed. 
*/ ++ for (unsigned i = 0; i < ctx->num_queues; i++) { ++ queue_context *queue_ctx = &ctx->queues[i]; ++ ringbuffer_lock(queue_ctx->queries); ++ uint64_t expected_signal_value = queue_ctx->semaphore_value - queue_ctx->queries.size + ++ queue_ctx->submissions_per_frame[frame_idx]; ++ ringbuffer_unlock(queue_ctx->queries); ++ ++ if (force_wait) { ++ /* Wait for the timeline semaphore of the frame to be signaled. */ ++ struct VkSemaphoreWaitInfo wait_info = { ++ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, ++ .semaphoreCount = 1, ++ .pSemaphores = &queue_ctx->semaphore, ++ .pValues = &expected_signal_value, ++ }; ++ ctx->vtable.WaitSemaphores(ctx->device, &wait_info, 0); ++ } else { ++ /* Return early if the last timeline semaphore of the frame has not been signaled yet. */ ++ uint64_t signal_value; ++ ctx->vtable.GetSemaphoreCounterValue(ctx->device, queue_ctx->semaphore, &signal_value); ++ if (signal_value < expected_signal_value) ++ return false; ++ } ++ } ++ ++ /* For each queue, retrieve timestamp query results. */ ++ for (unsigned i = 0; i < ctx->num_queues; i++) { ++ queue_context *queue_ctx = &ctx->queues[i]; ++ ++ /* As we hold a global mtx and this is the only place where queries are free'd, ++ * we don't need to lock the query ringbuffer here in order to read the first entry. ++ */ ++ struct query *query = ringbuffer_first(queue_ctx->queries); ++ uint32_t query_idx = ringbuffer_index(queue_ctx->queries, query); ++ int num_timestamps = ++ MIN2(queue_ctx->submissions_per_frame[frame_idx], MAX_QUERIES - query_idx); ++ ++ while (num_timestamps > 0) { ++ /* Retrieve timestamp results from this queue. */ ++ ctx->vtable.GetQueryPoolResults(ctx->device, queue_ctx->queryPool, query_idx, ++ num_timestamps, sizeof(uint64_t), &query->begin_gpu_ts, ++ sizeof(struct query), query_flags); ++ ++ ringbuffer_lock(queue_ctx->queries); ++ for (unsigned j = 0; j < num_timestamps; j++) { ++ ++ /* Calibrate device timestamps.
*/ ++ query->begin_gpu_ts = ++ ctx->calibration.delta + ++ (uint64_t)(query->begin_gpu_ts * ctx->calibration.timestamp_period); ++ if (query->begin_gpu_ts > query->submit_cpu_ts) ++ frame->min_delay = ++ MIN2(frame->min_delay, query->begin_gpu_ts - query->submit_cpu_ts); ++ ++ /* Check if we can reset half of the query pool at once. */ ++ uint32_t next_idx = ringbuffer_index(queue_ctx->queries, query) + 1; ++ const bool reset = next_idx == MAX_QUERIES || next_idx == MAX_QUERIES / 2; ++ if (reset) { ++ ringbuffer_unlock(queue_ctx->queries); ++ ctx->vtable.ResetQueryPool(ctx->device, queue_ctx->queryPool, ++ next_idx - MAX_QUERIES / 2, MAX_QUERIES / 2); ++ ringbuffer_lock(queue_ctx->queries); ++ } ++ ++ /* Free query. */ ++ ringbuffer_free(queue_ctx->queries, query); ++ queue_ctx->submissions_per_frame[frame_idx]--; ++ ++ query = ringbuffer_first(queue_ctx->queries); ++ } ++ ++ /* Ensure that the total number of queries across all frames is correct. */ ++ ASSERTED uint32_t count = 0; ++ for (unsigned i = 0; i < MAX_FRAMES; i++) ++ count += queue_ctx->submissions_per_frame[i]; ++ assert(count == queue_ctx->queries.size); ++ ++ query_idx = ringbuffer_index(queue_ctx->queries, query); ++ num_timestamps = ++ MIN2(queue_ctx->submissions_per_frame[frame_idx], MAX_QUERIES - query_idx); ++ ++ ringbuffer_unlock(queue_ctx->queries); ++ } ++ } ++ ++ frame->min_delay++; /* wrap UINT64_MAX in case we didn't have any submissions. 
*/ ++ ++ return true; ++} ++ ++static bool ++calibrate_timestamps(device_context *ctx) ++{ ++ uint64_t ts[2]; ++ uint64_t deviation; ++ ++ VkCalibratedTimestampInfoKHR info[2] = { ++ { ++ .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR, ++ .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR, ++ }, ++ { ++ .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR, ++ .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR, ++ }, ++ }; ++ ++ VkResult result = ctx->vtable.GetCalibratedTimestampsKHR(ctx->device, 2, info, ts, &deviation); ++ if (result == VK_SUCCESS) { ++ /* We take a moving average in order to avoid variance. */ ++ int64_t new_delta = ts[0] - (int64_t)(ts[1] * ctx->calibration.timestamp_period); ++ ++ if (ctx->calibration.delta == 0) { ++ ctx->calibration.delta = new_delta; ++ } else { ++ int64_t diff = new_delta - ctx->calibration.delta; ++ ctx->calibration.delta += diff / 8; ++ } ++ ++ /* Take a new calibrated timestamp every second. */ ++ ctx->calibration.recalibrate_when = ts[0] + 1000000000ull; ++ } ++ ++ return result == VK_SUCCESS; ++} ++ ++static void ++begin_next_frame(device_context *ctx) ++{ ++ frame *next_frame; ++ if (ctx->active_frame) { ++ assert(ctx->active_frame->state == FRAME_SUBMIT); ++ ctx->active_frame->state = FRAME_PRESENT; ++ next_frame = ringbuffer_next(ctx->frames, ctx->active_frame); ++ } else { ++ next_frame = ringbuffer_last(ctx->frames); ++ } ++ ++ /* If there is a frame ready, it becomes active. */ ++ if (next_frame->state == FRAME_INPUT) { ++ next_frame->state = FRAME_SUBMIT; ++ ctx->active_frame = next_frame; ++ } else { ++ ctx->active_frame = NULL; ++ } ++} ++ ++static void ++anti_lag_disable(device_context *ctx) ++{ ++ ringbuffer_lock(ctx->frames); ++ while (ctx->frames.size) { ++ /* Set force-wait=true, so that all pending timestamp queries get completed. 
*/ ++ begin_next_frame(ctx); ++ frame *frame = ringbuffer_first(ctx->frames); ++ evaluate_frame(ctx, frame, true); ++ frame->state = FRAME_INVALID; ++ ringbuffer_free(ctx->frames, frame); ++ } ++ assert(!ctx->active_frame); ++ ringbuffer_unlock(ctx->frames); ++} ++ ++#define TARGET_DELAY 4000000ll /* 4 ms */ ++/** ++ * Returns the amount of time that we want the next frame to be delayed. ++ * ++ * The algorithm used by this function is very simplistic and only aims ++ * to minimize the delay between calls to vkQueueSubmit or vkQueueSubmit2 ++ * and the begin of the execution of the submission. ++ */ ++static int64_t ++get_wait_time(device_context *ctx) ++{ ++ /* Take the previous evaluated frame's delay as baseline. */ ++ int64_t imposed_delay = ctx->base_delay; ++ int64_t adaptation = 0; ++ ++ ringbuffer_lock(ctx->frames); ++ /* In case our ringbuffer is completely full and no frame is in PRESENT stage, ++ * just move the oldest frame to PRESENT stage, and force-wait. ++ */ ++ bool force_wait = ctx->frames.size == MAX_FRAMES; ++ frame *next_frame = ringbuffer_first(ctx->frames); ++ if (force_wait && next_frame->state != FRAME_PRESENT) ++ begin_next_frame(ctx); ++ ++ /* Also force-wait for the oldest frame if there is already 2 frames in PRESENT stage. */ ++ force_wait |= ringbuffer_next(ctx->frames, next_frame)->state == FRAME_PRESENT; ++ ringbuffer_unlock(ctx->frames); ++ ++ /* Take new evaluated frames into consideration. */ ++ while (evaluate_frame(ctx, next_frame, force_wait)) { ++ ++ if (next_frame->min_delay < TARGET_DELAY / 2 && ctx->adaptation <= 0) { ++ /* If there is no delay between submission and GPU start, halve the base delay and ++ * set the delay for this frame to zero, in order to account for sudden changes. ++ */ ++ ctx->base_delay = ctx->base_delay / 2; ++ adaptation = -ctx->base_delay; ++ } else { ++ /* We use some kind of exponential weighted moving average function here, ++ * in order to determine a base-delay. 
We use a smoothing-factor of roughly ++ * 3%, but don't discount the previous value. This helps keep the delay ++ * slightly below the target of 4 ms, most of the time. ++ */ ++ int64_t diff = (int64_t)next_frame->min_delay - TARGET_DELAY; ++ ctx->base_delay = MAX2(0, ctx->base_delay + diff / 32); /* corresponds to ~3 % */ ++ ++ /* As the base-delay gets adjusted rather slowly, we additionally use the half of the ++ * diff as adaptation delay to account for sudden changes. A quarter of the adaptation ++ * is then subtracted for the next frame, so that we can avoid overcompensation. ++ */ ++ adaptation = diff / 2 - ctx->adaptation / 4; ++ } ++ ++ /* We only need space for one frame. */ ++ force_wait = false; ++ ++ ringbuffer_lock(ctx->frames); ++ next_frame->state = FRAME_INVALID; ++ ringbuffer_free(ctx->frames, next_frame); ++ next_frame = ringbuffer_first(ctx->frames); ++ ringbuffer_unlock(ctx->frames); ++ } ++ imposed_delay = ctx->base_delay + adaptation; ++ ctx->adaptation = adaptation; ++ ++ if (imposed_delay > 100000000) { ++ /* This corresponds to <10 FPS. Something might have gone wrong. */ ++ calibrate_timestamps(ctx); ++ ctx->base_delay = ctx->adaptation = imposed_delay = 0; ++ } ++ ++ return MAX2(0, imposed_delay); ++} ++ ++static void ++reset_frame(frame *frame) ++{ ++ assert(frame->state == FRAME_INVALID); ++ frame->frame_idx = 0; ++ frame->frame_start_time = 0; ++ frame->min_delay = UINT64_MAX; ++ frame->state = FRAME_INPUT; ++} ++ ++VKAPI_ATTR void VKAPI_CALL ++anti_lag_AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD *pData) ++{ ++ if (pData == NULL) ++ return; ++ ++ device_context *ctx = get_device_context(device); ++ if (pData->mode == VK_ANTI_LAG_MODE_OFF_AMD) { ++ /* Application request to disable Anti-Lag.
*/ ++ simple_mtx_lock(&ctx->mtx); ++ anti_lag_disable(ctx); ++ simple_mtx_unlock(&ctx->mtx); ++ return; ++ } ++ ++ uint64_t frame_idx = 0; ++ int64_t now = os_time_get_nano(); ++ int64_t imposed_delay = 0; ++ int64_t last_frame_begin = 0; ++ ++ if (pData->pPresentationInfo) { ++ /* The same frameIndex value should be used with VK_ANTI_LAG_STAGE_INPUT_AMD before ++ * the frame begins and with VK_ANTI_LAG_STAGE_PRESENT_AMD when the frame ends. ++ */ ++ frame_idx = pData->pPresentationInfo->frameIndex; ++ ++ /* This marks the end of the current frame. */ ++ if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_PRESENT_AMD) { ++ /* If there is already a new frame pending, any submission that happens afterwards ++ * gets associated with the new frame. ++ */ ++ ringbuffer_lock(ctx->frames); ++ /* Check that the currently active frame is indeed the frame we are ending now. */ ++ while (ctx->active_frame && ctx->active_frame->frame_idx <= frame_idx) { ++ begin_next_frame(ctx); ++ } ++ ringbuffer_unlock(ctx->frames); ++ return; ++ } ++ } ++ ++ /* Lock this function, in order to avoid race conditions on frame allocation. */ ++ simple_mtx_lock(&ctx->mtx); ++ ++ /* VK_ANTI_LAG_STAGE_INPUT_AMD: This marks the begin of a new frame. ++ * Evaluate previous frames in order to determine the wait time. ++ */ ++ imposed_delay = get_wait_time(ctx); ++ int64_t next_deadline = now + imposed_delay; ++ ++ /* Ensure maxFPS adherence. */ ++ if (pData->maxFPS) { ++ int64_t frametime_period = 1000000000u / pData->maxFPS; ++ last_frame_begin = ringbuffer_last(ctx->frames)->frame_start_time; ++ next_deadline = MAX2(next_deadline, last_frame_begin + frametime_period); ++ } ++ ++ /* Recalibrate every now and then. */ ++ if (next_deadline > ctx->calibration.recalibrate_when) ++ calibrate_timestamps(ctx); ++ ++ /* Sleep until deadline is met. */ ++ os_time_nanosleep_until(next_deadline); ++ ++ /* Initialize new frame. 
*/ ++ ringbuffer_lock(ctx->frames); ++ frame *new_frame = ringbuffer_alloc(ctx->frames); ++ reset_frame(new_frame); ++ new_frame->frame_start_time = next_deadline; ++ new_frame->imposed_delay = imposed_delay; ++ new_frame->frame_idx = frame_idx; ++ ++ /* Immediately set the frame active if there is no other frame already active. */ ++ if (!ctx->active_frame) ++ begin_next_frame(ctx); ++ ++ ringbuffer_unlock(ctx->frames); ++ simple_mtx_unlock(&ctx->mtx); ++} ++ ++static queue_context * ++get_queue_context(device_context *ctx, VkQueue queue) ++{ ++ for (unsigned i = 0; i < ctx->num_queues; i++) { ++ if (ctx->queues[i].queue == queue) ++ return &ctx->queues[i]; ++ } ++ ++ return NULL; ++} ++ ++static struct query * ++allocate_query(device_context *ctx, queue_context *queue_ctx) ++{ ++ if (!ctx->active_frame) ++ return NULL; ++ ++ /* Allow for a single frame to use at most half of the query pool. */ ++ uint32_t frame_idx = ringbuffer_index(ctx->frames, ctx->active_frame); ++ if (queue_ctx->submissions_per_frame[frame_idx] > MAX_QUERIES / 2) ++ return NULL; ++ ++ /* Check that the next query index has been reset properly: ++ * ++ * We use some double-buffering here in order to reduce the number of ++ * VkResetQueryPool commands. ++ * Return false if the next query-index allocation crosses into the half ++ * which still contains active queries, ++ */ ++ if (queue_ctx->queries.size > MAX_QUERIES / 2) { ++ struct query *last_query = ringbuffer_last(queue_ctx->queries); ++ uint32_t next_idx = ringbuffer_index(queue_ctx->queries, last_query) + 1; ++ if (next_idx == MAX_QUERIES || next_idx == MAX_QUERIES / 2) ++ return NULL; ++ } ++ ++ return ringbuffer_alloc(queue_ctx->queries); ++} ++ ++static bool ++get_commandbuffer(device_context *ctx, queue_context *queue_ctx, VkCommandBuffer *cmdbuffer) ++{ ++ uint64_t now = os_time_get_nano(); ++ ++ /* Begin critical section. 
*/ ++ ringbuffer_lock(ctx->frames); ++ ringbuffer_lock(queue_ctx->queries); ++ struct query *query = allocate_query(ctx, queue_ctx); ++ if (query == NULL) { ++ ringbuffer_unlock(queue_ctx->queries); ++ ringbuffer_unlock(ctx->frames); ++ return false; ++ } ++ ++ query->submit_cpu_ts = now; ++ ++ /* Assign commandBuffer for timestamp. */ ++ *cmdbuffer = query->cmdbuffer; ++ ++ /* Increment timeline semaphore count. */ ++ queue_ctx->semaphore_value++; ++ ++ /* Add new submission entry for the current frame */ ++ assert(ctx->active_frame->state == FRAME_SUBMIT); ++ uint32_t frame_idx = ringbuffer_index(ctx->frames, ctx->active_frame); ++ queue_ctx->submissions_per_frame[frame_idx]++; ++ ++ ringbuffer_unlock(queue_ctx->queries); ++ ringbuffer_unlock(ctx->frames); ++ return true; ++} ++ ++static VkResult ++queue_submit2(device_context *ctx, VkQueue queue, uint32_t submitCount, ++ const VkSubmitInfo2 *pSubmits, VkFence fence, PFN_vkQueueSubmit2 queueSubmit2) ++{ ++ queue_context *queue_ctx = get_queue_context(ctx, queue); ++ if (!ctx->active_frame || !queue_ctx) ++ return queueSubmit2(queue, submitCount, pSubmits, fence); ++ ++ int first = -1; ++ VkCommandBuffer timestamp_cmdbuffer; ++ /* Check if any submission contains commandbuffers. */ ++ for (unsigned i = 0; i < submitCount; i++) { ++ if (pSubmits[i].commandBufferInfoCount) { ++ first = i; ++ break; ++ } ++ } ++ ++ /* Get timestamp commandbuffer. 
*/ ++ if (first == -1 || !get_commandbuffer(ctx, queue_ctx, &timestamp_cmdbuffer)) ++ return queueSubmit2(queue, submitCount, pSubmits, fence); ++ ++ VkSubmitInfo2 *submits; ++ VkCommandBufferSubmitInfo *cmdbuffers; ++ VkSemaphoreSubmitInfo *semaphores; ++ VK_MULTIALLOC(ma); ++ vk_multialloc_add(&ma, &submits, VkSubmitInfo2, submitCount); ++ vk_multialloc_add(&ma, &cmdbuffers, VkCommandBufferSubmitInfo, ++ pSubmits[first].commandBufferInfoCount + 1); ++ vk_multialloc_add(&ma, &semaphores, VkSemaphoreSubmitInfo, ++ pSubmits[first].signalSemaphoreInfoCount + 1); ++ void *buf = vk_multialloc_zalloc(&ma, &ctx->alloc, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); ++ if (!buf) ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ ++ memcpy(submits, pSubmits, sizeof(VkSubmitInfo2) * submitCount); ++ VkSubmitInfo2 *submit_info = &submits[first]; ++ ++ /* Add commandbuffer to submission. */ ++ cmdbuffers[0] = (VkCommandBufferSubmitInfo){ ++ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, ++ .commandBuffer = timestamp_cmdbuffer, ++ }; ++ memcpy(&cmdbuffers[1], submit_info->pCommandBufferInfos, ++ sizeof(VkCommandBufferSubmitInfo) * submit_info->commandBufferInfoCount); ++ submit_info->pCommandBufferInfos = cmdbuffers; ++ submit_info->commandBufferInfoCount++; ++ ++ /* Add timeline semaphore to submission. */ ++ memcpy(semaphores, submit_info->pSignalSemaphoreInfos, ++ sizeof(VkSemaphoreSubmitInfo) * submit_info->signalSemaphoreInfoCount); ++ semaphores[submit_info->signalSemaphoreInfoCount] = (VkSemaphoreSubmitInfo){ ++ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, ++ .semaphore = queue_ctx->semaphore, ++ .value = queue_ctx->semaphore_value, ++ .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, ++ }; ++ submit_info->pSignalSemaphoreInfos = semaphores; ++ submit_info->signalSemaphoreInfoCount++; ++ ++ /* Submit with added timestamp query commandbuffer.
*/ ++ VkResult res = queueSubmit2(queue, submitCount, submits, fence); ++ vk_free(&ctx->alloc, submits); ++ return res; ++} ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++anti_lag_QueueSubmit2KHR(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, ++ VkFence fence) ++{ ++ device_context *ctx = get_device_context(queue); ++ return queue_submit2(ctx, queue, submitCount, pSubmits, fence, ctx->vtable.QueueSubmit2KHR); ++} ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++anti_lag_QueueSubmit2(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, ++ VkFence fence) ++{ ++ device_context *ctx = get_device_context(queue); ++ return queue_submit2(ctx, queue, submitCount, pSubmits, fence, ctx->vtable.QueueSubmit2); ++} ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, ++ VkFence fence) ++{ ++ device_context *ctx = get_device_context(queue); ++ queue_context *queue_ctx = get_queue_context(ctx, queue); ++ if (!ctx->active_frame || !queue_ctx) ++ return ctx->vtable.QueueSubmit(queue, submitCount, pSubmits, fence); ++ ++ int first = -1; ++ VkCommandBuffer timestamp_cmdbuffer; ++ /* Check if any submission contains commandbuffers. */ ++ for (unsigned i = 0; i < submitCount; i++) { ++ if (pSubmits[i].commandBufferCount) { ++ first = i; ++ break; ++ } ++ } ++ ++ /* Get timestamp commandbuffer. 
*/ ++ if (first == -1 || !get_commandbuffer(ctx, queue_ctx, &timestamp_cmdbuffer)) ++ return ctx->vtable.QueueSubmit(queue, submitCount, pSubmits, fence); ++ ++ VkSubmitInfo *submits; ++ VkCommandBuffer *cmdbuffers; ++ VkSemaphore *semaphores; ++ VkTimelineSemaphoreSubmitInfo *semaphore_info; ++ uint64_t *semaphore_values; ++ VK_MULTIALLOC(ma); ++ vk_multialloc_add(&ma, &submits, VkSubmitInfo, submitCount); ++ vk_multialloc_add(&ma, &cmdbuffers, VkCommandBuffer, pSubmits[first].commandBufferCount + 1); ++ vk_multialloc_add(&ma, &semaphores, VkSemaphore, pSubmits[first].signalSemaphoreCount + 1); ++ vk_multialloc_add(&ma, &semaphore_info, VkTimelineSemaphoreSubmitInfo, 1); ++ vk_multialloc_add(&ma, &semaphore_values, uint64_t, pSubmits[first].signalSemaphoreCount + 1); ++ void *buf = vk_multialloc_zalloc(&ma, &ctx->alloc, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); ++ if (!buf) ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ ++ memcpy(submits, pSubmits, sizeof(VkSubmitInfo) * submitCount); ++ VkSubmitInfo *submit_info = &submits[first]; ++ ++ /* Add commandbuffer to submission. */ ++ cmdbuffers[0] = timestamp_cmdbuffer; ++ memcpy(&cmdbuffers[1], submit_info->pCommandBuffers, ++ sizeof(VkCommandBuffer) * submit_info->commandBufferCount); ++ submit_info->pCommandBuffers = cmdbuffers; ++ submit_info->commandBufferCount++; ++ ++ /* Add timeline semaphore to submission.
*/ ++ const VkTimelineSemaphoreSubmitInfo *tlssi = ++ vk_find_struct_const(pSubmits[first].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO); ++ semaphores[0] = queue_ctx->semaphore; ++ memcpy(&semaphores[1], submit_info->pSignalSemaphores, ++ sizeof(VkSemaphore) * submit_info->signalSemaphoreCount); ++ submit_info->pSignalSemaphores = semaphores; ++ submit_info->signalSemaphoreCount++; ++ semaphore_values[0] = queue_ctx->semaphore_value; ++ if (tlssi) { ++ *semaphore_info = *tlssi; /* save original values */ ++ memcpy(&semaphore_values[1], tlssi->pSignalSemaphoreValues, ++ sizeof(uint64_t) * tlssi->signalSemaphoreValueCount); ++ ((VkTimelineSemaphoreSubmitInfo *)tlssi)->pSignalSemaphoreValues = semaphore_values; ++ ((VkTimelineSemaphoreSubmitInfo *)tlssi)->signalSemaphoreValueCount = ++ submit_info->signalSemaphoreCount; ++ } else { ++ *semaphore_info = (VkTimelineSemaphoreSubmitInfo){ ++ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, ++ .pNext = submit_info->pNext, ++ .signalSemaphoreValueCount = submit_info->signalSemaphoreCount, ++ .pSignalSemaphoreValues = semaphore_values, ++ }; ++ submit_info->pNext = semaphore_info; ++ } ++ ++ /* Submit with added timestamp query commandbuffer. 
*/ ++ VkResult res = ctx->vtable.QueueSubmit(queue, submitCount, submits, fence); ++ if (tlssi) ++ *(VkTimelineSemaphoreSubmitInfo *)tlssi = *semaphore_info; /* restore */ ++ vk_free(&ctx->alloc, buf); ++ return res; ++} +diff --git a/src/vulkan/anti-lag-layer/anti_lag_layer.h b/src/vulkan/anti-lag-layer/anti_lag_layer.h +new file mode 100644 +index 00000000000..31abb0f9aee +--- /dev/null ++++ b/src/vulkan/anti-lag-layer/anti_lag_layer.h +@@ -0,0 +1,111 @@ ++/* ++ * Copyright © 2025 Valve Corporation ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#ifndef ANTI_LAG_LAYER_H ++#define ANTI_LAG_LAYER_H ++ ++#include "util/simple_mtx.h" ++#include "vulkan/vk_layer.h" ++#include "vulkan/vulkan_core.h" ++#include "ringbuffer.h" ++ ++#define MAX_FRAMES 8 ++#define MAX_QUERIES 256 ++ ++enum frame_state { ++ FRAME_INVALID = 0, ++ FRAME_INPUT, /* Frame is in input stage. */ ++ FRAME_SUBMIT, /* All current queueSubmit calls are associated with this frame. */ ++ FRAME_PRESENT, /* Frame is in present stage and latencies can be evaluated. 
*/ ++}; ++ ++typedef struct frame { ++ uint64_t frame_idx; ++ uint64_t frame_start_time; ++ uint64_t min_delay; ++ uint64_t imposed_delay; ++ enum frame_state state; ++} frame; ++ ++struct query { ++ uint64_t begin_gpu_ts; ++ uint64_t submit_cpu_ts; ++ VkCommandBuffer cmdbuffer; ++}; ++ ++typedef struct queue_context { ++ VkQueue queue; ++ uint32_t queue_family_idx; ++ VkCommandPool cmdPool; ++ VkQueryPool queryPool; ++ VkSemaphore semaphore; ++ uint64_t semaphore_value; ++ uint8_t submissions_per_frame[MAX_FRAMES]; ++ RINGBUFFER_DECLARE(queries, struct query, MAX_QUERIES); ++} queue_context; ++ ++typedef struct device_context { ++ ++ struct DeviceDispatchTable { ++#define DECLARE_HOOK(fn) PFN_vk##fn fn ++ DECLARE_HOOK(GetDeviceProcAddr); ++ DECLARE_HOOK(SetDeviceLoaderData); ++ DECLARE_HOOK(DestroyDevice); ++ DECLARE_HOOK(QueueSubmit); ++ DECLARE_HOOK(QueueSubmit2); ++ DECLARE_HOOK(QueueSubmit2KHR); ++ DECLARE_HOOK(GetDeviceQueue); ++ DECLARE_HOOK(CreateCommandPool); ++ DECLARE_HOOK(DestroyCommandPool); ++ DECLARE_HOOK(CreateQueryPool); ++ DECLARE_HOOK(ResetQueryPool); ++ DECLARE_HOOK(DestroyQueryPool); ++ DECLARE_HOOK(GetQueryPoolResults); ++ DECLARE_HOOK(AllocateCommandBuffers); ++ DECLARE_HOOK(FreeCommandBuffers); ++ DECLARE_HOOK(BeginCommandBuffer); ++ DECLARE_HOOK(EndCommandBuffer); ++ DECLARE_HOOK(GetCalibratedTimestampsKHR); ++ DECLARE_HOOK(CmdWriteTimestamp); ++ DECLARE_HOOK(CreateSemaphore); ++ DECLARE_HOOK(DestroySemaphore); ++ DECLARE_HOOK(GetSemaphoreCounterValue); ++ DECLARE_HOOK(WaitSemaphores); ++#undef DECLARE_HOOK ++ } vtable; ++ ++ VkDevice device; ++ VkAllocationCallbacks alloc; ++ simple_mtx_t mtx; ++ ++ struct { ++ int64_t delta; ++ uint64_t recalibrate_when; ++ float timestamp_period; ++ } calibration; ++ ++ RINGBUFFER_DECLARE(frames, frame, MAX_FRAMES); ++ frame *active_frame; ++ int64_t base_delay; ++ int64_t adaptation; ++ ++ unsigned num_queues; ++ queue_context queues[]; ++} device_context; ++ ++device_context *get_device_context(const 
void *object); ++ ++void anti_lag_AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD *pData); ++VkResult anti_lag_QueueSubmit2KHR(VkQueue queue, uint32_t submitCount, ++ const VkSubmitInfo2 *pSubmits, VkFence fence); ++VkResult anti_lag_QueueSubmit2(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, ++ VkFence fence); ++VkResult anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, ++ VkFence fence); ++ ++VkResult anti_lag_NegotiateLoaderLayerInterfaceVersion(VkNegotiateLayerInterface *pVersionStruct); ++ ++#endif /* ANTI_LAG_LAYER_H */ +diff --git a/src/vulkan/anti-lag-layer/anti_lag_layer_interface.c b/src/vulkan/anti-lag-layer/anti_lag_layer_interface.c +new file mode 100644 +index 00000000000..d2ca4a7dd44 +--- /dev/null ++++ b/src/vulkan/anti-lag-layer/anti_lag_layer_interface.c +@@ -0,0 +1,899 @@ ++/* ++ * Copyright © 2025 Valve Corporation ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "util/simple_mtx.h" ++#include "vulkan/vk_layer.h" ++#include "vulkan/vulkan_core.h" ++#include "anti_lag_layer.h" ++#include "vk_alloc.h" ++#include "vk_util.h" ++ ++static uintptr_t ++object_to_key(const void *object) ++{ ++ return (uintptr_t)*(uintptr_t *)object; ++} ++ ++typedef struct instance_data { ++ struct InstanceDispatchTable { ++#define DECLARE_HOOK(fn) PFN_vk##fn fn ++ DECLARE_HOOK(GetInstanceProcAddr); ++ DECLARE_HOOK(CreateInstance); ++ DECLARE_HOOK(DestroyInstance); ++ DECLARE_HOOK(CreateDevice); ++ DECLARE_HOOK(EnumerateDeviceExtensionProperties); ++ DECLARE_HOOK(GetPhysicalDeviceFeatures2KHR); ++ DECLARE_HOOK(GetPhysicalDeviceFeatures2); ++ DECLARE_HOOK(GetPhysicalDeviceProperties); ++ DECLARE_HOOK(GetPhysicalDeviceCalibrateableTimeDomainsEXT); ++ DECLARE_HOOK(GetPhysicalDeviceCalibrateableTimeDomainsKHR); ++ DECLARE_HOOK(GetPhysicalDeviceQueueFamilyProperties); ++#undef DECLARE_HOOK ++ } vtable; ++ ++ VkInstance instance; ++ uint32_t apiVersion; ++ VkAllocationCallbacks alloc; ++ struct 
instance_data *next; ++} instance_data; ++ ++static void ++init_instance_vtable(instance_data *ctx, PFN_vkGetInstanceProcAddr gpa) ++{ ++ ctx->vtable.GetInstanceProcAddr = gpa; ++#define INIT_HOOK(fn) ctx->vtable.fn = (PFN_vk##fn)gpa(ctx->instance, "vk" #fn) ++ INIT_HOOK(CreateInstance); ++ INIT_HOOK(DestroyInstance); ++ INIT_HOOK(CreateDevice); ++ INIT_HOOK(EnumerateDeviceExtensionProperties); ++ INIT_HOOK(GetPhysicalDeviceFeatures2KHR); ++ INIT_HOOK(GetPhysicalDeviceFeatures2); ++ INIT_HOOK(GetPhysicalDeviceProperties); ++ INIT_HOOK(GetPhysicalDeviceCalibrateableTimeDomainsEXT); ++ INIT_HOOK(GetPhysicalDeviceCalibrateableTimeDomainsKHR); ++ INIT_HOOK(GetPhysicalDeviceQueueFamilyProperties); ++#undef INIT_HOOK ++} ++ ++static simple_mtx_t instance_mtx = SIMPLE_MTX_INITIALIZER; ++static instance_data *instance_list = NULL; ++ ++static void ++add_instance(instance_data *instance) ++{ ++ simple_mtx_lock(&instance_mtx); ++ instance_data **ptr = &instance_list; ++ while (*ptr != NULL) ++ ptr = &(*ptr)->next; ++ *ptr = instance; ++ simple_mtx_unlock(&instance_mtx); ++} ++ ++static instance_data * ++remove_instance(const void *object) ++{ ++ uintptr_t key = object_to_key(object); ++ simple_mtx_lock(&instance_mtx); ++ instance_data **ptr = &instance_list; ++ while (*ptr && key != object_to_key((*ptr)->instance)) ++ ptr = &(*ptr)->next; ++ ++ instance_data *ctx = *ptr; ++ *ptr = ctx ? 
ctx->next : NULL; ++ simple_mtx_unlock(&instance_mtx); ++ return ctx; ++} ++ ++static instance_data * ++get_instance_data(const void *object) ++{ ++ uintptr_t key = object_to_key(object); ++ simple_mtx_lock(&instance_mtx); ++ instance_data *ctx = instance_list; ++ while (ctx && key != object_to_key(ctx->instance)) ++ ctx = ctx->next; ++ simple_mtx_unlock(&instance_mtx); ++ return ctx; ++} ++ ++static VKAPI_ATTR VkResult VKAPI_CALL ++anti_lag_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, ++ const VkAllocationCallbacks *pAllocator, VkInstance *pInstance) ++{ ++ VkLayerInstanceCreateInfo *chain_info = (VkLayerInstanceCreateInfo *)(pCreateInfo->pNext); ++ while (chain_info && !(chain_info->sType == VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO && ++ chain_info->function == VK_LAYER_LINK_INFO)) { ++ chain_info = (VkLayerInstanceCreateInfo *)(chain_info->pNext); ++ } ++ ++ assert(chain_info && chain_info->u.pLayerInfo); ++ PFN_vkGetInstanceProcAddr fpGetInstanceProcAddr = ++ chain_info->u.pLayerInfo->pfnNextGetInstanceProcAddr; ++ PFN_vkCreateInstance fpCreateInstance = ++ (PFN_vkCreateInstance)fpGetInstanceProcAddr(NULL, "vkCreateInstance"); ++ if (fpCreateInstance == NULL) ++ return VK_ERROR_INITIALIZATION_FAILED; ++ ++ /* Advance the link info for the next element on the chain. */ ++ chain_info->u.pLayerInfo = chain_info->u.pLayerInfo->pNext; ++ ++ /* Create Instance. */ ++ VkResult result = fpCreateInstance(pCreateInfo, pAllocator, pInstance); ++ if (result != VK_SUCCESS) ++ return result; ++ ++ /* Create Instance context. */ ++ const VkAllocationCallbacks *alloc = pAllocator ? 
pAllocator : vk_default_allocator(); ++ void *buf = vk_alloc(alloc, sizeof(instance_data), alignof(instance_data), ++ VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); ++ if (!buf) { ++ PFN_vkDestroyInstance fpDestroyInstance = ++ (PFN_vkDestroyInstance)fpGetInstanceProcAddr(*pInstance, "vkDestroyInstance"); ++ fpDestroyInstance(*pInstance, alloc); ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ } ++ instance_data *ctx = (instance_data *)buf; ++ ctx->apiVersion = pCreateInfo->pApplicationInfo && pCreateInfo->pApplicationInfo->apiVersion ++ ? pCreateInfo->pApplicationInfo->apiVersion ++ : VK_API_VERSION_1_0; ++ ctx->instance = *pInstance; ++ ctx->alloc = *alloc; ++ ctx->next = NULL; ++ init_instance_vtable(ctx, fpGetInstanceProcAddr); ++ add_instance(ctx); ++ ++ return VK_SUCCESS; ++} ++ ++static VKAPI_ATTR void VKAPI_CALL ++anti_lag_DestroyInstance(VkInstance instance, const VkAllocationCallbacks *pAllocator) ++{ ++ instance_data *ctx = remove_instance(instance); ++ if (ctx) { ++ ctx->vtable.DestroyInstance(instance, pAllocator); ++ vk_free(&ctx->alloc, ctx); ++ } ++} ++ ++typedef struct device_data { ++ VkDevice device; ++ PFN_vkGetDeviceProcAddr GetDeviceProcAddr; ++ device_context *ctx; /* NULL if anti-lag ext is not enabled. */ ++ struct device_data *next; ++} device_data; ++ ++static void ++init_device_vtable(device_context *ctx, PFN_vkGetDeviceProcAddr gpa, PFN_vkSetDeviceLoaderData sld, ++ bool calibrated_timestamps_khr, bool host_query_reset_ext, ++ bool timeline_semaphore_khr) ++{ ++ ctx->vtable.GetDeviceProcAddr = gpa; ++ ctx->vtable.SetDeviceLoaderData = sld; ++#define INIT_HOOK(fn) ctx->vtable.fn = (PFN_vk##fn)gpa(ctx->device, "vk" #fn) ++#define INIT_HOOK_ALIAS(fn, alias, cond) \ ++ ctx->vtable.fn = (PFN_vk##fn)gpa(ctx->device, cond ? 
"vk" #alias : "vk" #fn) ++ INIT_HOOK(DestroyDevice); ++ INIT_HOOK(QueueSubmit); ++ INIT_HOOK(QueueSubmit2); ++ INIT_HOOK(QueueSubmit2KHR); ++ INIT_HOOK(GetDeviceQueue); ++ INIT_HOOK(CreateCommandPool); ++ INIT_HOOK(DestroyCommandPool); ++ INIT_HOOK(CreateQueryPool); ++ INIT_HOOK_ALIAS(ResetQueryPool, ResetQueryPoolEXT, host_query_reset_ext); ++ INIT_HOOK(DestroyQueryPool); ++ INIT_HOOK(GetQueryPoolResults); ++ INIT_HOOK(AllocateCommandBuffers); ++ INIT_HOOK(FreeCommandBuffers); ++ INIT_HOOK(BeginCommandBuffer); ++ INIT_HOOK(EndCommandBuffer); ++ INIT_HOOK_ALIAS(GetCalibratedTimestampsKHR, GetCalibratedTimestampsEXT, !calibrated_timestamps_khr); ++ INIT_HOOK(CmdWriteTimestamp); ++ INIT_HOOK(CreateSemaphore); ++ INIT_HOOK(DestroySemaphore); ++ INIT_HOOK_ALIAS(GetSemaphoreCounterValue, GetSemaphoreCounterValueKHR, timeline_semaphore_khr); ++ INIT_HOOK_ALIAS(WaitSemaphores, WaitSemaphoresKHR, timeline_semaphore_khr); ++#undef INIT_HOOK ++#undef INIT_HOOK_ALIAS ++} ++ ++static simple_mtx_t device_mtx = SIMPLE_MTX_INITIALIZER; ++static device_data *device_list = NULL; ++ ++static void ++add_device(device_data *device) ++{ ++ simple_mtx_lock(&device_mtx); ++ device_data **ptr = &device_list; ++ while (*ptr != NULL) ++ ptr = &(*ptr)->next; ++ *ptr = device; ++ simple_mtx_unlock(&device_mtx); ++} ++ ++static device_data * ++remove_device(const void *object) ++{ ++ uintptr_t key = object_to_key(object); ++ simple_mtx_lock(&device_mtx); ++ device_data **ptr = &device_list; ++ while (*ptr && key != object_to_key((*ptr)->device)) ++ ptr = &(*ptr)->next; ++ ++ device_data *ctx = *ptr; ++ *ptr = ctx ? 
ctx->next : NULL; ++ simple_mtx_unlock(&device_mtx); ++ return ctx; ++} ++ ++static device_data * ++get_device_data(const void *object) ++{ ++ uintptr_t key = object_to_key(object); ++ simple_mtx_lock(&device_mtx); ++ device_data *ctx = device_list; ++ while (ctx && key != object_to_key(ctx->device)) ++ ctx = ctx->next; ++ simple_mtx_unlock(&device_mtx); ++ return ctx; ++} ++ ++device_context * ++get_device_context(const void *object) ++{ ++ device_data *data = get_device_data(object); ++ assert(data && data->ctx); ++ return data->ctx; ++} ++ ++static VkLayerDeviceCreateInfo * ++get_device_chain_info(const VkDeviceCreateInfo *pCreateInfo, VkLayerFunction func) ++{ ++ vk_foreach_struct_const (item, pCreateInfo->pNext) { ++ if (item->sType == VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO && ++ ((VkLayerDeviceCreateInfo *)item)->function == func) ++ return (VkLayerDeviceCreateInfo *)item; ++ } ++ return NULL; ++} ++ ++static bool ++should_enable_layer(instance_data *ctx, VkPhysicalDevice physicalDevice, ++ VkPhysicalDeviceAntiLagFeaturesAMD ext_feature) ++{ ++ /* The extension is not requested by the application. */ ++ if (!ext_feature.antiLag) ++ return false; ++ ++ /* Ensure that the underlying implementation does not expose VK_AMD_anti_lag itself. 
*/ ++ ext_feature.antiLag = false; ++ VkPhysicalDeviceFeatures2 features = { ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, ++ .pNext = &ext_feature, ++ }; ++ ++ if (ctx->vtable.GetPhysicalDeviceFeatures2KHR) { ++ ctx->vtable.GetPhysicalDeviceFeatures2KHR(physicalDevice, &features); ++ return !ext_feature.antiLag; ++ } ++ ++ if (ctx->vtable.GetPhysicalDeviceFeatures2) { ++ ctx->vtable.GetPhysicalDeviceFeatures2(physicalDevice, &features); ++ return !ext_feature.antiLag; ++ } ++ ++ return false; ++} ++ ++static bool ++check_calibrated_timestamps(instance_data *data, VkPhysicalDevice physicalDevice, bool *has_khr) ++{ ++ VkResult res; ++ uint32_t count = 0; ++ res = data->vtable.EnumerateDeviceExtensionProperties(physicalDevice, NULL, &count, NULL); ++ VkExtensionProperties *extensions = ++ vk_alloc(&data->alloc, count * sizeof(VkExtensionProperties), alignof(VkExtensionProperties), ++ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); ++ if (!extensions) ++ return false; ++ ++ res |= data->vtable.EnumerateDeviceExtensionProperties(physicalDevice, NULL, &count, extensions); ++ ++ *has_khr = false; ++ bool has_ext = false; ++ if (res == VK_SUCCESS) { ++ for (unsigned i = 0; i < count; i++) { ++ if (strcmp(extensions[i].extensionName, VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0) ++ *has_khr = true; ++ if (strcmp(extensions[i].extensionName, VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0) ++ has_ext = true; ++ } ++ } ++ ++ vk_free(&data->alloc, extensions); ++ return *has_khr || has_ext; ++} ++ ++/* Initialize per-queue context: ++ * ++ * This includes creating one CommandPool and one QueryPool per Queue as well as ++ * recording one CommandBuffer per timestamp query. 
++ */ ++static VkResult ++init_queue_context(device_context *ctx, queue_context *queue_ctx) ++{ ++#define CHECK_RESULT(res, label) \ ++ if (res != VK_SUCCESS) { \ ++ goto label; \ ++ } ++ ++ VkResult result; ++ ++ /* Create command pool */ ++ struct VkCommandPoolCreateInfo pool_info = { ++ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, ++ .pNext = NULL, ++ .flags = 0, ++ .queueFamilyIndex = queue_ctx->queue_family_idx, ++ }; ++ result = ++ ctx->vtable.CreateCommandPool(ctx->device, &pool_info, &ctx->alloc, &queue_ctx->cmdPool); ++ CHECK_RESULT(result, fail_cmdpool) ++ ++ /* Create query pool */ ++ VkQueryPoolCreateInfo query_pool_info = { ++ .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, ++ .queryType = VK_QUERY_TYPE_TIMESTAMP, ++ .queryCount = MAX_QUERIES, ++ }; ++ result = ctx->vtable.CreateQueryPool(ctx->device, &query_pool_info, &ctx->alloc, ++ &queue_ctx->queryPool); ++ CHECK_RESULT(result, fail_querypool) ++ ctx->vtable.ResetQueryPool(ctx->device, queue_ctx->queryPool, 0, MAX_QUERIES); ++ ringbuffer_init(queue_ctx->queries); ++ ++ /* Create timeline semaphore */ ++ VkSemaphoreTypeCreateInfo timelineCreateInfo = { ++ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, ++ .pNext = NULL, ++ .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE, ++ .initialValue = 0, ++ }; ++ VkSemaphoreCreateInfo createInfo = { ++ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, ++ .pNext = &timelineCreateInfo, ++ .flags = 0, ++ }; ++ result = ++ ctx->vtable.CreateSemaphore(ctx->device, &createInfo, &ctx->alloc, &queue_ctx->semaphore); ++ CHECK_RESULT(result, fail_semaphore); ++ ++ for (unsigned j = 0; j < MAX_QUERIES; j++) { ++ struct query *query = &queue_ctx->queries.data[j]; ++ ++ /* Allocate commandBuffer for timestamp. 
*/ ++ VkCommandBufferAllocateInfo buffer_info = { ++ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, ++ .commandPool = queue_ctx->cmdPool, ++ .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, ++ .commandBufferCount = 1, ++ }; ++ result = ctx->vtable.AllocateCommandBuffers(ctx->device, &buffer_info, &query->cmdbuffer); ++ CHECK_RESULT(result, fail) ++ result = ctx->vtable.SetDeviceLoaderData(ctx->device, query->cmdbuffer); ++ CHECK_RESULT(result, fail) ++ ++ /* Record commandbuffer. */ ++ VkCommandBufferBeginInfo beginInfo = { ++ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, ++ }; ++ ++ result = ctx->vtable.BeginCommandBuffer(query->cmdbuffer, &beginInfo); ++ CHECK_RESULT(result, fail) ++ ctx->vtable.CmdWriteTimestamp(query->cmdbuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, ++ queue_ctx->queryPool, j); ++ result = ctx->vtable.EndCommandBuffer(query->cmdbuffer); ++ CHECK_RESULT(result, fail) ++ } ++ ++#undef CHECK_RESULT ++ return result; ++ ++fail: ++ ctx->vtable.DestroySemaphore(ctx->device, queue_ctx->semaphore, &ctx->alloc); ++fail_semaphore: ++ ctx->vtable.DestroyQueryPool(ctx->device, queue_ctx->queryPool, &ctx->alloc); ++fail_querypool: ++ ctx->vtable.DestroyCommandPool(ctx->device, queue_ctx->cmdPool, &ctx->alloc); ++fail_cmdpool: ++ for (queue_context *qctx = ctx->queues; qctx != queue_ctx; qctx++) { ++ ctx->vtable.DestroyQueryPool(ctx->device, qctx->queryPool, &ctx->alloc); ++ ctx->vtable.DestroyCommandPool(ctx->device, qctx->cmdPool, &ctx->alloc); ++ } ++ ++ return result; ++} ++ ++static VKAPI_ATTR VkResult VKAPI_CALL ++anti_lag_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo, ++ const VkAllocationCallbacks *pAllocator, VkDevice *pDevice) ++{ ++ instance_data *instance_ctx = get_instance_data(physicalDevice); ++ VkLayerDeviceCreateInfo *chain_info = get_device_chain_info(pCreateInfo, VK_LAYER_LINK_INFO); ++ PFN_vkGetDeviceProcAddr fpGetDeviceProcAddr = chain_info->u.pLayerInfo->pfnNextGetDeviceProcAddr; ++ 
PFN_vkGetInstanceProcAddr fpGetInstanceProcAddr = ++ chain_info->u.pLayerInfo->pfnNextGetInstanceProcAddr; ++ PFN_vkCreateDevice fpCreateDevice = ++ (PFN_vkCreateDevice)fpGetInstanceProcAddr(instance_ctx->instance, "vkCreateDevice"); ++ if (fpCreateDevice == NULL) ++ return VK_ERROR_INITIALIZATION_FAILED; ++ ++ /* Advance the link info for the next element on the chain. */ ++ chain_info->u.pLayerInfo = chain_info->u.pLayerInfo->pNext; ++ ++ const VkAllocationCallbacks *alloc = pAllocator ? pAllocator : &instance_ctx->alloc; ++ device_data *data; ++ VkResult result; ++ ++ /* Only allocate a context and add to dispatch if the extension is enabled. */ ++ const VkPhysicalDeviceAntiLagFeaturesAMD *ext_features = ++ vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD); ++ bool enable = ext_features && should_enable_layer(instance_ctx, physicalDevice, *ext_features); ++ if (enable) { ++ /* Count queues with sufficient timestamp valid bits. */ ++ // TODO: make it work with less than 64 valid bits ++ unsigned num_queue_families = 0; ++ unsigned num_queues = 0; ++ for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) ++ num_queue_families = ++ MAX2(num_queue_families, pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex + 1); ++ VkQueueFamilyProperties *queue_family_props = ++ vk_alloc(alloc, num_queue_families * sizeof(VkQueueFamilyProperties), ++ alignof(VkQueueFamilyProperties), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); ++ if (!queue_family_props) ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ ++ instance_ctx->vtable.GetPhysicalDeviceQueueFamilyProperties( ++ physicalDevice, &num_queue_families, queue_family_props); ++ for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { ++ uint32_t queue_family_idx = pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex; ++ if (queue_family_props[queue_family_idx].timestampValidBits == 64 && ++ (queue_family_props[queue_family_idx].queueFlags & ++ (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT))) { ++ 
num_queues += pCreateInfo->pQueueCreateInfos[i].queueCount; ++ } ++ } ++ ++ /* Allocate the context. */ ++ device_context *ctx; ++ queue_context *queues; ++ VK_MULTIALLOC(ma); ++ vk_multialloc_add(&ma, &data, device_data, 1); ++ vk_multialloc_add(&ma, &ctx, struct device_context, 1); ++ vk_multialloc_add(&ma, &queues, queue_context, num_queues); ++ void *buf = vk_multialloc_zalloc(&ma, alloc, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); ++ if (!buf) { ++ vk_free(alloc, queue_family_props); ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ } ++ ++ VkPhysicalDeviceProperties properties; ++ instance_ctx->vtable.GetPhysicalDeviceProperties(physicalDevice, &properties); ++ ++ /* Ensure that calibrated timestamps and host query reset extensions are enabled. */ ++ bool has_calibrated_timestamps = false; ++ bool has_calibrated_timestamps_khr = false; ++ bool has_vk12 = instance_ctx->apiVersion >= VK_API_VERSION_1_2 && ++ properties.apiVersion >= VK_API_VERSION_1_2; ++ bool has_host_query_reset = has_vk12; ++ bool has_host_query_reset_ext = false; ++ bool has_timeline_semaphore = has_vk12; ++ bool has_timeline_semaphore_khr = false; ++ for (unsigned i = 0; i < pCreateInfo->enabledExtensionCount; i++) { ++ if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], ++ VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0) ++ has_calibrated_timestamps = has_calibrated_timestamps_khr = true; ++ if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], ++ VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0) ++ has_calibrated_timestamps = true; ++ if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], ++ VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME) == 0) ++ has_host_query_reset = has_host_query_reset_ext = true; ++ if (strcmp(pCreateInfo->ppEnabledExtensionNames[i], ++ VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME) == 0) ++ has_timeline_semaphore = has_timeline_semaphore_khr = true; ++ } ++ ++ /* Add missing extensions. 
*/ ++ VkDeviceCreateInfo create_info = *pCreateInfo; ++ const char **ext_names = NULL; ++ uint32_t num_extra_extensions = ++ !has_calibrated_timestamps + !has_host_query_reset + !has_timeline_semaphore; ++ if (num_extra_extensions) { ++ ext_names = vk_alloc( ++ alloc, (pCreateInfo->enabledExtensionCount + num_extra_extensions) * sizeof(char *), ++ alignof(char *), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); ++ if (!ext_names) { ++ result = VK_ERROR_OUT_OF_HOST_MEMORY; ++ goto fail; ++ } ++ ++ memcpy(ext_names, pCreateInfo->ppEnabledExtensionNames, ++ sizeof(char *) * pCreateInfo->enabledExtensionCount); ++ ++ if (!has_timeline_semaphore) { ++ has_timeline_semaphore_khr = true; ++ ext_names[create_info.enabledExtensionCount++] = ++ VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME; ++ } ++ if (!has_host_query_reset) { ++ has_host_query_reset_ext = true; ++ ext_names[create_info.enabledExtensionCount++] = VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME; ++ } ++ if (!has_calibrated_timestamps) { ++ check_calibrated_timestamps(instance_ctx, physicalDevice, ++ &has_calibrated_timestamps_khr); ++ ext_names[create_info.enabledExtensionCount++] = ++ has_calibrated_timestamps_khr ? VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME ++ : VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME; ++ } ++ create_info.ppEnabledExtensionNames = ext_names; ++ } ++ ++ /* Ensure that hostQueryReset feature is enabled. 
*/ ++ const VkPhysicalDeviceVulkan12Features *vk12 = ++ vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_VULKAN_1_2_FEATURES); ++ const VkPhysicalDeviceHostQueryResetFeatures *query_reset = ++ vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES); ++ const VkPhysicalDeviceTimelineSemaphoreFeatures *timeline_semaphore = ++ vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES); ++ uint32_t prev_hostQueryReset; ++ uint32_t prev_timelineSemaphore; ++ if (vk12) { ++ prev_hostQueryReset = vk12->hostQueryReset; ++ prev_timelineSemaphore = vk12->timelineSemaphore; ++ ((VkPhysicalDeviceVulkan12Features *)vk12)->hostQueryReset = VK_TRUE; ++ ((VkPhysicalDeviceVulkan12Features *)vk12)->timelineSemaphore = VK_TRUE; ++ } else { ++ if (query_reset) { ++ prev_hostQueryReset = query_reset->hostQueryReset; ++ ((VkPhysicalDeviceHostQueryResetFeatures *)query_reset)->hostQueryReset = VK_TRUE; ++ } else { ++ VkPhysicalDeviceHostQueryResetFeatures *feat = ++ alloca(sizeof(VkPhysicalDeviceHostQueryResetFeatures)); ++ *feat = (VkPhysicalDeviceHostQueryResetFeatures){ ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES, ++ .pNext = (void *)create_info.pNext, ++ .hostQueryReset = VK_TRUE, ++ }; ++ create_info.pNext = feat; ++ } ++ if (timeline_semaphore) { ++ prev_timelineSemaphore = timeline_semaphore->timelineSemaphore; ++ ((VkPhysicalDeviceTimelineSemaphoreFeatures *)timeline_semaphore)->timelineSemaphore = ++ VK_TRUE; ++ } else { ++ VkPhysicalDeviceTimelineSemaphoreFeatures *feat = ++ alloca(sizeof(VkPhysicalDeviceTimelineSemaphoreFeatures)); ++ *feat = (VkPhysicalDeviceTimelineSemaphoreFeatures){ ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES, ++ .pNext = (void *)create_info.pNext, ++ .timelineSemaphore = VK_TRUE, ++ }; ++ create_info.pNext = feat; ++ } ++ } ++ ++ /* Create Device. 
*/ ++ result = fpCreateDevice(physicalDevice, &create_info, pAllocator, pDevice); ++ ++ if (vk12) { ++ ((VkPhysicalDeviceVulkan12Features *)vk12)->hostQueryReset = prev_hostQueryReset; ++ ((VkPhysicalDeviceVulkan12Features *)vk12)->timelineSemaphore = prev_timelineSemaphore; ++ } else { ++ if (query_reset) ++ ((VkPhysicalDeviceHostQueryResetFeatures *)query_reset)->hostQueryReset = ++ prev_hostQueryReset; ++ if (timeline_semaphore) ++ ((VkPhysicalDeviceTimelineSemaphoreFeatures *)timeline_semaphore)->timelineSemaphore = ++ prev_timelineSemaphore; ++ } ++ if (ext_names) ++ vk_free(alloc, ext_names); ++ ++ if (result != VK_SUCCESS) ++ goto fail; ++ ++ /* Initialize Context. */ ++ data->ctx = ctx; ++ ctx->device = *pDevice; ++ chain_info = get_device_chain_info(pCreateInfo, VK_LOADER_DATA_CALLBACK); ++ PFN_vkSetDeviceLoaderData fpSetDeviceLoaderData = ++ (PFN_vkSetDeviceLoaderData)chain_info->u.pfnSetDeviceLoaderData; ++ init_device_vtable(ctx, fpGetDeviceProcAddr, fpSetDeviceLoaderData, ++ has_calibrated_timestamps_khr, has_host_query_reset_ext, ++ has_timeline_semaphore_khr); ++ simple_mtx_init(&ctx->mtx, mtx_plain); ++ ctx->num_queues = num_queues; ++ ctx->alloc = *alloc; ++ ctx->calibration.timestamp_period = properties.limits.timestampPeriod; ++ ringbuffer_init(ctx->frames); ++ ++ /* Initialize Queue contexts. */ ++ unsigned idx = 0; ++ for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { ++ /* Skip queue families without sufficient timestamp valid bits. ++ * Also skip queue families which cannot do GRAPHICS or COMPUTE since they ++ * are always heavily async in nature (DMA transfers and sparse for example). ++ * Video is also irrelevant here since it should never be a critical path ++ * in a game that wants anti-lag.
*/ ++ uint32_t queue_family_idx = pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex; ++ if (queue_family_props[queue_family_idx].timestampValidBits != 64 || ++ !(queue_family_props[queue_family_idx].queueFlags & ++ (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT))) ++ continue; ++ ++ for (unsigned j = 0; j < pCreateInfo->pQueueCreateInfos[i].queueCount; j++) { ++ VkQueue queue; ++ ctx->vtable.GetDeviceQueue(*pDevice, queue_family_idx, j, &queue); ++ ctx->queues[idx].queue = queue; ++ ctx->queues[idx].queue_family_idx = queue_family_idx; ++ result = init_queue_context(ctx, &ctx->queues[idx]); ++ idx++; ++ if (result != VK_SUCCESS) ++ goto fail; ++ } ++ } ++ assert(idx == num_queues); ++ fail: ++ vk_free(alloc, queue_family_props); ++ } else { ++ data = (device_data *)vk_alloc(alloc, sizeof(device_data), alignof(device_data), ++ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); ++ if (!data) ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ result = fpCreateDevice(physicalDevice, pCreateInfo, pAllocator, pDevice); ++ data->ctx = NULL; ++ } ++ ++ if (result == VK_SUCCESS) { ++ data->device = *pDevice; ++ data->GetDeviceProcAddr = fpGetDeviceProcAddr; ++ data->next = NULL; ++ add_device(data); ++ } else { ++ vk_free(alloc, data); ++ } ++ ++ return result; ++} ++ ++static VKAPI_ATTR void VKAPI_CALL ++anti_lag_DestroyDevice(VkDevice pDevice, const VkAllocationCallbacks *pAllocator) ++{ ++ device_data *data = remove_device(pDevice); ++ assert(data && data->ctx); ++ device_context *ctx = data->ctx; ++ ++ /* Destroy per-queue context. ++ * The application must ensure that no work is active on the device. 
++ */ ++ for (unsigned i = 0; i < ctx->num_queues; i++) { ++ queue_context *queue_ctx = &ctx->queues[i]; ++ ctx->vtable.DestroyQueryPool(ctx->device, queue_ctx->queryPool, &ctx->alloc); ++ ctx->vtable.DestroyCommandPool(ctx->device, queue_ctx->cmdPool, &ctx->alloc); ++ ctx->vtable.DestroySemaphore(ctx->device, queue_ctx->semaphore, &ctx->alloc); ++ } ++ ++ ctx->vtable.DestroyDevice(pDevice, pAllocator); ++ vk_free(&ctx->alloc, data); ++} ++ ++static bool ++is_anti_lag_supported(VkPhysicalDevice physicalDevice) ++{ ++ instance_data *data = get_instance_data(physicalDevice); ++ VkPhysicalDeviceProperties properties; ++ data->vtable.GetPhysicalDeviceProperties(physicalDevice, &properties); ++ if (properties.limits.timestampPeriod == 0.0 || !properties.limits.timestampComputeAndGraphics) ++ return false; ++ ++ /* Check whether calibrated timestamps are supported. */ ++ bool has_khr; ++ if (!check_calibrated_timestamps(data, physicalDevice, &has_khr)) ++ return false; ++ ++ /* Check whether timeline semaphores and host query reset are supported. */ ++ VkPhysicalDeviceTimelineSemaphoreFeatures timeline_semaphore = { ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES, ++ .timelineSemaphore = VK_FALSE, ++ }; ++ VkPhysicalDeviceHostQueryResetFeatures query_reset = { ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES, ++ .pNext = &timeline_semaphore, ++ .hostQueryReset = VK_FALSE, ++ }; ++ VkPhysicalDeviceFeatures2 features = { ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, ++ .pNext = &query_reset, ++ }; ++ if (data->vtable.GetPhysicalDeviceFeatures2KHR) ++ data->vtable.GetPhysicalDeviceFeatures2KHR(physicalDevice, &features); ++ else if (data->vtable.GetPhysicalDeviceFeatures2) ++ data->vtable.GetPhysicalDeviceFeatures2(physicalDevice, &features); ++ if (!timeline_semaphore.timelineSemaphore || !query_reset.hostQueryReset) ++ return false; ++ ++ /* Check that DEVICE and CLOCK_MONOTONIC time domains are available. 
*/ ++ VkResult res; ++ uint32_t count = 0; ++ PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsKHR ctd = ++ has_khr ? data->vtable.GetPhysicalDeviceCalibrateableTimeDomainsKHR ++ : data->vtable.GetPhysicalDeviceCalibrateableTimeDomainsEXT; ++ res = ctd(physicalDevice, &count, NULL); ++ VkTimeDomainKHR *time_domains = alloca(count * sizeof(VkTimeDomainKHR)); ++ res |= ctd(physicalDevice, &count, time_domains); ++ if (res != VK_SUCCESS) ++ return false; ++ ++ bool has_device_domain = false; ++ bool has_host_domain = false; ++ for (unsigned i = 0; i < count; i++) { ++ has_device_domain |= time_domains[i] == VK_TIME_DOMAIN_DEVICE_KHR; ++ has_host_domain |= time_domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR; ++ } ++ ++ return has_device_domain && has_host_domain; ++} ++ ++static VKAPI_ATTR VkResult VKAPI_CALL ++anti_lag_EnumerateDeviceExtensionProperties(VkPhysicalDevice physicalDevice, const char *pLayerName, ++ uint32_t *pPropertyCount, ++ VkExtensionProperties *pProperties) ++{ ++ instance_data *instance_data = get_instance_data(physicalDevice); ++ ++ if (pLayerName && strcmp(pLayerName, "VK_LAYER_MESA_anti_lag") == 0) { ++ if (!is_anti_lag_supported(physicalDevice)) { ++ *pPropertyCount = 0; ++ return VK_SUCCESS; ++ } ++ ++ VK_OUTARRAY_MAKE_TYPED(VkExtensionProperties, out, pProperties, pPropertyCount); ++ vk_outarray_append_typed(VkExtensionProperties, &out, prop) ++ { ++ *prop = ++ (VkExtensionProperties){VK_AMD_ANTI_LAG_EXTENSION_NAME, VK_AMD_ANTI_LAG_SPEC_VERSION}; ++ } ++ return vk_outarray_status(&out); ++ } ++ ++ return instance_data->vtable.EnumerateDeviceExtensionProperties(physicalDevice, pLayerName, ++ pPropertyCount, pProperties); ++} ++ ++static VKAPI_ATTR void VKAPI_CALL ++anti_lag_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, ++ VkPhysicalDeviceFeatures2 *pFeatures) ++{ ++ instance_data *ctx = get_instance_data(physicalDevice); ++ ctx->vtable.GetPhysicalDeviceFeatures2(physicalDevice, pFeatures); ++ VkPhysicalDeviceAntiLagFeaturesAMD 
*anti_lag_features = ++ vk_find_struct(pFeatures->pNext, PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD); ++ ++ if (anti_lag_features) { ++ anti_lag_features->antiLag |= is_anti_lag_supported(physicalDevice); ++ } ++} ++ ++static VKAPI_ATTR void VKAPI_CALL ++anti_lag_GetPhysicalDeviceFeatures2KHR(VkPhysicalDevice physicalDevice, ++ VkPhysicalDeviceFeatures2 *pFeatures) ++{ ++ instance_data *ctx = get_instance_data(physicalDevice); ++ ctx->vtable.GetPhysicalDeviceFeatures2KHR(physicalDevice, pFeatures); ++ VkPhysicalDeviceAntiLagFeaturesAMD *anti_lag_features = ++ vk_find_struct(pFeatures->pNext, PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD); ++ ++ if (anti_lag_features) { ++ anti_lag_features->antiLag |= is_anti_lag_supported(physicalDevice); ++ } ++} ++ ++static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL ++anti_lag_GetInstanceProcAddr(VkInstance instance, const char *pName); ++ ++static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL ++anti_lag_GetDeviceProcAddr(VkDevice device, const char *pName); ++ ++#define ADD_HOOK(fn) {"vk" #fn, (PFN_vkVoidFunction)anti_lag_##fn} ++static const struct { ++ const char *name; ++ PFN_vkVoidFunction ptr; ++} instance_funcptr_map[] = { ++ ADD_HOOK(GetInstanceProcAddr), ++ ADD_HOOK(CreateInstance), ++ ADD_HOOK(DestroyInstance), ++ ADD_HOOK(EnumerateDeviceExtensionProperties), ++ ADD_HOOK(CreateDevice), ++ ADD_HOOK(GetPhysicalDeviceFeatures2), ++ ADD_HOOK(GetPhysicalDeviceFeatures2KHR), ++}; ++ ++static const struct { ++ const char *name; ++ PFN_vkVoidFunction ptr; ++} device_funcptr_map[] = { ++ ADD_HOOK(GetDeviceProcAddr), ++ ADD_HOOK(DestroyDevice), ++ ADD_HOOK(AntiLagUpdateAMD), ++ ADD_HOOK(QueueSubmit), ++ ADD_HOOK(QueueSubmit2), ++ ADD_HOOK(QueueSubmit2KHR), ++}; ++#undef ADD_HOOK ++ ++static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL ++anti_lag_GetInstanceProcAddr(VkInstance instance, const char *pName) ++{ ++ if (!pName) ++ return NULL; ++ ++ PFN_vkVoidFunction result = NULL; ++ if (instance) { ++ instance_data *ctx = get_instance_data(instance); ++ 
if (ctx) ++ result = ctx->vtable.GetInstanceProcAddr(instance, pName); ++ } ++ ++ /* Only hook instance functions which are exposed by the underlying impl. ++ * Ignore instance parameter for vkCreateInstance and vkCreateDevice. ++ */ ++ if (result || strcmp(pName, "vkCreateInstance") == 0 || strcmp(pName, "vkCreateDevice") == 0) { ++ for (uint32_t i = 0; i < ARRAY_SIZE(instance_funcptr_map); i++) { ++ if (strcmp(pName, instance_funcptr_map[i].name) == 0) ++ return instance_funcptr_map[i].ptr; ++ } ++ } ++ ++ return result; ++} ++ ++static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL ++anti_lag_GetDeviceProcAddr(VkDevice device, const char *pName) ++{ ++ if (!pName || !device) ++ return NULL; ++ ++ device_data *data = get_device_data(device); ++ PFN_vkVoidFunction result = data->GetDeviceProcAddr(device, pName); ++ ++ /* Only hook device functions if the Layer extension is enabled. */ ++ if (data->ctx && (result || strcmp(pName, "vkAntiLagUpdateAMD") == 0)) { ++ for (uint32_t i = 0; i < ARRAY_SIZE(device_funcptr_map); i++) { ++ if (strcmp(pName, device_funcptr_map[i].name) == 0) ++ return device_funcptr_map[i].ptr; ++ } ++ } ++ ++ return result; ++} ++ ++PUBLIC VKAPI_ATTR VkResult VKAPI_CALL ++anti_lag_NegotiateLoaderLayerInterfaceVersion(VkNegotiateLayerInterface *pVersionStruct) ++{ ++ assert(pVersionStruct != NULL); ++ assert(pVersionStruct->sType == LAYER_NEGOTIATE_INTERFACE_STRUCT); ++ ++ if (pVersionStruct->loaderLayerInterfaceVersion >= 2) { ++ pVersionStruct->loaderLayerInterfaceVersion = 2; ++ pVersionStruct->pfnGetInstanceProcAddr = anti_lag_GetInstanceProcAddr; ++ pVersionStruct->pfnGetDeviceProcAddr = anti_lag_GetDeviceProcAddr; ++ pVersionStruct->pfnGetPhysicalDeviceProcAddr = NULL; ++ } ++ ++ return VK_SUCCESS; ++} +diff --git a/src/vulkan/anti-lag-layer/meson.build b/src/vulkan/anti-lag-layer/meson.build +new file mode 100644 +index 00000000000..264c55c8e75 +--- /dev/null ++++ b/src/vulkan/anti-lag-layer/meson.build +@@ -0,0 +1,26 @@ ++# Copyright © 2025 
Valve Corporation ++# SPDX-License-Identifier: MIT ++ ++vklayer_files = files( ++ 'anti_lag_layer.c', ++ 'anti_lag_layer_interface.c', ++) ++ ++shared_library( ++ 'VkLayer_MESA_anti_lag', ++ vklayer_files, ++ c_args : [no_override_init_args], ++ gnu_symbol_visibility : 'hidden', ++ dependencies : [ ++ idep_vulkan_util, idep_mesautil, ++ ], ++ include_directories : [inc_include, inc_util, inc_src], ++ link_args : cc.get_supported_link_arguments(['-Wl,-Bsymbolic-functions', '-Wl,-z,relro']), ++ install : true ++) ++ ++install_data( ++ files('VkLayer_MESA_anti_lag.json'), ++ install_dir : join_paths(get_option('datadir'), 'vulkan', 'implicit_layer.d'), ++ install_tag : 'runtime', ++) +diff --git a/src/vulkan/anti-lag-layer/ringbuffer.h b/src/vulkan/anti-lag-layer/ringbuffer.h +new file mode 100644 +index 00000000000..1747b7e720f +--- /dev/null ++++ b/src/vulkan/anti-lag-layer/ringbuffer.h +@@ -0,0 +1,58 @@ ++/* ++ * Copyright © 2025 Valve Corporation ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#ifndef RINGBUFFER_H ++#define RINGBUFFER_H ++ ++#include "util/macros.h" ++ ++#define RINGBUFFER_DECLARE(name, type, N) \ ++ struct { \ ++ type data[N]; \ ++ uint32_t head; \ ++ uint32_t tail; \ ++ uint32_t size; \ ++ simple_mtx_t mtx; \ ++ } name ++ ++#define ringbuffer_init(buffer) \ ++ (buffer.head = buffer.tail = buffer.size = 0, simple_mtx_init(&buffer.mtx, mtx_plain)) ++ ++#define ringbuffer_lock(buffer) simple_mtx_lock(&buffer.mtx) ++#define ringbuffer_unlock(buffer) simple_mtx_unlock(&buffer.mtx) ++ ++static inline uint32_t ++__ringbuffer_add_wrap(uint32_t *val, uint32_t *size, uint32_t N) ++{ ++ uint32_t prev = *val; ++ *val = (*val + 1) % N; ++ *size = *size + 1; ++ assert(*size <= N); ++ return prev; ++} ++ ++#define ringbuffer_alloc(buffer) \ ++ (buffer.size == ARRAY_SIZE(buffer.data) \ ++ ? 
NULL \ ++ : &buffer.data[__ringbuffer_add_wrap(&buffer.head, &buffer.size, ARRAY_SIZE(buffer.data))]) ++ ++#define ringbuffer_free(buffer, elem) \ ++ assert(elem == NULL || elem == &buffer.data[buffer.tail]); \ ++ buffer.size--; \ ++ assert(buffer.size < ARRAY_SIZE(buffer.data)); \ ++ buffer.tail = (buffer.tail + 1) % ARRAY_SIZE(buffer.data) ++ ++#define ringbuffer_first(buffer) (&buffer.data[buffer.tail]) ++ ++#define ringbuffer_last(buffer) \ ++ (&buffer.data[(buffer.head + ARRAY_SIZE(buffer.data) - 1) % ARRAY_SIZE(buffer.data)]) ++ ++#define ringbuffer_index(buffer, elem) (elem - buffer.data) ++ ++#define ringbuffer_next(buffer, elem) \ ++ (&buffer.data[(ringbuffer_index(buffer, elem) + 1) % ARRAY_SIZE(buffer.data)]) ++ ++#endif /* RINGBUFFER_H */ +diff --git a/src/vulkan/meson.build b/src/vulkan/meson.build +index 3225b5f4a9d..cf62ecc6ae7 100644 +--- a/src/vulkan/meson.build ++++ b/src/vulkan/meson.build +@@ -98,3 +98,6 @@ endif + if with_vulkan_vram_report_limit_layer + subdir('vram-report-limit-layer') + endif ++if with_vulkan_anti_lag_layer ++ subdir('anti-lag-layer') ++endif +-- +2.50.1 + + +From e4adbbe12d9aafdaf80f340f685cf7bd7758d385 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= +Date: Thu, 30 May 2024 11:55:46 +0200 +Subject: [PATCH 07/11] util/time: add os_time_nanosleep_until() function + +Part-of: +--- + src/util/os_time.c | 16 +++++++++++++++- + src/util/os_time.h | 2 ++ + 2 files changed, 17 insertions(+), 1 deletion(-) + +diff --git a/src/util/os_time.c b/src/util/os_time.c +index da8ad7a80b8..209b7ae442c 100644 +--- a/src/util/os_time.c ++++ b/src/util/os_time.c +@@ -60,7 +60,21 @@ os_time_get_nano(void) + return ts.tv_nsec + ts.tv_sec*INT64_C(1000000000); + } + +- ++void ++os_time_nanosleep_until(int64_t deadline) ++{ ++#if DETECT_OS_LINUX || DETECT_OS_MANAGARM ++ struct timespec time; ++ time.tv_sec = deadline / INT64_C(1000000000); ++ time.tv_nsec = deadline % INT64_C(1000000000); ++ while 
(clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &time, &time) == EINTR); ++#else ++ int64_t duration = deadline - os_time_get_nano(); ++ if (duration > 0) { ++ os_time_sleep(duration / 1000); ++ } ++#endif ++} + + void + os_time_sleep(int64_t usecs) +diff --git a/src/util/os_time.h b/src/util/os_time.h +index 6ca37eac769..4217ff37b68 100644 +--- a/src/util/os_time.h ++++ b/src/util/os_time.h +@@ -74,6 +74,8 @@ os_localtime(const time_t *timer, struct tm *buf) + #endif + } + ++void ++os_time_nanosleep_until(int64_t deadline); + + /* + * Sleep. +-- +2.50.1 + + +From 22d1adddbaff70c62207396a12576329f477174f Mon Sep 17 00:00:00 2001 +From: Hans-Kristian Arntzen +Date: Thu, 26 Jun 2025 13:00:20 +0200 +Subject: [PATCH 08/11] anti-lag: Only consider timestamps from queues which + have presented. + +Avoids stray submissions to compute queues to nullify the delay. + +Signed-off-by: Hans-Kristian Arntzen +Part-of: +--- + src/vulkan/anti-lag-layer/anti_lag_layer.c | 24 ++++++++++++++++++- + src/vulkan/anti-lag-layer/anti_lag_layer.h | 3 +++ + .../anti-lag-layer/anti_lag_layer_interface.c | 2 ++ + 3 files changed, 28 insertions(+), 1 deletion(-) + +diff --git a/src/vulkan/anti-lag-layer/anti_lag_layer.c b/src/vulkan/anti-lag-layer/anti_lag_layer.c +index 6c21e074024..d7543a5dfd9 100644 +--- a/src/vulkan/anti-lag-layer/anti_lag_layer.c ++++ b/src/vulkan/anti-lag-layer/anti_lag_layer.c +@@ -8,6 +8,7 @@ + #include + #include "util/os_time.h" + #include "util/simple_mtx.h" ++#include "util/u_atomic.h" + #include "vulkan/vulkan_core.h" + #include "ringbuffer.h" + #include "vk_alloc.h" +@@ -400,7 +401,11 @@ get_commandbuffer(device_context *ctx, queue_context *queue_ctx, VkCommandBuffer + /* Begin critical section. */ + ringbuffer_lock(ctx->frames); + ringbuffer_lock(queue_ctx->queries); +- struct query *query = allocate_query(ctx, queue_ctx); ++ ++ /* Don't record timestamps for queues that are not deemed sensitive to latency. 
*/ ++ struct query *query = ++ p_atomic_read(&queue_ctx->latency_sensitive) ? allocate_query(ctx, queue_ctx) : NULL; ++ + if (query == NULL) { + ringbuffer_unlock(queue_ctx->queries); + ringbuffer_unlock(ctx->frames); +@@ -588,3 +593,20 @@ anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pS + vk_free(&ctx->alloc, buf); + return res; + } ++ ++VKAPI_ATTR VkResult VKAPI_CALL ++anti_lag_QueuePresentKHR(VkQueue queue, const VkPresentInfoKHR *pPresentInfo) ++{ ++ /* When multiple queues are in flight, the min-delay approach ++ * has problems. An async compute queue could be submitted to ++ * with very low delay while the main graphics queue would be swamped with work. ++ * If we take a global min-delay over all queues, the algorithm would ++ * assume that there is very low delay and thus sleeps are disabled, but ++ * unless the graphics work depends directly on the async compute work, ++ * this is a false assumption. */ ++ device_context *ctx = get_device_context(queue); ++ queue_context *queue_ctx = get_queue_context(ctx, queue); ++ p_atomic_set(&queue_ctx->latency_sensitive, true); ++ ++ return ctx->vtable.QueuePresentKHR(queue, pPresentInfo); ++} +diff --git a/src/vulkan/anti-lag-layer/anti_lag_layer.h b/src/vulkan/anti-lag-layer/anti_lag_layer.h +index 31abb0f9aee..d03d246d79c 100644 +--- a/src/vulkan/anti-lag-layer/anti_lag_layer.h ++++ b/src/vulkan/anti-lag-layer/anti_lag_layer.h +@@ -39,6 +39,7 @@ struct query { + typedef struct queue_context { + VkQueue queue; + uint32_t queue_family_idx; ++ bool latency_sensitive; + VkCommandPool cmdPool; + VkQueryPool queryPool; + VkSemaphore semaphore; +@@ -74,6 +75,7 @@ typedef struct device_context { + DECLARE_HOOK(DestroySemaphore); + DECLARE_HOOK(GetSemaphoreCounterValue); + DECLARE_HOOK(WaitSemaphores); ++ DECLARE_HOOK(QueuePresentKHR); + #undef DECLARE_HOOK + } vtable; + +@@ -105,6 +107,7 @@ VkResult anti_lag_QueueSubmit2(VkQueue queue, uint32_t submitCount, const VkSubm + VkFence fence); + 
VkResult anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, + VkFence fence); ++VkResult anti_lag_QueuePresentKHR(VkQueue queue, const VkPresentInfoKHR *pPresentInfo); + + VkResult anti_lag_NegotiateLoaderLayerInterfaceVersion(VkNegotiateLayerInterface *pVersionStruct); + +diff --git a/src/vulkan/anti-lag-layer/anti_lag_layer_interface.c b/src/vulkan/anti-lag-layer/anti_lag_layer_interface.c +index d2ca4a7dd44..6a803e24fe6 100644 +--- a/src/vulkan/anti-lag-layer/anti_lag_layer_interface.c ++++ b/src/vulkan/anti-lag-layer/anti_lag_layer_interface.c +@@ -194,6 +194,7 @@ init_device_vtable(device_context *ctx, PFN_vkGetDeviceProcAddr gpa, PFN_vkSetDe + INIT_HOOK(CmdWriteTimestamp); + INIT_HOOK(CreateSemaphore); + INIT_HOOK(DestroySemaphore); ++ INIT_HOOK(QueuePresentKHR); + INIT_HOOK_ALIAS(GetSemaphoreCounterValue, GetSemaphoreCounterValueKHR, timeline_semaphore_khr); + INIT_HOOK_ALIAS(WaitSemaphores, WaitSemaphoresKHR, timeline_semaphore_khr); + #undef INIT_HOOK +@@ -833,6 +834,7 @@ static const struct { + ADD_HOOK(QueueSubmit), + ADD_HOOK(QueueSubmit2), + ADD_HOOK(QueueSubmit2KHR), ++ ADD_HOOK(QueuePresentKHR), + }; + #undef ADD_HOOK + +-- +2.50.1 + + +From be19fb7abf7dba7aaff2ff809a6a0a8f6ac68ce4 Mon Sep 17 00:00:00 2001 +From: Hans-Kristian Arntzen +Date: Thu, 26 Jun 2025 14:22:07 +0200 +Subject: [PATCH 09/11] anti-lag: Submit timestamps early in a frame. + +Allows detecting if the queue ends up going idle due to +a cross-queue dependency. Since we're only considering delays from +specific queues, we would not be able to detect low-latency situations +arising from the start of a frame happening on async queues. + +Until we observe real work happening for a queue in a frame context, +submit timestamps ahead of any other waits. 
+ +Signed-off-by: Hans-Kristian Arntzen +Part-of: +--- + src/vulkan/anti-lag-layer/anti_lag_layer.c | 114 ++++++++++++++++----- + 1 file changed, 86 insertions(+), 28 deletions(-) + +diff --git a/src/vulkan/anti-lag-layer/anti_lag_layer.c b/src/vulkan/anti-lag-layer/anti_lag_layer.c +index d7543a5dfd9..f730ca00f9c 100644 +--- a/src/vulkan/anti-lag-layer/anti_lag_layer.c ++++ b/src/vulkan/anti-lag-layer/anti_lag_layer.c +@@ -366,13 +366,9 @@ get_queue_context(device_context *ctx, VkQueue queue) + } + + static struct query * +-allocate_query(device_context *ctx, queue_context *queue_ctx) ++allocate_query(queue_context *queue_ctx, uint32_t frame_idx) + { +- if (!ctx->active_frame) +- return NULL; +- + /* Allow for a single frame to use at most half of the query pool. */ +- uint32_t frame_idx = ringbuffer_index(ctx->frames, ctx->active_frame); + if (queue_ctx->submissions_per_frame[frame_idx] > MAX_QUERIES / 2) + return NULL; + +@@ -394,7 +390,8 @@ allocate_query(device_context *ctx, queue_context *queue_ctx) + } + + static bool +-get_commandbuffer(device_context *ctx, queue_context *queue_ctx, VkCommandBuffer *cmdbuffer) ++get_commandbuffer(device_context *ctx, queue_context *queue_ctx, VkCommandBuffer *cmdbuffer, ++ bool has_command_buffer, bool has_wait_before_cmdbuffer, bool *early_submit) + { + uint64_t now = os_time_get_nano(); + +@@ -403,8 +400,24 @@ get_commandbuffer(device_context *ctx, queue_context *queue_ctx, VkCommandBuffer + ringbuffer_lock(queue_ctx->queries); + + /* Don't record timestamps for queues that are not deemed sensitive to latency. */ +- struct query *query = +- p_atomic_read(&queue_ctx->latency_sensitive) ? 
allocate_query(ctx, queue_ctx) : NULL; ++ bool need_query = ctx->active_frame && p_atomic_read(&queue_ctx->latency_sensitive); ++ uint32_t frame_idx; ++ struct query *query = NULL; ++ ++ if (need_query) { ++ assert(ctx->active_frame->state == FRAME_SUBMIT); ++ frame_idx = ringbuffer_index(ctx->frames, ctx->active_frame); ++ ++ /* For the very first submissions in a frame (until we observe real GPU work happening), ++ * we would want to submit a timestamp before anything else, including waits. ++ * This allows us to detect a sensitive queue going idle before we can submit work to it. ++ * If the queue in question depends on semaphores from other unrelated queues, ++ * we may not easily be able to detect that situation without adding a lot more complexity. ++ */ ++ *early_submit = has_wait_before_cmdbuffer && queue_ctx->submissions_per_frame[frame_idx] == 0; ++ if (has_command_buffer || *early_submit) ++ query = allocate_query(queue_ctx, frame_idx); ++ } + + if (query == NULL) { + ringbuffer_unlock(queue_ctx->queries); +@@ -421,8 +434,6 @@ get_commandbuffer(device_context *ctx, queue_context *queue_ctx, VkCommandBuffer + queue_ctx->semaphore_value++; + + /* Add new submission entry for the current frame */ +- assert(ctx->active_frame->state == FRAME_SUBMIT); +- uint32_t frame_idx = ringbuffer_index(ctx->frames, ctx->active_frame); + queue_ctx->submissions_per_frame[frame_idx]++; + + ringbuffer_unlock(queue_ctx->queries); +@@ -435,13 +446,17 @@ queue_submit2(device_context *ctx, VkQueue queue, uint32_t submitCount, + const VkSubmitInfo2 *pSubmits, VkFence fence, PFN_vkQueueSubmit2 queueSubmit2) + { + queue_context *queue_ctx = get_queue_context(ctx, queue); +- if (!ctx->active_frame || !queue_ctx) ++ if (!ctx->active_frame || !queue_ctx || !submitCount) + return queueSubmit2(queue, submitCount, pSubmits, fence); + ++ bool has_wait_before_cmdbuffer = false; + int first = -1; + VkCommandBuffer timestamp_cmdbuffer; + /* Check if any submission contains commandbuffers. 
*/ + for (unsigned i = 0; i < submitCount; i++) { ++ if (pSubmits[i].waitSemaphoreInfoCount != 0) ++ has_wait_before_cmdbuffer = true; ++ + if (pSubmits[i].commandBufferInfoCount) { + first = i; + break; +@@ -449,23 +464,42 @@ queue_submit2(device_context *ctx, VkQueue queue, uint32_t submitCount, + } + + /* Get timestamp commandbuffer. */ +- if (first == -1 || !get_commandbuffer(ctx, queue_ctx, &timestamp_cmdbuffer)) ++ bool early_submit; ++ if (!get_commandbuffer(ctx, queue_ctx, &timestamp_cmdbuffer, first >= 0, ++ has_wait_before_cmdbuffer, &early_submit)) { + return queueSubmit2(queue, submitCount, pSubmits, fence); ++ } + + VkSubmitInfo2 *submits; + VkCommandBufferSubmitInfo *cmdbuffers; + VkSemaphoreSubmitInfo *semaphores; + VK_MULTIALLOC(ma); +- vk_multialloc_add(&ma, &submits, VkSubmitInfo2, submitCount); +- vk_multialloc_add(&ma, &cmdbuffers, VkCommandBufferSubmitInfo, +- pSubmits[first].commandBufferInfoCount + 1); +- vk_multialloc_add(&ma, &semaphores, VkSemaphoreSubmitInfo, +- pSubmits[first].signalSemaphoreInfoCount + 1); ++ ++ if (early_submit) { ++ vk_multialloc_add(&ma, &submits, VkSubmitInfo2, submitCount + 1); ++ vk_multialloc_add(&ma, &cmdbuffers, VkCommandBufferSubmitInfo, 1); ++ vk_multialloc_add(&ma, &semaphores, VkSemaphoreSubmitInfo, 1); ++ first = 0; ++ } else { ++ vk_multialloc_add(&ma, &submits, VkSubmitInfo2, submitCount); ++ vk_multialloc_add(&ma, &cmdbuffers, VkCommandBufferSubmitInfo, ++ pSubmits[first].commandBufferInfoCount + 1); ++ vk_multialloc_add(&ma, &semaphores, VkSemaphoreSubmitInfo, ++ pSubmits[first].signalSemaphoreInfoCount + 1); ++ } ++ + void *buf = vk_multialloc_zalloc(&ma, &ctx->alloc, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!buf) + return VK_ERROR_OUT_OF_HOST_MEMORY; + +- memcpy(submits, pSubmits, sizeof(VkSubmitInfo2) * submitCount); ++ if (early_submit) { ++ memcpy(submits + 1, pSubmits, sizeof(VkSubmitInfo2) * submitCount); ++ submits[0] = (VkSubmitInfo2){.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2}; ++ submitCount++; ++ }
else { ++ memcpy(submits, pSubmits, sizeof(VkSubmitInfo2) * submitCount); ++ } ++ + VkSubmitInfo2 *submit_info = &submits[first]; + + /* Add commandbuffer to submission. */ +@@ -518,13 +552,17 @@ anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pS + { + device_context *ctx = get_device_context(queue); + queue_context *queue_ctx = get_queue_context(ctx, queue); +- if (!ctx->active_frame || !queue_ctx) ++ if (!ctx->active_frame || !queue_ctx || !submitCount) + return ctx->vtable.QueueSubmit(queue, submitCount, pSubmits, fence); + ++ bool has_wait_before_cmdbuffer = false; + int first = -1; + VkCommandBuffer timestamp_cmdbuffer; +- /* Check if any submission contains commandbuffers. */ ++ /* Check if any submission contains commandbuffers or waits before those. */ + for (unsigned i = 0; i < submitCount; i++) { ++ if (pSubmits[i].waitSemaphoreCount != 0) ++ has_wait_before_cmdbuffer = true; ++ + if (pSubmits[i].commandBufferCount) { + first = i; + break; +@@ -532,8 +570,11 @@ anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pS + } + + /* Get timestamp commandbuffer. 
*/ +- if (first == -1 || !get_commandbuffer(ctx, queue_ctx, &timestamp_cmdbuffer)) ++ bool early_submit; ++ if (!get_commandbuffer(ctx, queue_ctx, &timestamp_cmdbuffer, first >= 0, ++ has_wait_before_cmdbuffer, &early_submit)) { + return ctx->vtable.QueueSubmit(queue, submitCount, pSubmits, fence); ++ } + + VkSubmitInfo *submits; + VkCommandBuffer *cmdbuffers; +@@ -541,16 +582,33 @@ anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pS + VkTimelineSemaphoreSubmitInfo *semaphore_info; + uint64_t *semaphore_values; + VK_MULTIALLOC(ma); +- vk_multialloc_add(&ma, &submits, VkSubmitInfo, submitCount); +- vk_multialloc_add(&ma, &cmdbuffers, VkCommandBuffer, pSubmits[first].commandBufferCount + 1); +- vk_multialloc_add(&ma, &semaphores, VkSemaphore, pSubmits[first].signalSemaphoreCount + 1); +- vk_multialloc_add(&ma, &semaphore_info, VkTimelineSemaphoreSubmitInfo, 1); +- vk_multialloc_add(&ma, &semaphore_values, uint64_t, pSubmits[first].signalSemaphoreCount + 1); ++ ++ if (early_submit) { ++ vk_multialloc_add(&ma, &submits, VkSubmitInfo, submitCount + 1); ++ vk_multialloc_add(&ma, &cmdbuffers, VkCommandBuffer, 1); ++ vk_multialloc_add(&ma, &semaphores, VkSemaphore, 1); ++ vk_multialloc_add(&ma, &semaphore_info, VkTimelineSemaphoreSubmitInfo, 1); ++ vk_multialloc_add(&ma, &semaphore_values, uint64_t, 1); ++ first = 0; ++ } else { ++ vk_multialloc_add(&ma, &submits, VkSubmitInfo, submitCount); ++ vk_multialloc_add(&ma, &cmdbuffers, VkCommandBuffer, pSubmits[first].commandBufferCount + 1); ++ vk_multialloc_add(&ma, &semaphores, VkSemaphore, pSubmits[first].signalSemaphoreCount + 1); ++ vk_multialloc_add(&ma, &semaphore_info, VkTimelineSemaphoreSubmitInfo, 1); ++ vk_multialloc_add(&ma, &semaphore_values, uint64_t, pSubmits[first].signalSemaphoreCount + 1); ++ } + void *buf = vk_multialloc_zalloc(&ma, &ctx->alloc, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!buf) + return VK_ERROR_OUT_OF_HOST_MEMORY; + +- memcpy(submits, pSubmits, sizeof(VkSubmitInfo) *
submitCount); ++ if (early_submit) { ++ memcpy(submits + 1, pSubmits, sizeof(VkSubmitInfo) * submitCount); ++ submits[0] = (VkSubmitInfo){.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO}; ++ submitCount++; ++ } else { ++ memcpy(submits, pSubmits, sizeof(VkSubmitInfo) * submitCount); ++ } ++ + VkSubmitInfo *submit_info = &submits[first]; + + /* Add commandbuffer to submission. */ +@@ -562,7 +620,7 @@ anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pS + + /* Add timeline semaphore to submission. */ + const VkTimelineSemaphoreSubmitInfo *tlssi = +- vk_find_struct_const(pSubmits[first].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO); ++ vk_find_struct_const(submit_info->pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO); + semaphores[0] = queue_ctx->semaphore; + memcpy(&semaphores[1], submit_info->pSignalSemaphores, + sizeof(VkSemaphore) * submit_info->signalSemaphoreCount); +-- +2.50.1 + + +From aaaa9d5cd9891b88b8a94692f0f49036233da227 Mon Sep 17 00:00:00 2001 From: Kyle Gospodnetich Date: Sun, 18 May 2025 09:40:01 -0700 -Subject: [PATCH 7/8] [BEGIN] Proton-GE Patches +Subject: [PATCH 10/11] [BEGIN] Proton-GE Patches -- 2.50.1 -From 942ac03422d32e31102e3bff506f28ab6aeca18f Mon Sep 17 00:00:00 2001 +From c4bb61d428cc14bc21f9a10f530fd37aa32a4c24 Mon Sep 17 00:00:00 2001 From: Kyle Gospodnetich Date: Sun, 18 May 2025 09:42:23 -0700 -Subject: [PATCH 8/8] radv: min image count patch for Wine Wayland/Path of +Subject: [PATCH 11/11] radv: min image count patch for Wine Wayland/Path of Exile 2 Credit to Glorious Eggroll. --- diff --git a/anda/lib/mesa/mesa.spec b/anda/lib/mesa/mesa.spec index 87cc8f93ea..7932fcdd57 100644 --- a/anda/lib/mesa/mesa.spec +++ b/anda/lib/mesa/mesa.spec @@ -81,7 +81,7 @@ Summary: Mesa graphics libraries # disabled by default, and has to be enabled manually. See `terra/release/terra-mesa.repo` for details. 
Epoch: 1 Version: 25.2.0 -Release: 1%?dist +Release: 2%?dist License: MIT AND BSD-3-Clause AND SGI-B-2.0 URL: http://www.mesa3d.org @@ -377,7 +377,7 @@ export MESON_PACKAGE_CACHE_DIR="%{cargo_registry}/" -Dgallium-rusticl=true \ %endif -Dvulkan-drivers=%{?vulkan_drivers} \ - -Dvulkan-layers=device-select \ + -Dvulkan-layers=device-select,anti-lag \ -Dshared-glapi=enabled \ -Dgles1=enabled \ -Dgles2=enabled \ @@ -630,7 +630,9 @@ popd %{_libdir}/libvulkan_lvp.so %{_datadir}/vulkan/icd.d/lvp_icd.*.json %{_libdir}/libVkLayer_MESA_device_select.so +%{_libdir}/libVkLayer_MESA_anti_lag.so %{_datadir}/vulkan/implicit_layer.d/VkLayer_MESA_device_select.json +%{_datadir}/vulkan/implicit_layer.d/VkLayer_MESA_anti_lag.json %if 0%{?with_virtio} %{_libdir}/libvulkan_virtio.so %{_datadir}/vulkan/icd.d/virtio_icd.*.json