builtin-programs/gpu/textures.folk

When the GPU Vulkan handle type definer is /defineVulkanHandleType/ &\
     the GPU library is /gpuLib/ &\
     the GPU VMA DLL is /vmaDll/ &\
     the image library is /imageLib/ {

# NOTE(review): presumably makes the matched /defineVulkanHandleType/
# definer callable as a command in this scope -- confirm against the
# definition of `fn`.
fn defineVulkanHandleType

# Create the C compilation unit that will hold the texture manager.
set gpuc [C]
$gpuc cflags -I./vendor
# The VMA DLL path is passed as a trailing compiler flag; presumably
# this links the vmaInit/vmaGetAllocator implementations declared
# below -- confirm.
$gpuc endcflags $vmaDll
$gpuc code {
    // VOLK_IMPLEMENTATION makes this translation unit carry volk's
    // implementation in addition to its declarations.
    #define VOLK_IMPLEMENTATION
    #include "volk/volk.h"

    #include "vk_mem_alloc.h"

    // Declared here, defined elsewhere (not in this file).
    void vmaInit(VkInstance instance, VkPhysicalDevice physicalDevice, VkDevice device,
                 PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr,
                 PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr,
                 PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties,
                 PFN_vkGetPhysicalDeviceMemoryProperties vkGetPhysicalDeviceMemoryProperties);
    VmaAllocator vmaGetAllocator();
}
$gpuc include <pthread.h>
$gpuc include <stdlib.h>
$gpuc include <string.h>
$gpuc include <stdatomic.h>

# Pull in the GPU library and the image library so that their
# definitions (for example instance_ptr, device_ptr, getMaxTextures
# and Image, all used below) are usable from this unit.
$gpuc extend $gpuLib
$gpuc extend $imageLib

# NOTE(review): <pthread.h> was already included above; this second
# include looks redundant unless `extend` affects earlier includes --
# confirm before removing.
$gpuc include <pthread.h>

# VmaAllocation is an opaque VMA handle. It crosses the Tcl/C boundary
# as the string "(VmaAllocation) <pointer>".
#
# argtype: parse that string back into a handle. The handle starts out
# as NULL so that a string which does not match the expected format
# yields a null handle rather than an uninitialized pointer.
#
# rtype: format a handle as that string.
$gpuc typedef {struct VmaAllocation_T*} VmaAllocation
$gpuc argtype VmaAllocation {
    VmaAllocation $argname = NULL;
    sscanf(Jim_String($obj), "(VmaAllocation) %p", &$argname);
}
$gpuc rtype VmaAllocation {
    char buf[100];
    snprintf(buf, sizeof(buf), "(VmaAllocation) %p", $rvalue);
    $robj = Jim_NewStringObj(interp, buf, -1);
}

# vktry CALL
#
# Returns C source (flattened onto one line) for a statement block that
# evaluates the Vulkan call CALL and, if the result is not VK_SUCCESS,
# prints the failing call together with its VkResult to stderr and
# exits with status 1. Intended for use inside C bodies in this file.
local proc vktry {call} {
    set checkedCall [csubst {{
    VkResult res = $call;
    if (res != VK_SUCCESS) {
        fprintf(stderr, "Failed $call: %s (%d)\n",
                VkResultToString(res), res); exit(1);
    }
}}]
    return [string map {\n " "} $checkedCall]
}

# Texture management:

# The technique used to manage textures here is to have a
# single giant descriptor set for a giant GPU-side array of
# textures, which all shaders can access. (That descriptor set
# _never_ has to be rebound; it stays bound through all draw
# calls, forever.)
# 
# Each texture has to be 'copied to the GPU' before you do any
# draw calls that use it. Copying a texture to the GPU gives
# you a GPU-side texture handle, which is just an integer index
# into the GPU-side array. You can pass that texture handle
# into draw calls as a parameter (push constant) when you
# want to draw/use the texture.
#
# See:
# - http://kylehalladay.com/blog/tutorial/vulkan/2018/01/28/Textue-Arrays-Vulkan.html
# - https://chunkstories.xyz/blog/a-note-on-descriptor-indexing/
# - https://gist.github.com/DethRaid/0171f3cfcce51950ee4ef96c64f59617
# - http://roar11.com/2019/06/vulkan-textures-unbound/
# Layout and descriptor set for the single global texture array. The
# rest of this file reaches them through textureDescriptorSetLayout_ptr()
# and textureDescriptorSet_ptr() (presumably generated by `define` --
# confirm).
$gpuc define {
    VkDescriptorSetLayout textureDescriptorSetLayout;
    VkDescriptorSet textureDescriptorSet;
}
defineVulkanHandleType $gpuc VkDescriptorSetLayout
defineVulkanHandleType $gpuc VkDescriptorSet

$gpuc code {
    // Logical device used by every Vulkan call in this file; assigned
    // in textureManagerInit.
    VkDevice device;
    // Forward declaration: textureManagerInit calls this before the
    // definition appears further down.
    static void initPlaceholderTexture();
}

# A GpuTextureHandle is an index into the gpuTextures array (and into
# the GPU-side texture array of the descriptor set).
$gpuc typedef int GpuTextureHandle
defineVulkanHandleType $gpuc VkImage
defineVulkanHandleType $gpuc VkDeviceMemory
defineVulkanHandleType $gpuc VkImageView
defineVulkanHandleType $gpuc VkSampler
# Per-slot bookkeeping for one texture:
#   alive             - slot is claimed; set by allocateGpuTextureHandle
#                       and cleared last when the resources are destroyed.
#   retiring          - a free was requested; resources are destroyed
#                       once textureFrameEpoch reaches retireAfterFrame.
#   width, height     - dimensions passed to createGpuTexture.
#   retireAfterFrame  - frame epoch at which a retiring slot may be
#                       destroyed.
#   handle            - this slot's own index.
#   textureImage, textureImageAllocation, textureImageView,
#   textureSampler    - the Vulkan/VMA objects backing the texture.
#   description       - heap-allocated "<w>x<h> texture" label.
$gpuc struct GpuTextureBlock {
    bool _Atomic alive;
    bool _Atomic retiring;

    int width;
    int height;
    int retireAfterFrame;

    GpuTextureHandle handle;

    VkImage textureImage;
    VmaAllocation textureImageAllocation;
    VkImageView textureImageView;
    VkSampler textureSampler;

    // mostly for debugging:
    char* description;
}
$gpuc code {
    // Array of GpuTextureBlocks. Each element points to all GPU-side
    // data structures associated with a particular texture (that we
    // will destroy when we evict that texture). Allocated in
    // textureManagerInit with getMaxTextures() entries.
    struct GpuTextureBlock* gpuTextures;

    // Deferred descriptor-set updates and resource destruction,
    // drained once per frame on the GPU thread when the GPU is idle.
    //   DEFERRED_ADD:  write the texture's descriptor slot.
    //   DEFERRED_FREE: mark the texture as retiring.
    enum DeferredTextureOp { DEFERRED_ADD, DEFERRED_FREE };
    struct DeferredTextureEntry {
        enum DeferredTextureOp op;
        GpuTextureHandle handle;
    };
    // Fixed capacity; enqueueDeferredTextureOp exits the process if
    // the queue is full.
    #define DEFERRED_QUEUE_CAP 256
    struct DeferredTextureEntry deferredQueue[DEFERRED_QUEUE_CAP];
    int _Atomic deferredQueueCount = 0;
    // Held while appending to or draining deferredQueue.
    pthread_mutex_t deferredQueueMutex = PTHREAD_MUTEX_INITIALIZER;

    // A retired texture is destroyed once textureFrameEpoch has
    // advanced this many frames past the frame it was retired in.
    #define TEXTURE_RETIRE_GRACE_FRAMES 2
    // Incremented once per call to beginTextureFrame.
    int textureFrameEpoch = 0;
}
# Initialize the texture manager: load Vulkan entry points through
# volk, allocate the texture table, create the descriptor set layout,
# pool and set for the global texture array, initialize the VMA
# allocator, and upload the placeholder texture into slot 0.
# Any failure is fatal (the process exits).
$gpuc proc textureManagerInit {} void {
    $[vktry volkInitialize()]
    volkLoadInstanceOnly(*instance_ptr());

    device = *device_ptr();
    volkLoadDevice(device);

    // One zero-initialized block per slot, so every slot starts out
    // with alive == false.
    gpuTextures = calloc(getMaxTextures(), sizeof(GpuTextureBlock));
    if (gpuTextures == NULL) {
        fprintf(stderr, "gpu/textures: Failed to allocate texture table (%d slots)\n",
                getMaxTextures());
        exit(1);
    }

    // Set up textureDescriptorSetLayout:
    {
        /* VkDescriptorBindingFlags flags[1]; */
        /* flags[0] = VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT; */
        /* flags[0] = VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT; */

        /* VkDescriptorSetLayoutBindingFlagsCreateInfo bindingFlags = {0}; */
        /* bindingFlags.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO; */
        /* bindingFlags.bindingCount = 1; */
        /* bindingFlags.pBindingFlags = flags; */

        VkDescriptorSetLayoutBinding bindings[1];
        memset(bindings, 0, sizeof(bindings));
        bindings[0].binding = 0;
        bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
        bindings[0].descriptorCount = getMaxTextures();
        bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;

        VkDescriptorSetLayoutCreateInfo createInfo = {0};
        createInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
        createInfo.bindingCount = 1;
        createInfo.pBindings = bindings;
        /* createInfo.pNext = &bindingFlags; */

        $[vktry {vkCreateDescriptorSetLayout(device, &createInfo, NULL, textureDescriptorSetLayout_ptr())}]
    }

    VkDescriptorPool descriptorPool; {
        // The one set allocated below needs getMaxTextures() combined
        // image samplers, so size the pool from the same value rather
        // than a fixed number that can fall short of the layout.
        VkDescriptorPoolSize poolSize = {0};
        poolSize.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
        poolSize.descriptorCount = getMaxTextures();

        VkDescriptorPoolCreateInfo poolInfo = {0};
        poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
        poolInfo.poolSizeCount = 1;
        poolInfo.pPoolSizes = &poolSize;
        poolInfo.maxSets = 100;
        $[vktry {vkCreateDescriptorPool(device, &poolInfo, NULL, &descriptorPool)}]
    }

    // Set up textureDescriptorSet:
    {
        VkDescriptorSetAllocateInfo allocInfo = {0};
        allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
        allocInfo.descriptorPool = descriptorPool;
        allocInfo.descriptorSetCount = 1;
        allocInfo.pSetLayouts = textureDescriptorSetLayout_ptr();

        $[vktry {vkAllocateDescriptorSets(device, &allocInfo, textureDescriptorSet_ptr())}]
    }
    // Initialize VMA allocator
    vmaInit(*instance_ptr(), *physicalDevice_ptr(), device,
            vkGetInstanceProcAddr,
            vkGetDeviceProcAddr,
            vkGetPhysicalDeviceProperties,
            vkGetPhysicalDeviceMemoryProperties);

    initPlaceholderTexture();
}

# Buffer allocation:
$gpuc code [csubst {
    // Return the index of the first memory type of the physical device
    // that is allowed by typeFilter (bit i set means type i is
    // acceptable) and has all of the requested property flags.
    // Exits the process if no memory type qualifies.
    uint32_t findMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) {
        VkPhysicalDeviceMemoryProperties memProperties;
        vkGetPhysicalDeviceMemoryProperties(*physicalDevice_ptr(), &memProperties);

        for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
            // Unsigned shift: a signed 1 shifted by 31 is undefined.
            const uint32_t typeBit = ((uint32_t) 1) << i;
            if ((typeFilter & typeBit) &&
                (memProperties.memoryTypes[i].propertyFlags & properties) == properties) {
                return i;
            }
        }

        fprintf(stderr, "gpu/textures: No suitable memory type (filter 0x%x, properties 0x%x)\n",
                (unsigned) typeFilter, (unsigned) properties);
        exit(1);
    }
}]

$gpuc typedef size_t VkDeviceSize false
$gpuc typedef uint32_t VkBufferUsageFlags false
$gpuc typedef uint32_t VkMemoryPropertyFlags false
# Create a Vulkan buffer of `size` bytes with the given usage through
# VMA, storing the buffer and its allocation through the out-pointers.
#
# Only two bits of `properties` are acted on:
#   DEVICE_LOCAL -> required memory property flag
#   HOST_VISIBLE -> sequential-write host access + persistently mapped
# Exits the process if the allocation fails.
$gpuc proc createBuffer {VkDeviceSize size VkBufferUsageFlags usage VkMemoryPropertyFlags properties
                         VkBuffer* buffer VmaAllocation* allocation} void {
    VkBufferCreateInfo bufferInfo = {0};
    bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    bufferInfo.size = size;
    bufferInfo.usage = usage;
    bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;

    VmaAllocationCreateInfo allocInfo = {0};
    allocInfo.usage = VMA_MEMORY_USAGE_AUTO;
    if (properties & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) {
        allocInfo.requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
    }
    if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
        allocInfo.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
    }

    VkResult res = vmaCreateBuffer(vmaGetAllocator(), &bufferInfo, &allocInfo, buffer, allocation, NULL);
    if (res != VK_SUCCESS) {
        // Single backslash, as in the other messages of this file, so
        // that the C compiler sees a newline escape.
        fprintf(stderr, "Failed to create buffer with VMA: %d\n", res);
        exit(1);
    }

#ifdef TRACY_ENABLE
    VmaAllocationInfo vmaInfo;
    vmaGetAllocationInfo(vmaGetAllocator(), *allocation, &vmaInfo);
    TracyCAlloc(*allocation, vmaInfo.size);
#endif
}

# Texture allocation:
$gpuc code [csubst {
    // Create a single-mip, single-layer, single-sample 2D image of the
    // given size, format, tiling and usage through VMA, storing the
    // image and its allocation through the out-pointers. The image
    // starts in VK_IMAGE_LAYOUT_UNDEFINED.
    //
    // Only two bits of properties are acted on:
    //   DEVICE_LOCAL -> required memory property flag
    //   HOST_VISIBLE -> sequential-write host access
    // Exits the process if the allocation fails.
    void createImage(uint32_t width, uint32_t height,
                     VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage, VkMemoryPropertyFlags properties,
                     VkImage* image, VmaAllocation* allocation) {
        VkImageCreateInfo imageInfo = {0};
        imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
        imageInfo.imageType = VK_IMAGE_TYPE_2D;
        imageInfo.extent.width = width;
        imageInfo.extent.height = height;
        imageInfo.extent.depth = 1;
        imageInfo.mipLevels = 1;
        imageInfo.arrayLayers = 1;
        imageInfo.format = format;
        imageInfo.tiling = tiling;
        // TODO: this means it can't be drawn right away (validation error).
        imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        imageInfo.usage = usage;
        imageInfo.samples = VK_SAMPLE_COUNT_1_BIT;
        imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;

        VmaAllocationCreateInfo allocInfo = {0};
        allocInfo.usage = VMA_MEMORY_USAGE_AUTO;
        if (properties & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) {
            allocInfo.requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        }
        if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
            allocInfo.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
        }

        VkResult res = vmaCreateImage(vmaGetAllocator(), &imageInfo, &allocInfo, image, allocation, NULL);
        if (res != VK_SUCCESS) {
            // Single backslash, as in the other messages of this file,
            // so that the C compiler sees a newline escape.
            fprintf(stderr, "Failed to create image with VMA: %d\n", res);
            exit(1);
        }

#ifdef TRACY_ENABLE
        VmaAllocationInfo vmaInfo;
        vmaGetAllocationInfo(vmaGetAllocator(), *allocation, &vmaInfo);
        TracyCAlloc(*allocation, vmaInfo.size);
#endif
    }
}]

defineVulkanHandleType $gpuc VkCommandBuffer
defineVulkanHandleType $gpuc VkFence
# Fetch a command buffer from getCommandBuffer() and begin recording
# into it with the one-time-submit usage flag. Returns the command
# buffer; exits the process if recording cannot be started.
$gpuc proc beginSingleTimeCommands {} VkCommandBuffer {
    VkCommandBuffer commandBuffer = getCommandBuffer();

    VkCommandBufferBeginInfo beginInfo = {0};
    beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

    $[vktry {vkBeginCommandBuffer(commandBuffer, &beginInfo)}]

    return commandBuffer;
}
# Finish recording commandBuffer and submit it to the graphics queue,
# signaling `fence` on completion. Does not wait for the fence.
# The graphics queue mutex is held around the submission. A failed
# end or submit is fatal: otherwise the caller would wait on a fence
# that is never signaled.
$gpuc proc endSingleTimeCommands {VkCommandBuffer commandBuffer VkFence fence} void {
    $[vktry {vkEndCommandBuffer(commandBuffer)}]

    VkSubmitInfo submitInfo = {0};
    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submitInfo.commandBufferCount = 1;
    submitInfo.pCommandBuffers = &commandBuffer;

    pthread_mutex_lock(graphicsQueueMutex_ptr());
    $[vktry {vkQueueSubmit(*graphicsQueue_ptr(), 1, &submitInfo, fence)}]
    pthread_mutex_unlock(graphicsQueueMutex_ptr());
}

$gpuc code {
    // One fence per thread, created on first use by getFence.
    static __thread VkFence _fence = VK_NULL_HANDLE;
}
# Return the calling thread's fence in the unsignaled state: an
# existing fence is reset, otherwise a new one is created (creation
# failure is fatal).
$gpuc proc getFence {} VkFence {
    if (_fence != VK_NULL_HANDLE) {
        vkResetFences(device, 1, &_fence);
        return _fence;
    }

    VkFenceCreateInfo createInfo = {0};
    createInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
    $[vktry {vkCreateFence(device, &createInfo, NULL, &_fence)}]
    return _fence;
}

$gpuc typedef int VkFormat false
$gpuc typedef int VkImageLayout false
# Record and submit a pipeline barrier that moves `image` from
# oldLayout to newLayout, then block until the GPU has executed it.
#
# Supported transitions:
#   UNDEFINED            -> TRANSFER_DST_OPTIMAL
#   TRANSFER_DST_OPTIMAL -> SHADER_READ_ONLY_OPTIMAL
#   UNDEFINED            -> SHADER_READ_ONLY_OPTIMAL
# Any other pair is fatal (exit status 91). `format` is not used.
$gpuc proc transitionImageLayout {VkImage image VkFormat format
                                  VkImageLayout oldLayout VkImageLayout newLayout} void {
    VkFence fence = getFence();

    VkCommandBuffer commandBuffer = beginSingleTimeCommands();

    VkImageMemoryBarrier barrier = {0};
    barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
    barrier.oldLayout = oldLayout;
    barrier.newLayout = newLayout;
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.image = image;
    barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    barrier.subresourceRange.baseMipLevel = 0;
    barrier.subresourceRange.levelCount = 1;
    barrier.subresourceRange.baseArrayLayer = 0;
    barrier.subresourceRange.layerCount = 1;

    VkPipelineStageFlags sourceStage;
    VkPipelineStageFlags destinationStage;
    if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
        barrier.srcAccessMask = 0;
        barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;

        sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
    } else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
        barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;

        sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
        destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
    } else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
        barrier.srcAccessMask = 0;
        barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;

        sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
    } else {
        // Say why we are exiting; the status code is kept as before.
        fprintf(stderr, "gpu/textures: Unsupported image layout transition (%d -> %d)\n",
                (int) oldLayout, (int) newLayout);
        exit(91);
    }
    vkCmdPipelineBarrier(commandBuffer,
                         sourceStage, destinationStage,
                         0,
                         0, NULL,
                         0, NULL,
                         1, &barrier);

    endSingleTimeCommands(commandBuffer, fence);
    // HACK: this wait is so that the commandBuffer is usable afterward.
    $[vktry {vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX)}]
}

# Copy im into ret, expanding to four channels.
#   4 channels: rows are copied verbatim.
#   3 channels: RGB is copied and alpha is set to 255.
#   1 channel:  the value is replicated into R, G and B; alpha is 255.
# im and ret must have the same width and height.
$gpuc proc copyImageToRgba {Image im Image ret} void {
    FOLK_ENSURE(im.width == ret.width && im.height == ret.height);

    if (im.components == 4) {
        if (ret.bytesPerRow != im.bytesPerRow) {
            // Strides differ: copy the pixel bytes of each row separately.
            for (int row = 0; row < im.height; row++) {
                memcpy(ret.data + row*ret.bytesPerRow,
                       im.data + row*im.bytesPerRow,
                       im.width*4);
            }
        } else {
            // Same stride on both sides: one bulk copy.
            memcpy(ret.data, im.data, ret.bytesPerRow * ret.height);
        }
        return;
    }

    FOLK_ENSURE(im.components == 1 || im.components == 3);

    // Source byte offsets of green and blue relative to the first
    // byte of the pixel. A grayscale pixel reuses its single byte.
    const int greenOffset = (im.components == 3) ? 1 : 0;
    const int blueOffset = (im.components == 3) ? 2 : 0;

    for (int row = 0; row < im.height; row++) {
        for (int col = 0; col < im.width; col++) {
            int srcIdx = row*im.bytesPerRow + col*im.components;
            int dstIdx = row*ret.bytesPerRow + col*ret.components;

            // Read the whole source pixel before writing anything.
            int red = im.data[srcIdx];
            int green = im.data[srcIdx + greenOffset];
            int blue = im.data[srcIdx + blueOffset];

            ret.data[dstIdx+0] = red;
            ret.data[dstIdx+1] = green;
            ret.data[dstIdx+2] = blue;
            ret.data[dstIdx+3] = 255;
        }
    }
}

$gpuc code [csubst {
    // Claim the lowest-numbered free slot of gpuTextures and return
    // its index. A slot is claimed by atomically flipping its alive
    // flag from false to true.
    //
    // If every slot is taken, the table is dumped to stderr and the
    // scan is retried after 5 ms, indefinitely.
    GpuTextureHandle allocateGpuTextureHandle() {
        for (;;) {
            for (int i = 0; i < getMaxTextures(); i++) {
                bool expected = false;
                // Strong variant: the weak one may fail spuriously,
                // which in a single pass would skip a free slot.
                if (atomic_compare_exchange_strong(&gpuTextures[i].alive, &expected, true)) {
                    gpuTextures[i].handle = i;
                    return i;
                }
            }
            fprintf(stderr, "gpu/textures: Exceeded GPU max textures (%d):\n", getMaxTextures());
            for (int i = 0; i < getMaxTextures(); i++) {
                // A slot can be alive before its description has been
                // set, so never hand a NULL pointer to the %s format.
                const char* description = gpuTextures[i].description;
                if (description == NULL) { description = "<no description>"; }
                fprintf(stderr, "  %d: %s\n", i, gpuTextures[i].alive ? description : "<not alive>");
            }
            struct timespec ts = {0, 5000000};
            nanosleep(&ts, NULL);
        }
    }
}]

$gpuc code {
    // Append (op, handle) to the deferred queue under the queue mutex.
    // A full queue is fatal.
    static void enqueueDeferredTextureOp(enum DeferredTextureOp op, GpuTextureHandle handle) {
        pthread_mutex_lock(&deferredQueueMutex);
        if (deferredQueueCount >= DEFERRED_QUEUE_CAP) {
            fprintf(stderr, "gpu/textures: Deferred queue full (%d)\n", DEFERRED_QUEUE_CAP);
            exit(1);
        }
        const struct DeferredTextureEntry entry = { .op = op, .handle = handle };
        const int slot = deferredQueueCount++;
        deferredQueue[slot] = entry;
        pthread_mutex_unlock(&deferredQueueMutex);
    }

    // Point descriptor array element textureId at that texture's own
    // image view and sampler. Must only be called on the GPU thread
    // between frames (or during init).
    static void writeTextureDescriptor(GpuTextureHandle textureId) {
        const struct GpuTextureBlock* texture = &gpuTextures[textureId];

        VkDescriptorImageInfo imageInfo = {
            .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
            .imageView = texture->textureImageView,
            .sampler = texture->textureSampler,
        };

        VkWriteDescriptorSet write = {
            .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
            .dstSet = *textureDescriptorSet_ptr(),
            .dstBinding = 0,
            .dstArrayElement = textureId,
            .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
            .descriptorCount = 1,
            .pImageInfo = &imageInfo,
        };

        vkUpdateDescriptorSets(device, 1, &write, 0, NULL);
    }
}

# Point every element of the texture descriptor array at the texture
# in slot firstTextureId.
$gpuc proc initializeDescriptorSet {GpuTextureHandle firstTextureId} void {
    // Hack: if we're not using the descriptor indexing extension,
    // we can't have a partially bound descriptor set, so we need
    // to fill all the slots in the texture array with _something_.
    // We just fill all slots with the first texture for now. See
    // http://roar11.com/2019/06/vulkan-textures-unbound/
    VkDescriptorImageInfo imageInfo = {0};
    imageInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
    imageInfo.imageView = gpuTextures[firstTextureId].textureImageView;
    imageInfo.sampler = gpuTextures[firstTextureId].textureSampler;

    // One reusable write structure, issued once per slot. This keeps
    // stack use constant instead of placing getMaxTextures() write
    // structures in a variable-length array on the stack.
    VkWriteDescriptorSet descriptorWrite = {0};
    descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
    descriptorWrite.dstSet = *textureDescriptorSet_ptr();
    descriptorWrite.dstBinding = 0;
    descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
    descriptorWrite.descriptorCount = 1;
    descriptorWrite.pImageInfo = &imageInfo;

    const int maxTextures = getMaxTextures();
    for (int i = 0; i < maxTextures; i++) {
        descriptorWrite.dstArrayElement = i;
        vkUpdateDescriptorSets(device, 1, &descriptorWrite, 0, NULL);
    }
}

# Queue a DEFERRED_ADD for textureId. The descriptor slot itself is
# written later, when drainDeferredTextureOps processes the queue.
$gpuc proc addToTextureDescriptorSet {GpuTextureHandle textureId} void {
    enqueueDeferredTextureOp(DEFERRED_ADD, textureId);
}

# Return a pointer to the bookkeeping block for `handle`. The handle
# is used as an index without any range check.
$gpuc proc getGpuTexture {GpuTextureHandle handle} GpuTextureBlock* {
    GpuTextureBlock* block = gpuTextures + handle;
    return block;
}

# Claim a texture slot and create its image, image view and sampler.
# `format` is a VkFormat passed as an int. Returns the slot's block.
#
# NOTE: The caller must call addToTextureDescriptorSet at some point
# after calling this to actually use the texture.
$gpuc proc createGpuTexture {int width int height int format} GpuTextureBlock* {
    GpuTextureHandle textureId = allocateGpuTextureHandle();
    GpuTextureBlock* block = &gpuTextures[textureId];

    block->width = width;
    block->height = height;
    block->retiring = false;
    block->retireAfterFrame = 0;

    createImage(width, height,
                (VkFormat) format, VK_IMAGE_TILING_OPTIMAL,
                VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
                VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                &block->textureImage, &block->textureImageAllocation);

    // Set up block->textureImageView:
    {
        VkImageViewCreateInfo viewInfo = {0};
        viewInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
        viewInfo.image = block->textureImage;
        viewInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
        viewInfo.format = (VkFormat) format;
        viewInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        viewInfo.subresourceRange.baseMipLevel = 0;
        viewInfo.subresourceRange.levelCount = 1;
        viewInfo.subresourceRange.baseArrayLayer = 0;
        viewInfo.subresourceRange.layerCount = 1;
        $[vktry {vkCreateImageView(device, &viewInfo, NULL, &block->textureImageView)}]
    }
    // Set up block->textureSampler:
    {
        VkSamplerCreateInfo samplerInfo = {0};
        samplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
        samplerInfo.magFilter = VK_FILTER_LINEAR;
        samplerInfo.minFilter = VK_FILTER_LINEAR;
        samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT;
        samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT;
        samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT;
        samplerInfo.anisotropyEnable = VK_FALSE; // TODO: do we want this?
        samplerInfo.borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK;
        samplerInfo.unnormalizedCoordinates = VK_FALSE;
        samplerInfo.compareEnable = VK_FALSE;
        samplerInfo.compareOp = VK_COMPARE_OP_ALWAYS;
        samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR;
        samplerInfo.mipLodBias = 0.0f;
        samplerInfo.minLod = 0.0f;
        samplerInfo.maxLod = 0.0f;
        $[vktry {vkCreateSampler(device, &samplerInfo, NULL, &block->textureSampler)}]
    }

    // The description is only a debugging aid, so running out of
    // memory here leaves it NULL instead of writing through NULL.
    block->description = malloc(32);
    if (block->description != NULL) {
        snprintf(block->description, 32, "%dx%d texture", width, height);
    }
    return block;
}

# Per-thread ring of reusable texture upload slots. A slot's staging
# buffer stays alive until the slot comes up for reuse and the fence
# of its previous upload has signaled.
$gpuc code {
    #define INFLIGHT_UPLOADS 8
    struct InflightUpload {
        VkCommandBuffer cmdBuffer;
        VkFence fence;
        VkBuffer stagingBuffer;
        VmaAllocation stagingBufferAllocation;
        // True from the moment an upload was submitted from this slot
        // until the slot is reclaimed.
        bool inUse;
    };
    static __thread struct InflightUpload _inflightUploads[INFLIGHT_UPLOADS];
    static __thread int _inflightUploadsNext = 0;

    // Exit with a diagnostic if a Vulkan call made while preparing an
    // upload slot did not succeed.
    static void checkInflightUploadResult(VkResult res, const char* what) {
        if (res != VK_SUCCESS) {
            fprintf(stderr, "gpu/textures: Failed %s: %d\n", what, res);
            exit(1);
        }
    }

    // Take the next slot of the ring. If it still holds an outstanding
    // upload, wait for its fence and destroy its staging buffer.
    // Returns the slot with an unsignaled fence and a reset (or newly
    // allocated) command buffer, ready to record into.
    static struct InflightUpload* acquireInflightUpload() {
        struct InflightUpload* slot = &_inflightUploads[_inflightUploadsNext];
        _inflightUploadsNext = (_inflightUploadsNext + 1) % INFLIGHT_UPLOADS;

        if (slot->inUse) {
#ifdef TRACY_ENABLE
            TracyCZoneN(ctx, "vkWaitForFences (ring full)", 1);
#endif
            VkResult waitResult = vkWaitForFences(device, 1, &slot->fence, VK_TRUE, UINT64_MAX);
#ifdef TRACY_ENABLE
            TracyCZoneEnd(ctx);
            TracyCFree(slot->stagingBufferAllocation);
#endif
            // The staging buffer may only be destroyed once the GPU is
            // known to be done with it.
            checkInflightUploadResult(waitResult, "vkWaitForFences");
            vmaDestroyBuffer(vmaGetAllocator(),
                             slot->stagingBuffer, slot->stagingBufferAllocation);
            slot->stagingBuffer = VK_NULL_HANDLE;
            slot->stagingBufferAllocation = NULL;
            slot->inUse = false;
        }
        if (slot->fence == VK_NULL_HANDLE) {
            VkFenceCreateInfo fenceInfo = {0};
            fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
            checkInflightUploadResult(vkCreateFence(device, &fenceInfo, NULL, &slot->fence),
                                      "vkCreateFence");
        } else {
            checkInflightUploadResult(vkResetFences(device, 1, &slot->fence),
                                      "vkResetFences");
        }
        if (slot->cmdBuffer == VK_NULL_HANDLE) {
            VkCommandBufferAllocateInfo allocInfo = {0};
            allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
            allocInfo.commandPool = getCommandPool();
            allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
            allocInfo.commandBufferCount = 1;
            checkInflightUploadResult(vkAllocateCommandBuffers(device, &allocInfo, &slot->cmdBuffer),
                                      "vkAllocateCommandBuffers");
        } else {
            checkInflightUploadResult(vkResetCommandBuffer(slot->cmdBuffer, 0),
                                      "vkResetCommandBuffer");
        }
        return slot;
    }
}

# Upload im (1, 3 or 4 channels) as a new RGBA sRGB texture and return
# its handle. The pixels are converted into a staging buffer, a copy
# into a newly created texture is recorded and submitted, and a
# descriptor update for the new slot is queued. The call does not wait
# for the GPU to finish the copy.
$gpuc proc copyImageToGpuTexture {Image im} GpuTextureHandle {
    // Validate before taking an upload slot, and compute the size in
    // size_t so that large images cannot overflow int arithmetic.
    FOLK_ENSURE(im.width > 0 && im.height > 0);
    size_t size = (size_t) im.width * (size_t) im.height * 4;

    struct InflightUpload* upload = acquireInflightUpload();

    createBuffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
                 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
                 &upload->stagingBuffer, &upload->stagingBufferAllocation);

    // Copy im to stagingBuffer:
    {
        void* data;
        $[vktry {vmaMapMemory(vmaGetAllocator(), upload->stagingBufferAllocation, &data)}]
        Image stagingIm = (Image) {
            .width = im.width, .height = im.height,
            .components = 4,
            .bytesPerRow = im.width * 4,
            .data = data
        };
        copyImageToRgba(im, stagingIm);
        // createBuffer does not require host-coherent memory, so make
        // the CPU writes visible to the GPU explicitly. Flushing
        // coherent memory is harmless.
        $[vktry {vmaFlushAllocation(vmaGetAllocator(), upload->stagingBufferAllocation, 0, VK_WHOLE_SIZE)}]
        vmaUnmapMemory(vmaGetAllocator(), upload->stagingBufferAllocation);
    }

    // Allocate a texture and texture block:
    GpuTextureBlock* block = createGpuTexture(im.width, im.height, VK_FORMAT_R8G8B8A8_SRGB);

    // Record + submit staging buffer -> image copy. We do NOT wait on
    // the fence here; a later call to acquireInflightUpload will
    // reclaim this slot's staging buffer once the GPU is done.
    {
        VkCommandBuffer commandBuffer = upload->cmdBuffer;

        VkCommandBufferBeginInfo beginInfo = {0};
        beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
        beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
        $[vktry {vkBeginCommandBuffer(commandBuffer, &beginInfo)}]

        // Transition to transfer destination
        VkImageMemoryBarrier barrier = {0};
        barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barrier.image = block->textureImage;
        barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barrier.subresourceRange.baseMipLevel = 0;
        barrier.subresourceRange.levelCount = 1;
        barrier.subresourceRange.baseArrayLayer = 0;
        barrier.subresourceRange.layerCount = 1;
        barrier.srcAccessMask = 0;
        barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;

        vkCmdPipelineBarrier(commandBuffer,
                             VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                             0, 0, NULL, 0, NULL, 1, &barrier);

        // Copy buffer to image
        VkBufferImageCopy region = {0};
        region.bufferOffset = 0;
        region.bufferRowLength = 0;
        region.bufferImageHeight = 0;

        region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        region.imageSubresource.mipLevel = 0;
        region.imageSubresource.baseArrayLayer = 0;
        region.imageSubresource.layerCount = 1;

        region.imageOffset = (VkOffset3D) {0, 0, 0};
        region.imageExtent = (VkExtent3D) {im.width, im.height, 1};
        vkCmdCopyBufferToImage(commandBuffer,
                               upload->stagingBuffer,
                               block->textureImage,
                               VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                               1,
                               &region);

        // Transition to shader read-only
        barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;

        vkCmdPipelineBarrier(commandBuffer,
                             VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
                             0, 0, NULL, 0, NULL, 1, &barrier);

        $[vktry {vkEndCommandBuffer(commandBuffer)}]

        VkSubmitInfo submitInfo = {0};
        submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
        submitInfo.commandBufferCount = 1;
        submitInfo.pCommandBuffers = &commandBuffer;

        // A failed submit is fatal: the slot's fence would never be
        // signaled and a later acquireInflightUpload would wait on it
        // forever.
        pthread_mutex_lock(graphicsQueueMutex_ptr());
        $[vktry {vkQueueSubmit(*graphicsQueue_ptr(), 1, &submitInfo, upload->fence)}]
        pthread_mutex_unlock(graphicsQueueMutex_ptr());
    }

    upload->inUse = true;
    addToTextureDescriptorSet(block->handle);

    return block->handle;
}
# Make descriptor slot oldHandle refer to the image view and sampler
# of texture newHandle. Must only be called on the GPU thread between
# frames (used by canvases).
$gpuc proc replaceInTextureDescriptorSet {GpuTextureHandle oldHandle GpuTextureHandle newHandle} void {
    const GpuTextureBlock* replacement = &gpuTextures[newHandle];

    VkDescriptorImageInfo imageInfo = {
        .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        .imageView = replacement->textureImageView,
        .sampler = replacement->textureSampler,
    };

    VkWriteDescriptorSet write = {
        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
        .dstSet = *textureDescriptorSet_ptr(),
        .dstBinding = 0,
        .dstArrayElement = oldHandle,
        .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .descriptorCount = 1,
        .pImageInfo = &imageInfo,
    };

    vkUpdateDescriptorSets(device, 1, &write, 0, NULL);
}

# Queue a DEFERRED_FREE for texture gim. Nothing is destroyed here;
# the request is processed later by drainDeferredTextureOps.
$gpuc proc freeGpuTexture {GpuTextureHandle gim} void {
    enqueueDeferredTextureOp(DEFERRED_FREE, gim);
}

# Texture retirement and destruction. Freeing is two-phase: a texture
# is first marked as retiring, and its GPU resources are destroyed
# only after TEXTURE_RETIRE_GRACE_FRAMES further frame epochs. Must
# only be called on the GPU thread between frames when the GPU is idle.
$gpuc code {
    // Phase 1: mark texture gim as retiring and record the frame epoch
    // from which it may be destroyed. No-op for slot 0 (the
    // placeholder), for slots that are not alive, and for slots that
    // are already retiring.
    static void retireGpuTexture(GpuTextureHandle gim) {
        GpuTextureBlock* block = &gpuTextures[gim];
        if (gim == 0 || !block->alive || block->retiring) return;

        block->retiring = true;
        block->retireAfterFrame = textureFrameEpoch + TEXTURE_RETIRE_GRACE_FRAMES;
    }

    // Phase 2: destroy the sampler, image view and image of texture
    // gim and reset its block. No-op for slot 0 and for slots that are
    // not alive.
    static void destroyGpuTextureResources(GpuTextureHandle gim) {
        GpuTextureBlock* block = &gpuTextures[gim];
        if (gim == 0 || !block->alive) return;

        // Point this descriptor slot at the placeholder texture (slot 0)
        // so later frames don't reference a destroyed image.
        {
            VkDescriptorImageInfo imageInfo = {0};
            imageInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
            imageInfo.imageView = gpuTextures[0].textureImageView;
            imageInfo.sampler = gpuTextures[0].textureSampler;

            VkWriteDescriptorSet descriptorWrite = {0};
            descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
            descriptorWrite.dstSet = *textureDescriptorSet_ptr();
            descriptorWrite.dstBinding = 0;
            descriptorWrite.dstArrayElement = gim;
            descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
            descriptorWrite.descriptorCount = 1;
            descriptorWrite.pImageInfo = &imageInfo;

            vkUpdateDescriptorSets(device, 1, &descriptorWrite, 0, NULL);
        }

#ifdef TRACY_ENABLE
        TracyCFree(block->textureImageAllocation);
#endif
        vkDestroySampler(device, block->textureSampler, NULL);
        vkDestroyImageView(device, block->textureImageView, NULL);
        vmaDestroyImage(vmaGetAllocator(), block->textureImage, block->textureImageAllocation);

        free(block->description);
        block->description = NULL;
        block->textureImage = VK_NULL_HANDLE;
        block->textureImageAllocation = NULL;
        block->textureImageView = VK_NULL_HANDLE;
        block->textureSampler = VK_NULL_HANDLE;
        block->retiring = false;
        block->retireAfterFrame = 0;
        // Cleared last: allocateGpuTextureHandle may claim the slot as
        // soon as alive is false.
        block->alive = false;
    }

    // Destroy every retiring texture whose grace period has elapsed.
    // Slot 0 (the placeholder) is never scanned.
    static void destroyRetiredGpuTextures() {
        for (GpuTextureHandle gim = 1; gim < getMaxTextures(); gim++) {
            GpuTextureBlock* block = &gpuTextures[gim];
            if (block->alive && block->retiring &&
                block->retireAfterFrame <= textureFrameEpoch) {
                destroyGpuTextureResources(gim);
            }
        }
    }
}

# Process all queued texture operations, then destroy retired textures
# whose grace period has elapsed. Called on the GPU thread before
# recording work that may sample textures.
$gpuc proc drainDeferredTextureOps {} void {
    struct DeferredTextureEntry pending[DEFERRED_QUEUE_CAP];
    int pendingCount;

    // Snapshot and empty the shared queue under the lock; the entries
    // are processed afterwards without holding it.
    pthread_mutex_lock(&deferredQueueMutex);
    pendingCount = deferredQueueCount;
    memcpy(pending, deferredQueue, pendingCount * sizeof(struct DeferredTextureEntry));
    deferredQueueCount = 0;
    pthread_mutex_unlock(&deferredQueueMutex);

    for (const struct DeferredTextureEntry* entry = pending;
         entry < pending + pendingCount; entry++) {
        if (entry->op == DEFERRED_ADD) {
            writeTextureDescriptor(entry->handle);
        } else if (entry->op == DEFERRED_FREE) {
            retireGpuTexture(entry->handle);
        }
    }

    destroyRetiredGpuTextures();
}

# Advance the texture frame epoch by one and drain the deferred queue.
# Called once per GPU frame so retired textures age exactly once,
# even though descriptor work may be drained multiple times.
$gpuc proc beginTextureFrame {} void {
    // Advance first, so the drain below compares retirement deadlines
    // against the new epoch.
    textureFrameEpoch++;
    drainDeferredTextureOps();
}

# Upload a 128x128 opaque magenta texture, which must land in slot 0,
# and point every descriptor slot at it.
$gpuc proc initPlaceholderTexture {} void {
    // Set up a placeholder texture in slot 0 that can always be drawn
    // that we can swap in when textures get invalidated.
    Image debugIm = {
        .width = 128, .height = 128,
        .components = 4,
        .bytesPerRow = 128 * 4,
        .data = malloc(128 * 128 * 4)
    };
    if (debugIm.data == NULL) {
        fprintf(stderr, "gpu/textures: Failed to allocate placeholder image\n");
        exit(1);
    }
    for (int y = 0; y < debugIm.height; y++) {
        for (int x = 0; x < debugIm.width; x++) {
            int i = y * debugIm.bytesPerRow + x * debugIm.components;
            debugIm.data[i+0] = 255;
            debugIm.data[i+1] = 0;
            debugIm.data[i+2] = 255;
            debugIm.data[i+3] = 255;
        }
    }
    GpuTextureHandle han = copyImageToGpuTexture(debugIm);
    // copyImageToGpuTexture copies the pixels into its own staging
    // buffer before it returns, so the CPU-side copy can be released.
    free(debugIm.data);
    FOLK_ENSURE(han == 0);

    // Fill all descriptor slots with the placeholder texture, then
    // drain the queued DEFERRED_ADD for slot 0 (which is now redundant
    // but harmless).
    initializeDescriptorSet(han);
    drainDeferredTextureOps();
}

# Compile everything added to $gpuc above into a library.
set gpuTextureLib [$gpuc compile]

# Initialize the texture manager once, immediately after compiling.
$gpuTextureLib textureManagerInit

# Publish the compiled library as a claim.
Claim the GPU texture library is $gpuTextureLib

# For each wish to load an image: upload it as a texture and claim the
# resulting handle. The claim's destructor calls freeGpuTexture on
# that handle.
When /someone/ wishes the GPU loads image /im/ as texture {
    set gtex [$gpuTextureLib copyImageToGpuTexture $im]
    Claim the GPU has loaded image $im as texture $gtex \
        -destructor [list $gpuTextureLib freeGpuTexture $gtex]
}

}