diff --git a/src/allocator.cpp b/src/allocator.cpp
index 30a0698b5b7..6fc9a257900 100644
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@@ -488,6 +488,32 @@ VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, uint32_t memo
     return memory;
 }
 
+VkDeviceMemory VkAllocator::allocate_import_host_memory(size_t size, uint32_t memory_type_index, void* host_ptr)
+{
+    VkMemoryAllocateInfo memoryAllocateInfo;
+    memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    memoryAllocateInfo.pNext = 0;
+    memoryAllocateInfo.allocationSize = size;
+    memoryAllocateInfo.memoryTypeIndex = memory_type_index;
+
+    VkImportMemoryHostPointerInfoEXT importMemoryHostPointerInfo;
+    importMemoryHostPointerInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
+    importMemoryHostPointerInfo.pNext = 0;
+    importMemoryHostPointerInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
+    importMemoryHostPointerInfo.pHostPointer = host_ptr;
+    memoryAllocateInfo.pNext = &importMemoryHostPointerInfo;
+
+    VkDeviceMemory memory = 0;
+    VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
+    if (ret != VK_SUCCESS)
+    {
+        NCNN_LOGE("vkAllocateMemory failed %d", ret);
+        return 0;
+    }
+
+    return memory;
+}
+
 VkImage VkAllocator::create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage)
 {
     VkImageCreateInfo imageCreateInfo;
@@ -1743,6 +1769,490 @@ void VkWeightAllocator::fastFree(VkImageMemory* ptr)
     }
 }
 
+class VkHostAllocatorPrivate
+{
+public:
+    size_t block_size;
+    size_t buffer_offset_alignment;
+    size_t bind_memory_offset_alignment;
+    std::vector<size_t> buffer_block_free_spaces;
+    std::vector<VkBufferMemory*> buffer_blocks;
+    std::vector<size_t> image_memory_block_free_spaces;
+    std::vector<VkDeviceMemory> image_memory_blocks;
+    std::vector<void*> host_ptrs;
+};
+
+VkHostAllocator::VkHostAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
+    : VkAllocator(_vkdev), d(new VkHostAllocatorPrivate)
+{
+    d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
+    d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();
+
+    if (vkdev->info.type() == 1)
+    {
+        // on integrated gpu, there may be device local only memory too, eg. AMD APU
+        // assuming larger alignment always keeps us safe :)
+
+        // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
+        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
+        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
+    }
+
+    if (vkdev->info.support_VK_KHR_robustness2() || vkdev->info.support_VK_EXT_robustness2())
+    {
+        size_t robust_storage_buffer_access_size_alignment = vkdev->info.queryRobustness2Properties().robustStorageBufferAccessSizeAlignment;
+        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, robust_storage_buffer_access_size_alignment);
+    }
+
+    if (vkdev->info.support_VK_EXT_external_memory_host())
+    {
+        size_t min_imported_host_pointer_alignment = vkdev->info.queryExternalMemoryHostProperties().minImportedHostPointerAlignment;
+        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, min_imported_host_pointer_alignment);
+    }
+
+    d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
+}
+
+VkHostAllocator::~VkHostAllocator()
+{
+    clear();
+
+    delete d;
+}
+
+VkHostAllocator::VkHostAllocator(const VkHostAllocator&)
+    : VkAllocator(0), d(0)
+{
+}
+
+VkHostAllocator& VkHostAllocator::operator=(const VkHostAllocator&)
+{
+    return *this;
+}
+
+void VkHostAllocator::clear()
+{
+    // NCNN_LOGE("VkHostAllocator %lu", d->buffer_blocks.size());
+
+    d->buffer_block_free_spaces.clear();
+
+    for (size_t i = 0; i < d->buffer_blocks.size(); i++)
+    {
+        VkBufferMemory* ptr = d->buffer_blocks[i];
+
+        if (mappable)
+            vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
+
+        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
+        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
+
+        delete ptr;
+    }
+    d->buffer_blocks.clear();
+
+    d->image_memory_block_free_spaces.clear();
+
+    for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
+    {
+        VkDeviceMemory memory = d->image_memory_blocks[i];
+
+        vkFreeMemory(vkdev->vkdevice(), memory, 0);
+    }
+    d->image_memory_blocks.clear();
+
+    for (size_t i = 0; i < d->host_ptrs.size(); i++)
+    {
+        void* host_ptr = d->host_ptrs[i];
+
+        // NCNN_LOGE("host_ptr = %p free", host_ptr);
+
+        ncnn::fastFree(host_ptr);
+    }
+    d->host_ptrs.clear();
+}
+
+// fastMalloc() with alignment parameter and no malloc overread
+static void* fastMalloc_with_alignment(size_t size, size_t alignment)
+{
+#if _MSC_VER
+    return _aligned_malloc(size, alignment);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, alignment, size))
+        ptr = 0;
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(alignment, size);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + alignment);
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, alignment);
+    adata[-1] = udata;
+    return adata;
+#endif
+}
+
+VkBufferMemory* VkHostAllocator::fastMalloc(size_t size)
+{
+    // NCNN_LOGE("VkHostAllocator fastMalloc %lu", size);
+
+    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);
+
+    const int buffer_block_count = d->buffer_blocks.size();
+
+    // find first spare space in buffer_blocks
+    for (int i = 0; i < buffer_block_count; i++)
+    {
+        size_t free_size = d->buffer_block_free_spaces[i];
+        if (free_size >= aligned_size)
+        {
+            size_t block_offset = d->block_size - free_size;
+
+            // return sub buffer
+            VkBufferMemory* ptr = new VkBufferMemory;
+
+            ptr->buffer = d->buffer_blocks[i]->buffer;
+            ptr->offset = block_offset;
+            ptr->memory = d->buffer_blocks[i]->memory;
+            ptr->capacity = aligned_size;
+            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
+            ptr->access_flags = 0;
+            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+
+            d->buffer_block_free_spaces[i] -= aligned_size;
+
+            return ptr;
+        }
+    }
+
+    size_t new_block_size = std::max(d->block_size, aligned_size);
+
+    // create new block
+    VkBufferMemory* block = new VkBufferMemory;
+
+    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
+    block->offset = 0;
+
+    VkMemoryRequirements memoryRequirements;
+    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);
+
+    if (vkdev->info.support_VK_EXT_external_memory_host())
+    {
+        void* host_ptr = fastMalloc_with_alignment(new_block_size, d->buffer_offset_alignment);
+
+        // NCNN_LOGE("host_ptr = %p %lu", host_ptr, new_block_size);
+
+        if (host_ptr)
+        {
+            VkMemoryHostPointerPropertiesEXT pointerProperties;
+            pointerProperties.sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT;
+            pointerProperties.pNext = 0;
+            VkResult ret = vkdev->vkGetMemoryHostPointerPropertiesEXT(vkdev->vkdevice(), VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, host_ptr, &pointerProperties);
+            if (ret != VK_SUCCESS)
+            {
+                NCNN_LOGE("vkGetMemoryHostPointerPropertiesEXT failed %d", ret);
+                ncnn::fastFree(host_ptr);
+                vkDestroyBuffer(vkdev->vkdevice(), block->buffer, 0);
+                delete block;
+                return 0;
+            }
+
+            // setup memory type and alignment
+            if (buffer_memory_type_index == (uint32_t)-1)
+            {
+                buffer_memory_type_index = vkdev->find_memory_index(pointerProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+
+                mappable = vkdev->is_mappable(buffer_memory_type_index);
+                coherent = vkdev->is_coherent(buffer_memory_type_index);
+            }
+
+            block->memory = allocate_import_host_memory(memoryRequirements.size, buffer_memory_type_index, host_ptr);
+            if (!block->memory)
+            {
+                ncnn::fastFree(host_ptr);
+            }
+            else
+            {
+                d->host_ptrs.push_back(host_ptr);
+            }
+        }
+    }
+    else
+    {
+        // setup memory type and alignment
+        if (buffer_memory_type_index == (uint32_t)-1)
+        {
+            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+
+            mappable = vkdev->is_mappable(buffer_memory_type_index);
+            coherent = vkdev->is_coherent(buffer_memory_type_index);
+        }
+
+        block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
+    }
+    if (!block->memory)
+    {
+        vkDestroyBuffer(vkdev->vkdevice(), block->buffer, 0);
+        delete block;
+        return 0;
+    }
+
+    // ignore memoryRequirements.alignment as we always bind at zero offset
+    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);
+
+    // NCNN_LOGE("VkHostAllocator M %p", block->buffer);
+
+    block->mapped_ptr = 0;
+    if (mappable)
+    {
+        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
+    }
+
+    d->buffer_blocks.push_back(block);
+
+    d->buffer_block_free_spaces.push_back(new_block_size - aligned_size);
+
+    // return sub buffer
+    VkBufferMemory* ptr = new VkBufferMemory;
+
+    ptr->buffer = block->buffer;
+    ptr->offset = 0;
+    ptr->memory = block->memory;
+    ptr->capacity = aligned_size;
+    ptr->mapped_ptr = block->mapped_ptr;
+    ptr->access_flags = 0;
+    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+
+    return ptr;
+}
+
+void VkHostAllocator::fastFree(VkBufferMemory* ptr)
+{
+    // NCNN_LOGE("VkHostAllocator F %p", ptr->buffer);
+
+    delete ptr;
+}
+
+VkImageMemory* VkHostAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
+{
+    if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64)
+    {
+        NCNN_LOGE("elempack must be 1 4 8 16 32 64");
+        return 0;
+    }
+
+    // resolve format
+    VkFormat format = VK_FORMAT_UNDEFINED;
+
+    if (elemsize / elempack == 4)
+    {
+        // fp32
+        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
+        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
+        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
+        if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT;
+        if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT;
+        if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT;
+    }
+    if (elemsize / elempack == 2)
+    {
+        // fp16
+        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
+        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
+        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
+        if (elempack == 16) format = VK_FORMAT_R16G16B16A16_SFLOAT;
+        if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
+        if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
+    }
+    if (elemsize / elempack == 1)
+    {
+        // int8
+        if (elempack == 1) format = VK_FORMAT_R8_SINT;
+        if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 16) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 32) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 64) format = VK_FORMAT_R8G8B8A8_SINT;
+    }
+
+    // resolve image width height depth
+    int width = w;
+    int height = h;
+    int depth = c;
+
+    // large elempack spills on image w
+    if (elempack == 8) width *= 2;
+    if (elempack == 16) width *= 4;
+    if (elempack == 32) width *= 8;
+    if (elempack == 64) width *= 16;
+
+    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
+    {
+        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
+        return 0;
+    }
+
+    VkImageMemory* ptr = new VkImageMemory;
+
+    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
+
+    ptr->width = width;
+    ptr->height = height;
+    ptr->depth = depth;
+    ptr->format = format;
+
+    VkMemoryRequirements memoryRequirements;
+    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);
+
+    const size_t size = memoryRequirements.size;
+    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);
+
+    size_t aligned_size = alignSize(size, alignment);
+
+    const int image_memory_block_count = d->image_memory_blocks.size();
+
+    // find first spare space in buffer_blocks
+    for (int i = 0; i < image_memory_block_count; i++)
+    {
+        // we cannot use image_memory_block_free_spaces[i] directly for base offset alignment
+        size_t bind_base_offset = d->block_size - d->image_memory_block_free_spaces[i];
+        size_t bind_offset = alignSize(bind_base_offset, alignment);
+        if (d->image_memory_block_free_spaces[i] >= aligned_size + (bind_offset - bind_base_offset))
+        {
+            // bind at memory offset
+            ptr->memory = d->image_memory_blocks[i];
+            ptr->bind_offset = bind_offset;
+            ptr->bind_capacity = aligned_size;
+
+            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);
+
+            // do not allow host access to optimal tiling image
+            ptr->mapped_ptr = 0;
+
+            ptr->imageview = create_imageview(ptr->image, format);
+
+            ptr->access_flags = 0;
+            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+            ptr->command_refcount = 0;
+
+            if (bind_base_offset != bind_offset)
+            {
+                // NOTE there is small offset inside bind_base_offset and bind_offset
+                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
+                // so that memory management could be easier
+                aligned_size += (bind_offset - bind_base_offset);
+
+                ptr->bind_offset = bind_base_offset;
+                ptr->bind_capacity = aligned_size;
+            }
+
+            d->image_memory_block_free_spaces[i] -= aligned_size;
+
+            return ptr;
+        }
+    }
+
+    // create new block
+    size_t new_block_size = std::max(d->block_size, aligned_size);
+
+    if (vkdev->info.support_VK_EXT_external_memory_host())
+    {
+        void* host_ptr = fastMalloc_with_alignment(new_block_size, d->buffer_offset_alignment);
+
+        // NCNN_LOGE("host_ptr = %p %lu", host_ptr, new_block_size);
+
+        if (host_ptr)
+        {
+            VkMemoryHostPointerPropertiesEXT pointerProperties;
+            pointerProperties.sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT;
+            pointerProperties.pNext = 0;
+            VkResult ret = vkdev->vkGetMemoryHostPointerPropertiesEXT(vkdev->vkdevice(), VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, host_ptr, &pointerProperties);
+            if (ret != VK_SUCCESS)
+            {
+                NCNN_LOGE("vkGetMemoryHostPointerPropertiesEXT failed %d", ret);
+                ncnn::fastFree(host_ptr);
+                vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
+                delete ptr;
+                return 0;
+            }
+
+            // setup memory type and alignment
+            if (image_memory_type_index == (uint32_t)-1)
+            {
+                image_memory_type_index = vkdev->find_memory_index(pointerProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+
+                mappable = vkdev->is_mappable(image_memory_type_index);
+                coherent = vkdev->is_coherent(image_memory_type_index);
+            }
+
+            ptr->memory = allocate_import_host_memory(new_block_size, image_memory_type_index, host_ptr);
+            if (!ptr->memory)
+            {
+                ncnn::fastFree(host_ptr);
+            }
+            else
+            {
+                d->host_ptrs.push_back(host_ptr);
+            }
+        }
+    }
+    else
+    {
+        // setup memory type and alignment
+        if (image_memory_type_index == (uint32_t)-1)
+        {
+            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+
+            mappable = vkdev->is_mappable(image_memory_type_index);
+            coherent = vkdev->is_coherent(image_memory_type_index);
+        }
+
+        // bind at memory offset
+        ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
+    }
+    if (!ptr->memory)
+    {
+        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
+        delete ptr;
+        return 0;
+    }
+    ptr->bind_offset = 0;
+    ptr->bind_capacity = aligned_size;
+
+    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
+    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);
+
+    // do not allow host access to optimal tiling image
+    ptr->mapped_ptr = 0;
+
+    ptr->imageview = create_imageview(ptr->image, format);
+
+    ptr->access_flags = 0;
+    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+    ptr->command_refcount = 0;
+
+    d->image_memory_blocks.push_back(ptr->memory);
+    d->image_memory_block_free_spaces.push_back(new_block_size - aligned_size);
+
+    return ptr;
+}
+
+void VkHostAllocator::fastFree(VkImageMemory* ptr)
+{
+    // NCNN_LOGE("VkHostAllocator F %p", ptr->memory);
+
+    if (!ptr->command_refcount)
+    {
+        vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
+        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
+
+        delete ptr;
+    }
+}
+
 class VkStagingAllocatorPrivate
 {
 public:
@@ -2159,6 +2669,7 @@ VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*w*/, int
     ptr->image = image;
     ptr->memory = memory;
     ptr->imageview = imageview;
+    ptr->mapped_ptr = 0;
     ptr->access_flags = 0;
     ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
     ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
diff --git a/src/allocator.h b/src/allocator.h
index 7464e8641aa..a29304d2d4d 100644
--- a/src/allocator.h
+++ b/src/allocator.h
@@ -288,6 +288,7 @@ class NCNN_EXPORT VkAllocator
     VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
     VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
     VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+    VkDeviceMemory allocate_import_host_memory(size_t size, uint32_t memory_type_index, void* host_ptr);
 
     VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
     VkImageView create_imageview(VkImage image, VkFormat format);
@@ -342,6 +343,31 @@ class NCNN_EXPORT VkWeightAllocator : public VkAllocator
     VkWeightAllocatorPrivate* const d;
 };
 
+class VkHostAllocatorPrivate;
+class NCNN_EXPORT VkHostAllocator : public VkAllocator
+{
+public:
+    explicit VkHostAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
+    virtual ~VkHostAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkHostAllocator(const VkHostAllocator&);
+    VkHostAllocator& operator=(const VkHostAllocator&);
+
+private:
+    VkHostAllocatorPrivate* const d;
+};
+
 class VkStagingAllocatorPrivate;
 class NCNN_EXPORT VkStagingAllocator : public VkAllocator
 {
diff --git a/src/gpu.cpp b/src/gpu.cpp
index f9d722095ce..999ef798cf8 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -345,6 +345,7 @@ class GpuInfoPrivate
     int support_VK_KHR_zero_initialize_workgroup_memory;
     int support_VK_EXT_buffer_device_address;
     int support_VK_EXT_descriptor_indexing;
+    int support_VK_EXT_external_memory_host;
     int support_VK_EXT_memory_budget;
     int support_VK_EXT_memory_priority;
     int support_VK_EXT_queue_family_foreign;
@@ -390,6 +391,7 @@ class GpuInfoPrivate
     VkPhysicalDeviceSubgroupProperties querySubgroupProperties;
     VkPhysicalDeviceDriverPropertiesKHR queryDriverProperties;
     VkPhysicalDeviceSubgroupSizeControlPropertiesEXT querySubgroupSizeControlProperties;
+    VkPhysicalDeviceExternalMemoryHostPropertiesEXT queryExternalMemoryHostProperties;
     VkPhysicalDeviceCooperativeMatrix2PropertiesNV queryCooperativeMatrix2PropertiesNV;
     VkPhysicalDeviceCooperativeVectorPropertiesNV queryCooperativeVectorPropertiesNV;
 
@@ -660,6 +662,7 @@ int GpuInfoPrivate::query_extensions()
     support_VK_KHR_zero_initialize_workgroup_memory = 0;
     support_VK_EXT_buffer_device_address = 0;
     support_VK_EXT_descriptor_indexing = 0;
+    support_VK_EXT_external_memory_host = 0;
     support_VK_EXT_memory_budget = 0;
     support_VK_EXT_memory_priority = 0;
     support_VK_EXT_queue_family_foreign = 0;
@@ -746,6 +749,8 @@ int GpuInfoPrivate::query_extensions()
             support_VK_EXT_buffer_device_address = exp.specVersion;
         else if (strcmp(exp.extensionName, "VK_EXT_descriptor_indexing") == 0)
             support_VK_EXT_descriptor_indexing = exp.specVersion;
+        else if (strcmp(exp.extensionName, "VK_EXT_external_memory_host") == 0)
+            support_VK_EXT_external_memory_host = exp.specVersion;
         else if (strcmp(exp.extensionName, "VK_EXT_memory_budget") == 0)
             support_VK_EXT_memory_budget = exp.specVersion;
         else if (strcmp(exp.extensionName, "VK_EXT_memory_priority") == 0)
@@ -1140,6 +1145,16 @@ void GpuInfoPrivate::query_extension_properties()
         queryExtensionProperties = &querySubgroupSizeControlProperties;
     }
 
+    // query external memory host
+    memset(&queryExternalMemoryHostProperties, 0, sizeof(queryExternalMemoryHostProperties));
+    queryExternalMemoryHostProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT;
+    queryExternalMemoryHostProperties.pNext = 0;
+    if (support_VK_EXT_external_memory_host)
+    {
+        queryExternalMemoryHostProperties.pNext = queryExtensionProperties;
+        queryExtensionProperties = &queryExternalMemoryHostProperties;
+    }
+
     // query nv cooperative matrix2
     memset(&queryCooperativeMatrix2PropertiesNV, 0, sizeof(queryCooperativeMatrix2PropertiesNV));
     queryCooperativeMatrix2PropertiesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_PROPERTIES_NV;
@@ -1920,6 +1935,11 @@ int GpuInfo::support_VK_EXT_descriptor_indexing() const
     return d->support_VK_EXT_descriptor_indexing;
 }
 
+int GpuInfo::support_VK_EXT_external_memory_host() const
+{
+    return d->support_VK_EXT_external_memory_host;
+}
+
 int GpuInfo::support_VK_EXT_memory_budget() const
 {
     return d->support_VK_EXT_memory_budget;
@@ -2127,6 +2147,11 @@ const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& GpuInfo::querySubgroupSi
     return d->querySubgroupSizeControlProperties;
 }
 
+const VkPhysicalDeviceExternalMemoryHostPropertiesEXT& GpuInfo::queryExternalMemoryHostProperties() const
+{
+    return d->queryExternalMemoryHostProperties;
+}
+
 const std::vector<VkCooperativeMatrixPropertiesKHR>& GpuInfo::queryCooperativeMatrixSubProperties() const
 {
     return d->queryCooperativeMatrixSubProperties;
@@ -3500,6 +3525,8 @@ VulkanDevice::VulkanDevice(int device_index)
         enabledExtensions.push_back("VK_EXT_buffer_device_address");
     if (info.support_VK_EXT_descriptor_indexing())
         enabledExtensions.push_back("VK_EXT_descriptor_indexing");
+    if (info.support_VK_EXT_external_memory_host())
+        enabledExtensions.push_back("VK_EXT_external_memory_host");
     if (info.support_VK_EXT_memory_budget())
         enabledExtensions.push_back("VK_EXT_memory_budget");
     if (info.support_VK_EXT_memory_priority())
@@ -4545,6 +4572,11 @@ int VulkanDevice::init_device_extension()
         vkGetBufferDeviceAddressEXT = (PFN_vkGetBufferDeviceAddressEXT)vkGetDeviceProcAddr(d->device, "vkGetBufferDeviceAddressEXT");
     }
 
+    if (info.support_VK_EXT_external_memory_host())
+    {
+        vkGetMemoryHostPointerPropertiesEXT = (PFN_vkGetMemoryHostPointerPropertiesEXT)vkGetDeviceProcAddr(d->device, "vkGetMemoryHostPointerPropertiesEXT");
+    }
+
 #if __ANDROID_API__ >= 26
     if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
     {
diff --git a/src/gpu.h b/src/gpu.h
index 743cb991fdb..b0e87adbf21 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -338,6 +338,7 @@ class NCNN_EXPORT GpuInfo
     int support_VK_KHR_zero_initialize_workgroup_memory() const;
     int support_VK_EXT_buffer_device_address() const;
     int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_external_memory_host() const;
     int support_VK_EXT_memory_budget() const;
     int support_VK_EXT_memory_priority() const;
     int support_VK_EXT_queue_family_foreign() const;
@@ -385,6 +386,7 @@ class NCNN_EXPORT GpuInfo
     const VkPhysicalDeviceShaderIntegerDotProductProperties& queryShaderIntegerDotProductProperties() const;
     const VkPhysicalDeviceSubgroupProperties& querySubgroupProperties() const;
     const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& querySubgroupSizeControlProperties() const;
+    const VkPhysicalDeviceExternalMemoryHostPropertiesEXT& queryExternalMemoryHostProperties() const;
 
     // extension sub properties
     const std::vector<VkCooperativeMatrixPropertiesKHR>& queryCooperativeMatrixSubProperties() const;
@@ -511,6 +513,9 @@ class NCNN_EXPORT VulkanDevice
     // VK_EXT_buffer_device_address
     PFN_vkGetBufferDeviceAddressEXT vkGetBufferDeviceAddressEXT;
 
+    // VK_EXT_external_memory_host
+    PFN_vkGetMemoryHostPointerPropertiesEXT vkGetMemoryHostPointerPropertiesEXT;
+
 #if __ANDROID_API__ >= 26
     // VK_ANDROID_external_memory_android_hardware_buffer
     PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
diff --git a/src/net.cpp b/src/net.cpp
index 0e1800498be..a9400ddd209 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -130,7 +130,14 @@ int NetPrivate::upload_model()
     // create gpu device allocator if null
     if (!weight_vkallocator)
     {
-        weight_vkallocator = new VkWeightAllocator(vkdev);
+        if (opt.use_weights_in_host_memory)
+        {
+            weight_vkallocator = new VkHostAllocator(vkdev);
+        }
+        else
+        {
+            weight_vkallocator = new VkWeightAllocator(vkdev);
+        }
     }
     if (!weight_staging_vkallocator)
     {
diff --git a/src/option.cpp b/src/option.cpp
index 8dbc210c487..6260c709424 100644
--- a/src/option.cpp
+++ b/src/option.cpp
@@ -49,7 +49,7 @@ Option::Option()
     use_tensor_storage = false;
 
     use_reserved_1p = false;
-    use_reserved_2 = false;
+    use_weights_in_host_memory = false;
 
     flush_denormals = 3;
 
diff --git a/src/option.h b/src/option.h
index 8cfcdda4d74..a90605cf068 100644
--- a/src/option.h
+++ b/src/option.h
@@ -110,7 +110,7 @@ class NCNN_EXPORT Option
     bool use_tensor_storage;
 
     bool use_reserved_1p;
-    bool use_reserved_2;
+    bool use_weights_in_host_memory;
 
     // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
     // default value is 3
diff --git a/src/vulkan_header_fix.h b/src/vulkan_header_fix.h
index 147bf7382f3..ab764673b80 100644
--- a/src/vulkan_header_fix.h
+++ b/src/vulkan_header_fix.h
@@ -1671,4 +1671,31 @@ typedef struct VkPhysicalDeviceVulkanMemoryModelFeatures
 typedef VkPhysicalDeviceVulkanMemoryModelFeatures VkPhysicalDeviceVulkanMemoryModelFeaturesKHR;
 #endif // VK_KHR_vulkan_memory_model
 
+#ifndef VK_EXT_external_memory_host
+#define VK_EXT_external_memory_host 1
+#define VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT (VkStructureType)1000178000
+#define VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT (VkStructureType)1000178001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT (VkStructureType)1000178002
+typedef struct VkImportMemoryHostPointerInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkExternalMemoryHandleTypeFlagBits handleType;
+    void* pHostPointer;
+} VkImportMemoryHostPointerInfoEXT;
+typedef struct VkMemoryHostPointerPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t memoryTypeBits;
+} VkMemoryHostPointerPropertiesEXT;
+typedef struct VkPhysicalDeviceExternalMemoryHostPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize minImportedHostPointerAlignment;
+} VkPhysicalDeviceExternalMemoryHostPropertiesEXT;
+typedef VkResult(VKAPI_PTR* PFN_vkGetMemoryHostPointerPropertiesEXT)(VkDevice device, VkExternalMemoryHandleTypeFlagBits handleType, const void* pHostPointer, VkMemoryHostPointerPropertiesEXT* pMemoryHostPointerProperties);
+#endif // VK_EXT_external_memory_host
+
 #endif // NCNN_VULKAN_HEADER_FIX_H