Fix VmaBitScan functions for GCC and Clang. Fix debug margin for generic algorithm.

Hopefully helps with #231.
Code by @medranSolus
diff --git a/include/vk_mem_alloc.h b/include/vk_mem_alloc.h
index b4677d4..40ad1ad 100644
--- a/include/vk_mem_alloc.h
+++ b/include/vk_mem_alloc.h
@@ -3108,7 +3108,7 @@
         return static_cast<uint8_t>(pos);

     return UINT8_MAX;

 #elif defined __GNUC__ || defined __clang__

-    return static_cast<uint8_t>(__builtin_ffsl(mask)) - 1U;

+    return static_cast<uint8_t>(__builtin_ffs(mask)) - 1U;

 #else

     uint8_t pos = 0;

     uint32_t bit = 1;

@@ -3130,10 +3130,10 @@
         return static_cast<uint8_t>(pos);

 #elif defined __GNUC__ || defined __clang__

     if (mask)

-        return static_cast<uint8_t>(__builtin_clzll(mask));

+        return 63 - static_cast<uint8_t>(__builtin_clzll(mask));

 #else

     uint8_t pos = 63;

-    uint64_t bit = 1u << 63;

+    uint64_t bit = 1ULL << 63;

     do

     {

         if (mask & bit)

@@ -3152,10 +3152,10 @@
         return static_cast<uint8_t>(pos);

 #elif defined __GNUC__ || defined __clang__

     if (mask)

-        return static_cast<uint8_t>(__builtin_clzl(mask));

+        return 31 - static_cast<uint8_t>(__builtin_clz(mask));

 #else

     uint8_t pos = 31;

-    uint32_t bit = 1 << 31;

+    uint32_t bit = 1UL << 31;

     do

     {

         if (mask & bit)

@@ -6961,7 +6961,7 @@
     }

 

     // Start from offset equal to beginning of this suballocation.

-    VkDeviceSize offset = suballoc.offset;

+    VkDeviceSize offset = suballoc.offset + (suballocItem == m_Suballocations.cbegin() ? 0 : GetDebugMargin());

 

     // Apply debugMargin from the end of previous alloc.

     if (debugMargin > 0)

@@ -10341,6 +10341,7 @@
     uint8_t memClass = SizeToMemoryClass(block->size);

     uint16_t secondIndex = SizeToSecondIndex(block->size, memClass);

     uint32_t index = GetListIndex(memClass, secondIndex);

+    VMA_ASSERT(index < m_ListsCount);

     block->PrevFree() = VMA_NULL;

     block->NextFree() = m_FreeList[index];

     m_FreeList[index] = block;

diff --git a/src/Tests.cpp b/src/Tests.cpp
index 843fe1d..94a326e 100644
--- a/src/Tests.cpp
+++ b/src/Tests.cpp
@@ -6667,6 +6667,38 @@
     }
 }
 
+static void BasicTestTLSF() // Allocates and frees a few regions in a TLSF virtual block; contains no assertions.
+{
+    wprintf(L"Basic test TLSF\n");
+    // Create a virtual block of 50331648 bytes (48 MiB) with the TLSF algorithm flag set.
+    VmaVirtualBlock block;
+
+    VmaVirtualBlockCreateInfo blockInfo = {};
+    blockInfo.flags = VMA_VIRTUAL_BLOCK_CREATE_TLSF_ALGORITHM_BIT;
+    blockInfo.size = 50331648;
+    vmaCreateVirtualBlock(&blockInfo, &block); // NOTE(review): return value is not checked.
+    // One create-info is reused for every allocation below; only info.size changes between calls.
+    VmaVirtualAllocationCreateInfo info = {};
+    info.alignment = 2;
+
+    VmaVirtualAllocation allocation[3] = {};
+    // First two allocations: 576 and 648 bytes. The results of vmaVirtualAllocate are not checked.
+    info.size = 576;
+    vmaVirtualAllocate(block, &info, allocation + 0, nullptr); // last argument presumably an optional output - TODO confirm
+
+    info.size = 648;
+    vmaVirtualAllocate(block, &info, allocation + 1, nullptr);
+    // Free the first allocation, then request 720 bytes, which is more than the 576 bytes just released.
+    vmaVirtualFree(block, allocation[0]);
+
+    info.size = 720;
+    vmaVirtualAllocate(block, &info, allocation + 2, nullptr);
+    // Release the two remaining allocations, then destroy the block.
+    vmaVirtualFree(block, allocation[1]);
+    vmaVirtualFree(block, allocation[2]);
+    vmaDestroyVirtualBlock(block);
+}
+
 static void BasicTestBuddyAllocator()
 {
     wprintf(L"Basic test buddy allocator\n");
@@ -7050,6 +7082,7 @@
     ManuallyTestLinearAllocator();
     TestLinearAllocatorMultiBlock();
 
+    BasicTestTLSF();
     BasicTestBuddyAllocator();
     BasicTestAllocatePages();