From 707ba8c4726a57c3486420c858f9b9cddd417c03 Mon Sep 17 00:00:00 2001 From: Kraionix Date: Fri, 21 Feb 2025 21:34:27 +0700 Subject: [PATCH 1/2] Merged feature/simd-memchr as a single commit. --- imgui.cpp | 101 +++++++++++++++++++++++++++++++++++++++++++++++ imgui_internal.h | 35 +++++++++++++--- 2 files changed, 130 insertions(+), 6 deletions(-) diff --git a/imgui.cpp b/imgui.cpp index cac29a932747..8c15f68067b4 100644 --- a/imgui.cpp +++ b/imgui.cpp @@ -1960,6 +1960,107 @@ ImVec2 ImTriangleClosestPoint(const ImVec2& a, const ImVec2& b, const ImVec2& c, // [SECTION] MISC HELPERS/UTILITIES (String, Format, Hash functions) //----------------------------------------------------------------------------- +#if defined IMGUI_ENABLE_AVX2_IMMEMCHR +const void* ImMemchr(const void* buf, int val, size_t count) +{ + const size_t SIMD_LENGTH = 32; + const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1; + + const unsigned char* ptr = (const unsigned char*)buf; + const unsigned char* end = ptr + count; + const unsigned char* align_end = end - SIMD_LENGTH; + const unsigned char ch = (const unsigned char)val; + + if (ptr <= align_end) + { + const __m256i target = _mm256_set1_epi8(ch); + + if ((uintptr_t)ptr & SIMD_LENGTH_MASK) + { + __m256i chunk = _mm256_lddqu_si256((const __m256i*)ptr); + int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target)); + + if (mask) + return (const void*)(ptr + _tzcnt_u32(mask)); + + ptr = (const unsigned char*)_andn_u64(SIMD_LENGTH_MASK, (uintptr_t)ptr + SIMD_LENGTH_MASK); + } + + for (; ptr <= align_end; ptr += SIMD_LENGTH) + { + __m256i chunk = _mm256_load_si256((const __m256i*)ptr); + int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target)); + + if (mask) + return (const void*)(ptr + _tzcnt_u32(mask)); + + if (ptr <= end - 1024) + _mm_prefetch((const char*)(ptr + 1024), _MM_HINT_T0); + } + } + + for (; ptr < end; ptr++) + { + if (*ptr == ch) + return (const void*)(ptr); + } + + return nullptr; +} +#elif defined IMGUI_ENABLE_SSE_IMMEMCHR +const void* ImMemchr(const void* buf, int val, size_t count) +{ + const size_t SIMD_LENGTH = 16; + const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1; + + const unsigned char* ptr = (const unsigned char*)buf; + const unsigned char* end = ptr + count; + const unsigned char* align_end = end - SIMD_LENGTH; + const unsigned char ch = (const unsigned char)val; + + if (ptr <= align_end) + { + const __m128i target = _mm_set1_epi8(ch); + + if ((uintptr_t)ptr & SIMD_LENGTH_MASK) + { + __m128i chunk = _mm_lddqu_si128((const __m128i*)ptr); + int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target)); + + if (mask) + return (const void*)(ptr + _tzcnt_u32(mask)); + + ptr = (const unsigned char*)(((uintptr_t)ptr + SIMD_LENGTH_MASK) & ~SIMD_LENGTH_MASK); + } + + for (; ptr <= align_end; ptr += SIMD_LENGTH) + { + __m128i chunk = _mm_load_si128((const __m128i*)ptr); + int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target)); + + if (mask) + return (const void*)(ptr + _tzcnt_u32(mask)); + + if (ptr <= end - 1024) + _mm_prefetch((const char*)(ptr + 1024), _MM_HINT_T0); + } + } + + for (; ptr < end; ptr++) + { + if (*ptr == ch) + return (const void*)(ptr); + } + + return nullptr; +} +#else +const void* ImMemchr(const void* buf, int val, size_t count) +{ + return memchr(buf, val, count); +} +#endif + // Consider using _stricmp/_strnicmp under Windows or strcasecmp/strncasecmp. We don't actually use either ImStricmp/ImStrnicmp in the codebase any more. int ImStricmp(const char* str1, const char* str2) { diff --git a/imgui_internal.h b/imgui_internal.h index b7395fcae087..cb36aa2b1e3d 100644 --- a/imgui_internal.h +++ b/imgui_internal.h @@ -58,20 +58,43 @@ Index of this file: #include // sqrtf, fabsf, fmodf, powf, floorf, ceilf, cosf, sinf #include // INT_MIN, INT_MAX -// Enable SSE intrinsics if available -#if (defined __SSE__ || defined __x86_64__ || defined _M_X64 || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))) && !defined(IMGUI_DISABLE_SSE) +// Include compiler-specific intrinsics header +#if !defined(IMGUI_DISABLE_SIMD) +#if defined(_MSC_VER) +#include +#elif defined(__GNUC__) || defined(__clang__) +#include +#endif +#endif + +// Enable SIMD x86-64 intrinsics if available +#if (defined __x86_64__ || defined _M_X64) && !defined(IMGUI_DISABLE_SIMD) +#if (defined __SSE__ || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))) && !defined(IMGUI_DISABLE_SSE) #define IMGUI_ENABLE_SSE -#include -#if (defined __AVX__ || defined __SSE4_2__) +#endif +#if defined (__SSE4_2__) && !defined(IMGUI_DISABLE_SSE4_2) #define IMGUI_ENABLE_SSE4_2 -#include #endif +#if (defined __AVX__) && !defined(IMGUI_DISABLE_AVX) +#define IMGUI_ENABLE_AVX +#endif +#if (defined __AVX2__) && !defined(IMGUI_DISABLE_AVX2) +#define IMGUI_ENABLE_AVX2 #endif +#endif + // Emscripten has partial SSE 4.2 support where _mm_crc32_u32 is not available. See https://emscripten.org/docs/porting/simd.html#id11 and #8213 #if defined(IMGUI_ENABLE_SSE4_2) && !defined(IMGUI_USE_LEGACY_CRC32_ADLER) && !defined(__EMSCRIPTEN__) #define IMGUI_ENABLE_SSE4_2_CRC #endif +// Only AVX2 supports integer and byte instructions for 256-bit registers. Implementation this on AVX1 is not possible. +#if defined(IMGUI_ENABLE_AVX2) +#define IMGUI_ENABLE_AVX2_IMMEMCHR +#elif defined(IMGUI_ENABLE_AVX) || defined(IMGUI_ENABLE_SSE) +#define IMGUI_ENABLE_SSE_IMMEMCHR +#endif + // Visual Studio warnings #ifdef _MSC_VER #pragma warning (push) @@ -373,7 +396,7 @@ static inline int ImUpperPowerOfTwo(int v) { v--; v |= v >> 1; v |= // Helpers: String #define ImStrlen strlen -#define ImMemchr memchr +IMGUI_API const void* ImMemchr(const void* buf, int val, size_t count); // Find first occurrence of 'val' in buffer given length. IMGUI_API int ImStricmp(const char* str1, const char* str2); // Case insensitive compare. IMGUI_API int ImStrnicmp(const char* str1, const char* str2, size_t count); // Case insensitive compare to a certain count. IMGUI_API void ImStrncpy(char* dst, const char* src, size_t count); // Copy to a certain count and always zero terminate (strncpy doesn't). From c67c2026fdf282ce60c254bbf4e7fad988924871 Mon Sep 17 00:00:00 2001 From: Kraionix Date: Sat, 1 Mar 2025 14:47:26 +0700 Subject: [PATCH 2/2] Merge feature/simd-strlen into master as a single commit --- imgui.cpp | 146 +++++++++++++++++++++++++++++++++++++++++++++++ imgui_internal.h | 4 +- 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/imgui.cpp b/imgui.cpp index 8c15f68067b4..d5aed80f5178 100644 --- a/imgui.cpp +++ b/imgui.cpp @@ -1960,6 +1960,152 @@ ImVec2 ImTriangleClosestPoint(const ImVec2& a, const ImVec2& b, const ImVec2& c, // [SECTION] MISC HELPERS/UTILITIES (String, Format, Hash functions) //----------------------------------------------------------------------------- +#if defined IMGUI_ENABLE_AVX2_IMSTRLEN +size_t ImStrlen(const char* str) +{ + const size_t SIMD_LENGTH = 32; + const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1; + + const unsigned char* begin = (unsigned char*)str; + const unsigned char* ptr = begin; + + // first page + { + const size_t PAGE_LENGTH = 4096; + const size_t PAGE_LENGTH_MASK = PAGE_LENGTH - 1; + + const unsigned char* page_end = (const unsigned char*)_andn_u64(PAGE_LENGTH_MASK, (uintptr_t)ptr + PAGE_LENGTH_MASK); + const unsigned char* align_page_end = (const unsigned char*)(page_end - SIMD_LENGTH); + + // if ptr is far the end of page + if (ptr <= align_page_end) + { + __m256i target = _mm256_setzero_si256(); + + // if ptr not aligned, align ptr to SIMD_LENGTH + if ((uintptr_t)ptr & SIMD_LENGTH_MASK) + { + __m256i chunk = _mm256_lddqu_si256((const __m256i*)ptr); + int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target)); + + if (mask) + return (uintptr_t)(ptr - begin + _tzcnt_u32(mask)); + + ptr = (const unsigned char*)_andn_u64(SIMD_LENGTH_MASK, (uintptr_t)ptr + SIMD_LENGTH_MASK); + } + + // main loop of first page + for (; ptr <= align_page_end; ptr += SIMD_LENGTH) + { + __m256i chunk = _mm256_load_si256((const __m256i*)ptr); + int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target)); + + if (mask) + return (uintptr_t)(ptr - begin + _tzcnt_u32(mask)); + + _mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0); + } + } + + // if ptr is near the end of page + for (; ptr < page_end; ptr++) + { + if (!(*ptr)) + return (uintptr_t)(ptr - begin); + } + } + + __m256i target = _mm256_setzero_si256(); + + // main loop + for (; ; ptr += SIMD_LENGTH) + { + __m256i chunk = _mm256_load_si256((const __m256i*)ptr); + int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target)); + + if (mask) + return (uintptr_t)(ptr - begin + _tzcnt_u32(mask)); + + _mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0); + } +} +#elif defined IMGUI_ENABLE_SSE_IMSTRLEN +size_t ImStrlen(const char* str) +{ + const size_t SIMD_LENGTH = 16; + const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1; + + const unsigned char* begin = (unsigned char*)str; + const unsigned char* ptr = begin; + const unsigned char ch = '\0'; + + // first page + { + const size_t PAGE_LENGTH = 4096; + const size_t PAGE_LENGTH_MASK = PAGE_LENGTH - 1; + + const unsigned char* page_end = (const unsigned char*)(((uintptr_t)ptr + PAGE_LENGTH_MASK) & ~PAGE_LENGTH_MASK); + const unsigned char* align_page_end = (const unsigned char*)(page_end - SIMD_LENGTH); + + // if ptr is far the end of page + if (ptr <= align_page_end) + { + __m128i target = _mm_set1_epi8(ch); + + // if ptr not aligned, align ptr to SIMD_LENGTH + if ((uintptr_t)ptr & SIMD_LENGTH_MASK) + { + __m128i chunk = _mm_lddqu_si128((const __m128i*)ptr); + int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target)); + + if (mask) + return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin); + + ptr = (const unsigned char*)(((uintptr_t)ptr + SIMD_LENGTH_MASK) & ~SIMD_LENGTH_MASK); + } + + // main loop of first page + for (; ptr <= align_page_end; ptr += SIMD_LENGTH) + { + __m128i chunk = _mm_load_si128((const __m128i*)ptr); + int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target)); + + if (mask) + return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin); + + _mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0); + } + } + + // if ptr is near the end of page + for (; ptr < page_end; ptr++) + { + if (*ptr == ch) + return (uintptr_t)(ptr - begin); + } + } + + __m128i target = _mm_set1_epi8(ch); + + // main loop + for (; ; ptr += SIMD_LENGTH) + { + __m128i chunk = _mm_load_si128((const __m128i*)ptr); + int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target)); + + if (mask) + return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin); + + _mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0); + } +} +#else +size_t ImStrlen(const char* str) +{ + return strlen(str); +} +#endif + #if defined IMGUI_ENABLE_AVX2_IMMEMCHR const void* ImMemchr(const void* buf, int val, size_t count) { diff --git a/imgui_internal.h b/imgui_internal.h index cb36aa2b1e3d..079729c75f42 100644 --- a/imgui_internal.h +++ b/imgui_internal.h @@ -90,8 +90,10 @@ Index of this file: // Only AVX2 supports integer and byte instructions for 256-bit registers. Implementation this on AVX1 is not possible. #if defined(IMGUI_ENABLE_AVX2) +#define IMGUI_ENABLE_AVX2_IMSTRLEN #define IMGUI_ENABLE_AVX2_IMMEMCHR #elif defined(IMGUI_ENABLE_AVX) || defined(IMGUI_ENABLE_SSE) +#define IMGUI_ENABLE_SSE_IMSTRLEN #define IMGUI_ENABLE_SSE_IMMEMCHR #endif @@ -395,7 +397,7 @@ static inline bool ImIsPowerOfTwo(ImU64 v) { return v != 0 && (v & static inline int ImUpperPowerOfTwo(int v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; } // Helpers: String -#define ImStrlen strlen +IMGUI_API size_t ImStrlen(const char* str); // Compute the length of a null-terminated string. IMGUI_API const void* ImMemchr(const void* buf, int val, size_t count); // Find first occurrence of 'val' in buffer given length. IMGUI_API int ImStricmp(const char* str1, const char* str2); // Case insensitive compare. IMGUI_API int ImStrnicmp(const char* str1, const char* str2, size_t count); // Case insensitive compare to a certain count.