Skip to content

55% strlen and memchr optimization with SIMD on x86-64 | Macros config SIMD #8421

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
247 changes: 247 additions & 0 deletions imgui.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1960,6 +1960,253 @@ ImVec2 ImTriangleClosestPoint(const ImVec2& a, const ImVec2& b, const ImVec2& c,
// [SECTION] MISC HELPERS/UTILITIES (String, Format, Hash functions)
//-----------------------------------------------------------------------------

#if defined IMGUI_ENABLE_AVX2_IMSTRLEN
size_t ImStrlen(const char* str)
{
const size_t SIMD_LENGTH = 32;
const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;

const unsigned char* begin = (unsigned char*)str;
const unsigned char* ptr = begin;

// first page
{
const size_t PAGE_LENGTH = 4096;
const size_t PAGE_LENGTH_MASK = PAGE_LENGTH - 1;

const unsigned char* page_end = (const unsigned char*)_andn_u64(PAGE_LENGTH_MASK, (uintptr_t)ptr + PAGE_LENGTH_MASK);
const unsigned char* align_page_end = (const unsigned char*)(page_end - SIMD_LENGTH);

// if ptr is far the end of page
if (ptr <= align_page_end)
{
__m256i target = _mm256_setzero_si256();

// if ptr not aligned, align ptr to SIMD_LENGTH
if ((uintptr_t)ptr & SIMD_LENGTH_MASK)
{
__m256i chunk = _mm256_lddqu_si256((const __m256i*)ptr);
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));

if (mask)
return (uintptr_t)(ptr - begin + _tzcnt_u32(mask));

ptr = (const unsigned char*)_andn_u64(SIMD_LENGTH_MASK, (uintptr_t)ptr + SIMD_LENGTH_MASK);
}

// main loop of first page
for (; ptr <= align_page_end; ptr += SIMD_LENGTH)
{
__m256i chunk = _mm256_load_si256((const __m256i*)ptr);
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));

if (mask)
return (uintptr_t)(ptr - begin + _tzcnt_u32(mask));

_mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0);
}
}

// if ptr is near the end of page
for (; ptr < page_end; ptr++)
{
if (!(*ptr))
return (uintptr_t)(ptr - begin);
}
}

__m256i target = _mm256_setzero_si256();

// main loop
for (; ; ptr += SIMD_LENGTH)
{
__m256i chunk = _mm256_load_si256((const __m256i*)ptr);
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));

if (mask)
return (uintptr_t)(ptr - begin + _tzcnt_u32(mask));

_mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0);
}
}
#elif defined IMGUI_ENABLE_SSE_IMSTRLEN
size_t ImStrlen(const char* str)
{
const size_t SIMD_LENGTH = 16;
const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;

const unsigned char* begin = (unsigned char*)str;
const unsigned char* ptr = begin;
const unsigned char ch = '\0';

// first page
{
const size_t PAGE_LENGTH = 4096;
const size_t PAGE_LENGTH_MASK = PAGE_LENGTH - 1;

const unsigned char* page_end = (const unsigned char*)(((uintptr_t)ptr + PAGE_LENGTH_MASK) & ~PAGE_LENGTH_MASK);
const unsigned char* align_page_end = (const unsigned char*)(page_end - SIMD_LENGTH);

// if ptr is far the end of page
if (ptr <= align_page_end)
{
__m128i target = _mm_set1_epi8(ch);

// if ptr not aligned, align ptr to SIMD_LENGTH
if ((uintptr_t)ptr & SIMD_LENGTH_MASK)
{
__m128i chunk = _mm_lddqu_si128((const __m128i*)ptr);
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));

if (mask)
return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin);

ptr = (const unsigned char*)(((uintptr_t)ptr + SIMD_LENGTH_MASK) & ~SIMD_LENGTH_MASK);
}

// main loop of first page
for (; ptr <= align_page_end; ptr += SIMD_LENGTH)
{
__m128i chunk = _mm_load_si128((const __m128i*)ptr);
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));

if (mask)
return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin);

_mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0);
}
}

// if ptr is near the end of page
for (; ptr < page_end; ptr++)
{
if (*ptr == ch)
return (uintptr_t)(ptr - begin);
}
}

__m128i target = _mm_set1_epi8(ch);

// main loop
for (; ; ptr += SIMD_LENGTH)
{
__m128i chunk = _mm_load_si128((const __m128i*)ptr);
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));

if (mask)
return (uintptr_t)(ptr + _tzcnt_u32(mask) - begin);

_mm_prefetch((const char*)ptr + 1024, _MM_HINT_T0);
}
}
#else
size_t ImStrlen(const char* str)
{
return strlen(str);
}
#endif

#if defined IMGUI_ENABLE_AVX2_IMMEMCHR
const void* ImMemchr(const void* buf, int val, size_t count)
{
const size_t SIMD_LENGTH = 32;
const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;

const unsigned char* ptr = (const unsigned char*)buf;
const unsigned char* end = ptr + count;
const unsigned char* align_end = end - SIMD_LENGTH;
const unsigned char ch = (const unsigned char)val;

if (ptr <= align_end)
{
const __m256i target = _mm256_set1_epi8(ch);

if ((uintptr_t)ptr & SIMD_LENGTH_MASK)
{
__m256i chunk = _mm256_lddqu_si256((const __m256i*)ptr);
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));

if (mask)
return (const void*)(ptr + _tzcnt_u32(mask));

ptr = (const unsigned char*)_andn_u64(SIMD_LENGTH_MASK, (uintptr_t)ptr + SIMD_LENGTH_MASK);
}

for (; ptr <= align_end; ptr += SIMD_LENGTH)
{
__m256i chunk = _mm256_load_si256((const __m256i*)ptr);
int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));

if (mask)
return (const void*)(ptr + _tzcnt_u32(mask));

if (ptr <= end - 1024)
_mm_prefetch((const char*)(ptr + 1024), _MM_HINT_T0);
}
}

for (; ptr < end; ptr++)
{
if (*ptr == ch)
return (const void*)(ptr);
}

return nullptr;
}
#elif defined IMGUI_ENABLE_SSE_IMMEMCHR
const void* ImMemchr(const void* buf, int val, size_t count)
{
const size_t SIMD_LENGTH = 16;
const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;

const unsigned char* ptr = (const unsigned char*)buf;
const unsigned char* end = ptr + count;
const unsigned char* align_end = end - SIMD_LENGTH;
const unsigned char ch = (const unsigned char)val;

if (ptr <= align_end)
{
const __m128i target = _mm_set1_epi8(ch);

if ((uintptr_t)ptr & SIMD_LENGTH_MASK)
{
__m128i chunk = _mm_lddqu_si128((const __m128i*)ptr);
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));

if (mask)
return (const void*)(ptr + _tzcnt_u32(mask));

ptr = (const unsigned char*)(((uintptr_t)ptr + SIMD_LENGTH_MASK) & ~SIMD_LENGTH_MASK);
}

for (; ptr <= align_end; ptr += SIMD_LENGTH)
{
__m128i chunk = _mm_load_si128((const __m128i*)ptr);
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));

if (mask)
return (const void*)(ptr + _tzcnt_u32(mask));

if (ptr <= end - 1024)
_mm_prefetch((const char*)(ptr + 1024), _MM_HINT_T0);
}
}

for (; ptr < end; ptr++)
{
if (*ptr == ch)
return (const void*)(ptr);
}

return nullptr;
}
#else
const void* ImMemchr(const void* buf, int val, size_t count)
{
return memchr(buf, val, count);
}
#endif

// Consider using _stricmp/_strnicmp under Windows or strcasecmp/strncasecmp. We don't actually use either ImStricmp/ImStrnicmp in the codebase any more.
int ImStricmp(const char* str1, const char* str2)
{
Expand Down
39 changes: 32 additions & 7 deletions imgui_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,20 +58,45 @@ Index of this file:
#include <math.h> // sqrtf, fabsf, fmodf, powf, floorf, ceilf, cosf, sinf
#include <limits.h> // INT_MIN, INT_MAX

// Enable SSE intrinsics if available
#if (defined __SSE__ || defined __x86_64__ || defined _M_X64 || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))) && !defined(IMGUI_DISABLE_SSE)
// Include compiler-specific intrinsics header
#if !defined(IMGUI_DISABLE_SIMD)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) || defined(__clang__)
#include <x86intrin.h>
#endif
#endif

// Enable SIMD x86-64 intrinsics if available
#if (defined __x86_64__ || defined _M_X64) && !defined(IMGUI_DISABLE_SIMD)
#if (defined __SSE__ || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))) && !defined(IMGUI_DISABLE_SSE)
#define IMGUI_ENABLE_SSE
#include <immintrin.h>
#if (defined __AVX__ || defined __SSE4_2__)
#endif
#if defined (__SSE4_2__) && !defined(IMGUI_DISABLE_SSE4_2)
#define IMGUI_ENABLE_SSE4_2
#include <nmmintrin.h>
#endif
#if (defined __AVX__) && !defined(IMGUI_DISABLE_AVX)
#define IMGUI_ENABLE_AVX
#endif
#if (defined __AVX2__) && !defined(IMGUI_DISABLE_AVX2)
#define IMGUI_ENABLE_AVX2
#endif
#endif

// Emscripten has partial SSE 4.2 support where _mm_crc32_u32 is not available. See https://emscripten.org/docs/porting/simd.html#id11 and #8213
#if defined(IMGUI_ENABLE_SSE4_2) && !defined(IMGUI_USE_LEGACY_CRC32_ADLER) && !defined(__EMSCRIPTEN__)
#define IMGUI_ENABLE_SSE4_2_CRC
#endif

// Only AVX2 supports integer and byte instructions for 256-bit registers. Implementation this on AVX1 is not possible.
#if defined(IMGUI_ENABLE_AVX2)
#define IMGUI_ENABLE_AVX2_IMSTRLEN
#define IMGUI_ENABLE_AVX2_IMMEMCHR
#elif defined(IMGUI_ENABLE_AVX) || defined(IMGUI_ENABLE_SSE)
#define IMGUI_ENABLE_SSE_IMSTRLEN
#define IMGUI_ENABLE_SSE_IMMEMCHR
#endif

// Visual Studio warnings
#ifdef _MSC_VER
#pragma warning (push)
Expand Down Expand Up @@ -372,8 +397,8 @@ static inline bool ImIsPowerOfTwo(ImU64 v) { return v != 0 && (v &
static inline int ImUpperPowerOfTwo(int v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; }

// Helpers: String
#define ImStrlen strlen
#define ImMemchr memchr
IMGUI_API size_t ImStrlen(const char* str); // Compute the length of a null-terminated string.
IMGUI_API const void* ImMemchr(const void* buf, int val, size_t count); // Find first occurrence of 'val' in buffer given length.
IMGUI_API int ImStricmp(const char* str1, const char* str2); // Case insensitive compare.
IMGUI_API int ImStrnicmp(const char* str1, const char* str2, size_t count); // Case insensitive compare to a certain count.
IMGUI_API void ImStrncpy(char* dst, const char* src, size_t count); // Copy to a certain count and always zero terminate (strncpy doesn't).
Expand Down
Loading