|
21 | 21 |
|
22 | 22 | #include <array>
|
23 | 23 | #include <cstdint>
|
| 24 | +#include <cstring> |
24 | 25 | #include <stdexcept>
|
25 | 26 | #include <vector>
|
26 | 27 |
|
27 | 28 | #include "imgui.h"
|
28 | 29 | #if defined(__i386__) || defined(_M_IX86) || defined(__x86_64) || defined(_M_AMD64)
|
29 | 30 | #define MEMORY_OBSERVER_X86 // Do not include immintrin/xbyak or use avx intrinsics unless we're compiling for x86
|
| 31 | +#ifdef __GNUC__ |
| 32 | +#define AVX2_FUNC [[gnu::target("avx2")]] |
| 33 | +#else |
| 34 | +#define AVX2_FUNC |
| 35 | +#endif |
30 | 36 | #include "immintrin.h"
|
31 | 37 | #endif
|
32 | 38 |
|
@@ -79,14 +85,14 @@ class MemoryObserver {
|
79 | 85 |
|
80 | 86 | #ifdef MEMORY_OBSERVER_X86
|
81 | 87 | template <int bufferSize>
|
82 |
| - static __m256i avx2_getShuffleResultsFor(const std::array<uint8_t, bufferSize>& buffer, |
| 88 | + AVX2_FUNC static __m256i avx2_getShuffleResultsFor(const std::array<uint8_t, bufferSize>& buffer, |
83 | 89 | std::array<uint8_t, 32>& extendedBuffer, int mask) {
|
84 | 90 | static_assert(bufferSize == 8 || bufferSize == 16);
|
85 | 91 |
|
86 | 92 | for (auto j = 0u; j < (32 / bufferSize); ++j) {
|
87 |
| - std::ranges::copy(buffer, extendedBuffer.begin() + j * bufferSize); |
| 93 | + std::memcpy(&extendedBuffer[j * bufferSize], &buffer[0], bufferSize); |
88 | 94 | }
|
89 |
| - const auto copies = _mm256_loadu_epi8(extendedBuffer.data()); |
| 95 | + const auto copies = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(extendedBuffer.data())); |
90 | 96 |
|
91 | 97 | switch (mask) {
|
92 | 98 | case 0: {
|
@@ -122,40 +128,40 @@ class MemoryObserver {
|
122 | 128 | }
|
123 | 129 |
|
124 | 130 | template <int bufferSize>
|
125 |
| - void simd_populateAddressList(const uint8_t* memData, uint32_t memBase, uint32_t memSize) { |
| 131 | + AVX2_FUNC void simd_populateAddressList(const uint8_t* memData, uint32_t memBase, uint32_t memSize) { |
126 | 132 | static_assert(bufferSize == 8 || bufferSize == 16);
|
127 | 133 |
|
128 | 134 | alignas(32) auto buffer = std::array<uint8_t, bufferSize>{};
|
129 | 135 | alignas(32) auto extendedBuffer = std::array<uint8_t, 32>{};
|
130 | 136 |
|
131 | 137 | const auto sequenceSize = m_sequenceSize;
|
132 | 138 | std::copy_n(m_sequence, sequenceSize, buffer.data());
|
133 |
| - auto patternShuffleResults = std::vector<__m256i>{avx2_getShuffleResultsFor(buffer, extendedBuffer, 0), |
134 |
| - avx2_getShuffleResultsFor(buffer, extendedBuffer, 1)}; |
| 139 | + auto patternShuffleResults = std::vector<__m256i>{avx2_getShuffleResultsFor<bufferSize>(buffer, extendedBuffer, 0), |
| 140 | + avx2_getShuffleResultsFor<bufferSize>(buffer, extendedBuffer, 1)}; |
135 | 141 | if constexpr (bufferSize == 16) {
|
136 |
| - patternShuffleResults.push_back(avx2_getShuffleResultsFor(buffer, extendedBuffer, 2)); |
137 |
| - patternShuffleResults.push_back(avx2_getShuffleResultsFor(buffer, extendedBuffer, 3)); |
| 142 | + patternShuffleResults.push_back(avx2_getShuffleResultsFor<bufferSize>(buffer, extendedBuffer, 2)); |
| 143 | + patternShuffleResults.push_back(avx2_getShuffleResultsFor<bufferSize>(buffer, extendedBuffer, 3)); |
138 | 144 | }
|
139 | 145 |
|
140 | 146 | m_addresses.clear();
|
141 | 147 | for (auto i = 0u; i + sequenceSize < memSize; i += m_step) {
|
142 | 148 | std::copy_n(memData + i, sequenceSize, buffer.data());
|
143 | 149 |
|
144 |
| - bool bAllEqual = true; |
| 150 | + bool allEqual = true; |
145 | 151 | for (auto j = 0u; j < patternShuffleResults.size(); ++j) {
|
146 |
| - bAllEqual = all_equal( |
147 |
| - _mm256_cmpeq_epi8(patternShuffleResults[j], avx2_getShuffleResultsFor(buffer, extendedBuffer, j))); |
148 |
| - if (!bAllEqual) { |
| 152 | + allEqual = all_equal( |
| 153 | + _mm256_cmpeq_epi8(patternShuffleResults[j], avx2_getShuffleResultsFor<bufferSize>(buffer, extendedBuffer, j))); |
| 154 | + if (!allEqual) { |
149 | 155 | break;
|
150 | 156 | }
|
151 | 157 | }
|
152 | 158 |
|
153 |
| - if (bAllEqual) { |
| 159 | + if (allEqual) { |
154 | 160 | m_addresses.push_back(memBase + i);
|
155 | 161 | }
|
156 | 162 | }
|
157 | 163 | }
|
158 |
| - static bool all_equal(__m256i input); |
| 164 | + AVX2_FUNC static bool all_equal(__m256i input); |
159 | 165 | #else
|
160 | 166 | template <int bufferSize>
|
161 | 167 | void simd_populateAddressList(const uint8_t* memData, uint32_t memBase, uint32_t memSize) {
|
|
0 commit comments