Skip to content

Commit 5399bf1

Browse files
Sonicadvance1 authored and flightlessmango committed
FEX: Update FEX stats semantics.
FEX updated some [guarantees](FEX-Emu/FEX#4984) to enforce 16-byte size alignment of the thread stats, and introduced a new variable for declaring the size of the struct. This codifies the guarantee that the FEX struct will stay 16-byte aligned in size, so we get ARM's single-copy atomicity guarantees, and if FEX decides to add new members to the struct then things won't suddenly break. FEX can't guarantee that the API isn't going to break, but it is going to attempt to only add values to the end as new values get added, so breakage is less likely. This also makes a minor change to the `atomic_copy_thread_stats` function to ensure we're taking advantage of that single-copy atomicity.
1 parent e96a0bf commit 5399bf1

File tree

1 file changed

+37
-15
lines changed

1 file changed

+37
-15
lines changed

src/fex.cpp

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ enum class AppType : uint8_t {
3939
struct fex_stats_header {
4040
uint8_t Version;
4141
AppType app_type;
42-
uint8_t _pad[2];
42+
uint16_t thread_stats_size;
4343
char fex_version[48];
4444
// Atomic variables. std::atomic_ref isn't available until C++20, so need to use GCC builtin atomics to access.
4545
uint32_t Head;
@@ -60,6 +60,9 @@ struct fex_thread_stats {
6060
uint64_t AccumulatedFloatFallbackCount;
6161
};
6262

63+
// This is guaranteed by FEX.
64+
static_assert(sizeof(fex_thread_stats) % 16 == 0, "");
65+
6366
// Sampled stats information
6467
struct fex_stats {
6568
int pid {-1};
@@ -71,6 +74,7 @@ struct fex_stats {
7174
size_t page_size{};
7275

7376
void* shm_base{};
77+
size_t fex_thread_stats_size {};
7478
fex_stats_header* head{};
7579
fex_thread_stats* stats{};
7680

@@ -179,22 +183,26 @@ static uint64_t get_cycle_counter_frequency() {
179183
}
180184
#endif
181185

182-
#if defined(__x86_64__) || defined(__i386__)
183186
static void atomic_copy_thread_stats(fex_thread_stats *dest, const fex_thread_stats *src) {
184-
// Use SSE to copy the thread stats to avoid tearing on 32-bit.
185-
static_assert(sizeof(fex_thread_stats) % 16 == 0, "");
186-
static_assert((sizeof(fex_thread_stats) / 16) == 3, "");
187-
auto d = reinterpret_cast<__m128*>(dest);
188-
auto s = reinterpret_cast<const __m128*>(src);
189-
d[0] = s[0];
190-
d[1] = s[1];
191-
d[2] = s[2];
192-
}
187+
#if defined(__x86_64__) || defined(__i386__)
188+
// For x86 platforms, XMM copies are atomic when aligned. So this guarantees single-copy atomicity.
189+
// x86 has no equivalent of a true "atomic" 128-bit GPR load/store until APX.
190+
// For FEX emulating x86 platforms, this is also a guarantee for ARMv8.4 and newer.
191+
using copy_type = __m128;
193192
#else
194-
void atomic_copy_thread_stats(fex_thread_stats *dest, const fex_thread_stats *src) {
195-
*dest = *src;
196-
}
193+
// For ARM64 platforms this is basically guaranteed to turn into ldp+stp.
194+
// For ARMv8.4 this gives us single-copy atomicity guarantees.
195+
using copy_type = __uint128_t;
197196
#endif
197+
198+
const auto elements_to_copy = g_stats.fex_thread_stats_size / sizeof(copy_type);
199+
auto d_i = reinterpret_cast<copy_type*>(dest);
200+
auto s_i = reinterpret_cast<const copy_type*>(src);
201+
for (size_t i = 0; i < elements_to_copy; ++i) {
202+
d_i[i] = s_i[i];
203+
}
204+
}
205+
198206
static void destroy_shm() {
199207
munmap(g_stats.shm_base, g_stats.shm_size);
200208
close(g_stats.shm_fd);
@@ -267,6 +275,14 @@ static void init_shm(int pid) {
267275
g_stats.previous_sample_period = std::chrono::steady_clock::now();
268276
g_stats.first_sample = true;
269277
g_stats.sampled_stats.clear();
278+
279+
g_stats.fex_thread_stats_size = sizeof(fex_thread_stats);
280+
281+
if (g_stats.head->thread_stats_size) {
282+
// If thread stats size is provided, use that, as long as it is smaller than tracking size.
283+
g_stats.fex_thread_stats_size = std::min<size_t>(g_stats.head->thread_stats_size, g_stats.fex_thread_stats_size);
284+
}
285+
270286
fex_version = std::string {header->fex_version, strnlen(header->fex_version, sizeof(header->fex_version))};
271287
sigbus_counts.account_time(g_stats.previous_sample_period);
272288
smc_counts.account_time(g_stats.previous_sample_period);
@@ -298,6 +314,12 @@ static void check_shm_update_necessary() {
298314
g_stats.shm_base = mmap(nullptr, new_shm_size, PROT_READ, MAP_SHARED, g_stats.shm_fd, 0);
299315
g_stats.head = reinterpret_cast<fex_stats_header*>(g_stats.shm_base);
300316
g_stats.stats = offset_to_stats(g_stats.shm_base, &g_stats.head->Head);
317+
g_stats.fex_thread_stats_size = sizeof(fex_thread_stats);
318+
319+
if (g_stats.head->thread_stats_size) {
320+
// If thread stats size is provided, use that, as long as it is smaller than tracking size.
321+
g_stats.fex_thread_stats_size = std::min<size_t>(g_stats.head->thread_stats_size, g_stats.fex_thread_stats_size);
322+
}
301323
}
302324

303325
bool is_fex_pid_found() {
@@ -375,7 +397,7 @@ void update_fex_stats() {
375397
accumulate(total_smc_events, AccumulatedSMCEvents);
376398
accumulate(total_softfloat_events, AccumulatedFloatFallbackCount);
377399

378-
memcpy(&it->second.previous, &it->second.current, sizeof(fex_thread_stats));
400+
memcpy(&it->second.previous, &it->second.current, g_stats.fex_thread_stats_size);
379401

380402
total_jit_time += total_time;
381403
if ((now - it->second.last_seen) >= std::chrono::seconds(MAXIMUM_THREAD_WAIT_TIME)) {

0 commit comments

Comments
 (0)