@@ -39,7 +39,7 @@ enum class AppType : uint8_t {
3939struct  fex_stats_header  {
4040    uint8_t  Version;
4141    AppType app_type;
42-     uint8_t  _pad[ 2 ] ;
42+     uint16_t  thread_stats_size ;
4343    char  fex_version[48 ];
4444    //  Atomic variables. std::atomic_ref isn't available until C++20, so need to use GCC builtin atomics to access.
4545    uint32_t  Head;
@@ -60,6 +60,9 @@ struct fex_thread_stats {
6060    uint64_t  AccumulatedFloatFallbackCount;
6161};
6262
63+ //  This is guaranteed by FEX.
64+ static_assert (sizeof (fex_thread_stats) % 16  == 0 , " " 
65+ 
6366//  Sampled stats information
6467struct  fex_stats  {
6568    int  pid {-1 };
@@ -71,6 +74,7 @@ struct fex_stats {
7174    size_t  page_size{};
7275
7376    void * shm_base{};
77+     size_t  fex_thread_stats_size {};
7478    fex_stats_header* head{};
7579    fex_thread_stats* stats{};
7680
@@ -179,22 +183,26 @@ static uint64_t get_cycle_counter_frequency() {
179183}
180184#endif 
181185
182- #if  defined(__x86_64__) || defined(__i386__)
183186static  void  atomic_copy_thread_stats (fex_thread_stats *dest, const  fex_thread_stats *src) {
184-     //  Use SSE to copy the thread stats to avoid tearing on 32-bit.
185-     static_assert (sizeof (fex_thread_stats) % 16  == 0 , " " 
186-     static_assert ((sizeof (fex_thread_stats) / 16 ) == 3 , " " 
187-     auto  d = reinterpret_cast <__m128*>(dest);
188-     auto  s = reinterpret_cast <const  __m128*>(src);
189-     d[0 ] = s[0 ];
190-     d[1 ] = s[1 ];
191-     d[2 ] = s[2 ];
192- }
187+ #if  defined(__x86_64__) || defined(__i386__)
188+     //  For x86 platforms, XMM copies are atomic when aligned. So this guarantees single-copy atomicity.
189+     //  x86 has no equivalent of an true "atomic" 128-bit GPR loadstore until APX.
190+     //  For FEX emulating x86 platforms, this is also a guarantee for ARMv8.4 and newer.
191+     using  copy_type = __m128;
193192#else 
194- void   atomic_copy_thread_stats (fex_thread_stats *dest,  const  fex_thread_stats *src) { 
195-     *dest = *src; 
196- } 
193+      //  For ARM64 platforms this is basically guaranteed to turn in to ldp+stp. 
194+     //  For ARM8.4 this gives us single-copy atomicity guarantees. 
195+      using  copy_type =  __uint128_t ; 
197196#endif 
197+ 
198+     const  auto  elements_to_copy = g_stats.fex_thread_stats_size  / sizeof (copy_type);
199+     auto  d_i = reinterpret_cast <copy_type*>(dest);
200+     auto  s_i = reinterpret_cast <const  copy_type*>(src);
201+     for  (size_t  i = 0 ; i < elements_to_copy; ++i) {
202+         d_i[i] = s_i[i];
203+     }
204+ }
205+ 
198206static  void  destroy_shm () {
199207    munmap (g_stats.shm_base , g_stats.shm_size );
200208    close (g_stats.shm_fd );
@@ -267,6 +275,14 @@ static void init_shm(int pid) {
267275    g_stats.previous_sample_period  = std::chrono::steady_clock::now ();
268276    g_stats.first_sample  = true ;
269277    g_stats.sampled_stats .clear ();
278+ 
279+     g_stats.fex_thread_stats_size  = sizeof (fex_thread_stats);
280+ 
281+     if  (g_stats.head ->thread_stats_size ) {
282+         //  If thread stats size is provided, use that, as long as it is smaller than tracking size.
283+         g_stats.fex_thread_stats_size  = std::min<size_t >(g_stats.head ->thread_stats_size , g_stats.fex_thread_stats_size );
284+     }
285+ 
270286    fex_version = std::string {header->fex_version , strnlen (header->fex_version , sizeof (header->fex_version ))};
271287    sigbus_counts.account_time (g_stats.previous_sample_period );
272288    smc_counts.account_time (g_stats.previous_sample_period );
@@ -298,6 +314,12 @@ static void check_shm_update_necessary() {
298314    g_stats.shm_base  = mmap (nullptr , new_shm_size, PROT_READ, MAP_SHARED, g_stats.shm_fd , 0 );
299315    g_stats.head  = reinterpret_cast <fex_stats_header*>(g_stats.shm_base );
300316    g_stats.stats  = offset_to_stats (g_stats.shm_base , &g_stats.head ->Head );
317+     g_stats.fex_thread_stats_size  = sizeof (fex_thread_stats);
318+ 
319+     if  (g_stats.head ->thread_stats_size ) {
320+         //  If thread stats size is provided, use that, as long as it is smaller than tracking size.
321+         g_stats.fex_thread_stats_size  = std::min<size_t >(g_stats.head ->thread_stats_size , g_stats.fex_thread_stats_size );
322+     }
301323}
302324
303325bool  is_fex_pid_found () {
@@ -375,7 +397,7 @@ void update_fex_stats() {
375397        accumulate (total_smc_events, AccumulatedSMCEvents);
376398        accumulate (total_softfloat_events, AccumulatedFloatFallbackCount);
377399
378-         memcpy (&it->second .previous , &it->second .current , sizeof (fex_thread_stats) );
400+         memcpy (&it->second .previous , &it->second .current , g_stats. fex_thread_stats_size );
379401
380402        total_jit_time += total_time;
381403        if  ((now - it->second .last_seen ) >= std::chrono::seconds (MAXIMUM_THREAD_WAIT_TIME)) {
0 commit comments