Skip to content

Commit 6f61580

Browse files
committed
Time::HiRes slim down Win32's _gettimeofday() _clock_gettime() polyfills
- Remove alignment padding bytes from struct my_cxt_t{}. unsigned long run_count; is always 4 bytes, the other 3 members are always 8 bytes.
- Clean up the ABI/machine code gen of the Win32-only static fn _gettimeofday(). It never leaves this TU as a fn ptr. The MSVC 2022 -O1/-O2 optimizer can only create uninitialized reg/C stk "holes" for args that are unused in all callers and unused in the callee. It can't shift left or collapse registers/C arguments that are unused on both sides, within 1 TU, even if no fn ptr is taken of a static function. The new macro remains POSIX-like.
- In _GetSystemTimePreciseAsFileTime(), immediately copy the contents of our " &C_auto_u64 " var to a new C auto var, so the 64-bit value "outputs" or pseudo-retvals of the MS Win API funcs can be manipulated for the rest of the function's body completely in CPU registers, with 0% chance of re-reading or pointlessly writing back to the C stack memory address.
- Do the same for _gettimeofday_x() when _gettimeofday_x() calls the MS public Win API funcs.
- Inside _GetSystemTimePreciseAsFileTime(), hoist/combine/factor out the 2 different callsites of QueryPerformanceCounter() to the root block. All branches will execute QueryPerformanceCounter() anyway. MSVC 2022 refused to hoist the QueryPerformanceCounter() call around the statement if(MY_CXT.run_count++==0 ||MY_CXT.base_systime_as_filetime.ft_i64>MY_CXT.reset_time){
- Add PERL_STATIC_FORCE_INLINE for static funcs like _clock_gettime() that have exactly 1 caller/callsite; usually this is an XSUB function with a CV* argument.
- Add PERL_STATIC_FORCE_INLINE to _gettimeofday(), even though it has 8 different callers/callsites. The reason is that _gettimeofday() has a huge amount of U64 math at its bottom. All the callers then do a huge amount of mostly FP NV/double math before saving the final NV value to an SV* with NOK_on. To allow the CC to optimize/combine/simplify these 2 large groups of U64 math and NV math, they must be in the same function. So add PERL_STATIC_FORCE_INLINE to _gettimeofday().
sort unsigned long run_count
1 parent bb623af commit 6f61580

File tree

1 file changed

+68
-43
lines changed

1 file changed

+68
-43
lines changed

dist/Time-HiRes/HiRes.xs

Lines changed: 68 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@
5555
# define GCC_DIAG_IGNORE_CPP_COMPAT_RESTORE_STMT GCC_DIAG_RESTORE_STMT
5656
#endif
5757

58+
#ifndef PERL_STATIC_FORCE_INLINE
59+
# define PERL_STATIC_FORCE_INLINE STATIC
60+
#endif
61+
5862
#if PERL_VERSION_GE(5,7,3) && !PERL_VERSION_GE(5,10,1)
5963
# undef SAVEOP
6064
# define SAVEOP() SAVEVPTR(PL_op)
@@ -136,10 +140,10 @@ typedef union {
136140
# define MY_CXT_KEY "Time::HiRes_" XS_VERSION
137141

138142
typedef struct {
139-
unsigned long run_count;
140143
unsigned __int64 base_ticks;
141144
FT_t base_systime_as_filetime;
142145
unsigned __int64 reset_time;
146+
unsigned long run_count;
143147
} my_cxt_t;
144148

145149
static unsigned __int64 tick_frequency = 0;
@@ -190,7 +194,7 @@ START_MY_CXT
190194
for performance reasons */
191195

192196
# undef gettimeofday
193-
# define gettimeofday(tp, not_used) _gettimeofday(aTHX_ tp, not_used)
197+
# define gettimeofday(tp, not_used) ((*(tp) = _gettimeofday_x(aTHX)), 0)
194198

195199
# undef GetSystemTimePreciseAsFileTime
196200
# define GetSystemTimePreciseAsFileTime(out) (void)(*(out) = _GetSystemTimePreciseAsFileTime(aTHX))
@@ -239,78 +243,99 @@ START_MY_CXT
239243
* the pointer for long term Interlocked or Atomic message passing from an
240244
* unknown 2nd OS thread running on another CPU Core.
241245
*/
246+
242247
static FILETIME
243248
_GetSystemTimePreciseAsFileTime(pTHX)
244249
{
245-
dMY_CXT;
246-
FT_t ft;
247-
248-
if (MY_CXT.run_count++ == 0 ||
249-
MY_CXT.base_systime_as_filetime.ft_i64 > MY_CXT.reset_time) {
250-
251-
QueryPerformanceCounter((LARGE_INTEGER*)&MY_CXT.base_ticks);
252-
GetSystemTimeAsFileTime(&MY_CXT.base_systime_as_filetime.ft_val);
253-
ft.ft_i64 = MY_CXT.base_systime_as_filetime.ft_i64;
254-
MY_CXT.reset_time = ft.ft_i64 + MAX_PERF_COUNTER_TICKS;
250+
#define MY_CXTX (*MY_CXT_x)
251+
unsigned __int64 ticks;
252+
unsigned __int64 ticks_mem;
253+
unsigned __int64 timesys;
254+
__int64 diff;
255+
/* If no threads, CC will probably optimize away all MY_CXT_x references
256+
so they directly access the C static global struct. */
257+
my_cxt_t * MY_CXT_x;
258+
259+
QueryPerformanceCounter((LARGE_INTEGER*)&ticks_mem);
260+
/* Inform the CC nothing external or in this fn (ptr aliasing) can ever
261+
rewrite the value in ticks. Increases chance of CC using registers. */
262+
ticks = ticks_mem;
263+
{
264+
dMY_CXT;
265+
MY_CXT_x = &(MY_CXT);
266+
}
267+
if (MY_CXTX.run_count++ == 0 ||
268+
MY_CXTX.base_systime_as_filetime.ft_i64 > MY_CXTX.reset_time) {
269+
MY_CXTX.base_ticks = ticks;
270+
GetSystemTimeAsFileTime(&MY_CXTX.base_systime_as_filetime.ft_val);
271+
timesys = MY_CXTX.base_systime_as_filetime.ft_i64;
272+
MY_CXTX.reset_time = timesys + MAX_PERF_COUNTER_TICKS;
255273
}
256274
else {
257-
__int64 diff;
258-
unsigned __int64 ticks;
259-
QueryPerformanceCounter((LARGE_INTEGER*)&ticks);
260-
ticks -= MY_CXT.base_ticks;
261-
ft.ft_i64 = MY_CXT.base_systime_as_filetime.ft_i64
275+
ticks -= MY_CXTX.base_ticks;
276+
timesys = MY_CXTX.base_systime_as_filetime.ft_i64
262277
+ Const64(IV_1E7) * (ticks / tick_frequency)
263278
+(Const64(IV_1E7) * (ticks % tick_frequency)) / tick_frequency;
264-
diff = ft.ft_i64 - MY_CXT.base_systime_as_filetime.ft_i64;
279+
diff = timesys - MY_CXTX.base_systime_as_filetime.ft_i64;
265280
if (diff < -MAX_PERF_COUNTER_SKEW || diff > MAX_PERF_COUNTER_SKEW) {
266-
MY_CXT.base_ticks += ticks;
267-
GetSystemTimeAsFileTime(&MY_CXT.base_systime_as_filetime.ft_val);
268-
ft.ft_i64 = MY_CXT.base_systime_as_filetime.ft_i64;
281+
MY_CXTX.base_ticks += ticks;
282+
GetSystemTimeAsFileTime(&MY_CXTX.base_systime_as_filetime.ft_val);
283+
timesys = MY_CXTX.base_systime_as_filetime.ft_i64;
269284
}
270285
}
271-
272-
return ft.ft_val;
286+
#undef MY_CXTX
287+
{
288+
FT_t ft;
289+
ft.ft_i64 = timesys;
290+
return ft.ft_val;
291+
}
273292
}
274293

275-
static int
276-
_gettimeofday(pTHX_ struct timeval *tp, void *not_used)
294+
/* former prototype: static int _gettimeofday(pTHX_ struct timeval *tp, void *not_used);
295+
296+
B/c _gettimeofday_x() is not capable of failing, and retval was always
297+
constant 0, and it's a static fn that never leaves this TU, repurpose the
298+
retval for something better. */
299+
300+
PERL_STATIC_FORCE_INLINE struct timeval
301+
_gettimeofday_x(pTHX)
277302
{
278303
FT_t ft;
279-
280-
PERL_UNUSED_ARG(not_used);
304+
struct timeval tp;
281305

282306
GetSystemTimePreciseAsFileTime(&ft.ft_val);
283307

284308
/* seconds since epoch */
285-
tp->tv_sec = (long)((ft.ft_i64 - EPOCH_BIAS) / Const64(IV_1E7));
309+
tp.tv_sec = (long)((ft.ft_i64 - EPOCH_BIAS) / Const64(IV_1E7));
286310

287311
/* microseconds remaining */
288-
tp->tv_usec = (long)((ft.ft_i64 / Const64(10)) % Const64(IV_1E6));
312+
tp.tv_usec = (long)((ft.ft_i64 / Const64(10)) % Const64(IV_1E6));
289313

290-
return 0;
314+
return tp;
291315
}
292316

293-
static int
317+
/* force inline it, because XS_Time__HiRes_clock_gettime() is the only caller */
318+
319+
PERL_STATIC_FORCE_INLINE int
294320
_clock_gettime(pTHX_ clockid_t clock_id, struct timespec *tp)
295321
{
296-
switch (clock_id) {
297-
case CLOCK_REALTIME: {
298-
FT_t ft;
322+
FT_t ft;
323+
unsigned __int64 ticks;
324+
unsigned __int64 time_sys;
299325

326+
switch (clock_id) {
327+
case CLOCK_REALTIME:
300328
GetSystemTimePreciseAsFileTime(&ft.ft_val);
301-
tp->tv_sec = (time_t)((ft.ft_i64 - EPOCH_BIAS) / IV_1E7);
302-
tp->tv_nsec = (long)((ft.ft_i64 % IV_1E7) * 100);
329+
time_sys = ft.ft_i64;
330+
tp->tv_sec = (time_t)((time_sys - EPOCH_BIAS) / IV_1E7);
331+
tp->tv_nsec = (long)((time_sys % IV_1E7) * 100);
303332
break;
304-
}
305-
case CLOCK_MONOTONIC: {
306-
unsigned __int64 ticks;
307-
308-
QueryPerformanceCounter((LARGE_INTEGER*)&ticks);
309-
333+
case CLOCK_MONOTONIC:
334+
QueryPerformanceCounter((LARGE_INTEGER*)&ft.ft_i64);
335+
ticks = ft.ft_i64;
310336
tp->tv_sec = (time_t)(ticks / tick_frequency);
311337
tp->tv_nsec = (long)((IV_1E9 * (ticks % tick_frequency)) / tick_frequency);
312338
break;
313-
}
314339
default:
315340
errno = EINVAL;
316341
return 1;

0 commit comments

Comments
 (0)