Skip to content

Commit 4222853

Browse files
authored
Merge pull request #55 from ylvain/ylvain
Some updates for IntelVectorMath.jl
2 parents cc95f17 + 81bbab9 commit 4222853

File tree

5 files changed

+274
-15
lines changed

5 files changed

+274
-15
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
name = "IntelVectorMath"
22
uuid = "c8ce9da6-5d36-5c03-b118-5a70151be7bc"
3-
version = "0.4.1"
3+
version = "0.4.2"
44

55
[deps]
66
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
77

88
[compat]
99
julia = "1.3"
10-
MKL_jll = "2020, 2021"
10+
MKL_jll = "2021, 2022"
1111

1212
[extras]
1313
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

README.md

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
![](https://github.com/JuliaMath/VML.jl/workflows/julia%201.6/badge.svg)
88
![](https://github.com/JuliaMath/VML.jl/workflows/julia%20nightly/badge.svg)
99

10-
This package provides bindings to the Intel MKL [Vector Mathematics Functions](https://software.intel.com/en-us/node/521751).
10+
This package provides bindings to the Intel MKL [Vector Mathematics Functions](https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/vector-mathematical-functions.html).
1111
This is often substantially faster than broadcasting Julia's built-in functions, especially when applying a transcendental function over a large array.
1212
Until Julia 0.6 the package was registered as `VML.jl`.
1313

@@ -65,7 +65,19 @@ implementation, although the exact results may be different. To specify
6565
low accuracy, use `vml_set_accuracy(VML_LA)`. To specify enhanced
6666
performance, use `vml_set_accuracy(VML_EP)`. More documentation
6767
regarding these options is available on
68-
[Intel's website](http://software.intel.com/sites/products/documentation/hpc/mkl/IntelVectorMath/vmldata.htm).
68+
[Intel's website](https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/vector-mathematical-functions.html).
69+
70+
### Denormalized numbers
71+
72+
On some CPUs, operations on denormalized numbers are extremely slow. You can use `vml_set_denormalmode(VML_DENORMAL_FAST)`
73+
to handle denormalized numbers as zero. See `?VML_DENORMAL_FAST` for more information. You can get the
74+
current mode by `vml_get_denormalmode()`. The default is `VML_DENORMAL_ACCURATE`.
75+
76+
### Threads
77+
78+
By default, IntelVectorMath uses multithreading. The maximum number of threads that a call may use
79+
is given by `vml_get_max_threads()`. On most environments this will default to the number of physical
80+
cores available to IntelVectorMath. This behavior can be changed using `vml_set_num_threads(numthreads)`.
6981

7082
## Performance
7183
Summary of Results:
@@ -229,5 +241,12 @@ Next steps for this package
229241

230242

231243
## Advanced
244+
245+
<!-- This does not seem to be true anymore? No reference to CpuId.jl in the Manifest?
246+
232247
IntelVectorMath.jl uses [CpuId.jl](https://github.com/m-j-w/CpuId.jl) to detect if your processor supports the newer `avx2` instructions, and if not defaults to `libmkl_vml_avx`. If your system does not have AVX this package will currently not work for you.
233-
If the CPU feature detection does not work for you, please open an issue.
248+
If the CPU feature detection does not work for you, please open an issue. -->
249+
250+
As a quick help to convert benchmark timings into operations-per-cycle, IntelVectorMath.jl
251+
provides `vml_get_cpu_frequency()` which will return the *actual* current frequency of the
252+
CPU in GHz.

src/IntelVectorMath.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,12 @@ for t in (Float32, Float64)
108108
end
109109

110110
export VML_LA, VML_HA, VML_EP, vml_set_accuracy, vml_get_accuracy
111+
export VML_DENORMAL_FAST, VML_DENORMAL_ACCURATE, vml_set_denormalmode, vml_get_denormalmode
112+
export vml_get_max_threads, vml_set_num_threads
113+
export vml_get_cpu_frequency, vml_get_max_cpu_frequency
114+
115+
# do not export, seems to be no-op in 2022
116+
# export VML_FPU_DEFAULT, VML_FPU_FLOAT32, VML_FPU_FLOAT64, VML_FPU_RESTORE, vml_set_fpumode, vml_get_fpumode
117+
111118

112119
end

src/setup.jl

Lines changed: 228 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,251 @@
11
import MKL_jll
22

3+
"""
4+
struct VMLAccuracy
5+
6+
See [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
7+
"""
38
struct VMLAccuracy
49
mode::UInt
510
end
11+
Base.show(io::IO, m::VMLAccuracy) = print(io, m == VML_LA ? "VML_LA" :
12+
m == VML_HA ? "VML_HA" : "VML_EP")
13+
# mkl\include\mkl_vml_defines.h
14+
# VML_HA - when VML_HA is set, high accuracy VML functions are called
15+
# VML_LA - when VML_LA is set, low accuracy VML functions are called
16+
# VML_EP - when VML_EP is set, enhanced performance VML functions are called
17+
# NOTE: VML_HA, VML_LA and VML_EP must not be used in combination
18+
"""
19+
VML_LA :: VMLAccuracy
620
21+
Low Accuracy (LA), which improves performance by reducing accuracy of the two least significant bits.
22+
"""
723
const VML_LA = VMLAccuracy(0x00000001)
24+
"""
25+
VML_HA :: VMLAccuracy
26+
27+
High Accuracy (HA), the default mode. Precision to 1 ulp.
28+
"""
829
const VML_HA = VMLAccuracy(0x00000002)
30+
"""
31+
VML_EP :: VMLAccuracy
32+
33+
Enhanced Performance (EP), which provides better performance at the cost of significantly reduced accuracy.
34+
Approximately half of the bits in the mantissa are correct.
35+
"""
936
const VML_EP = VMLAccuracy(0x00000003)
1037

11-
Base.show(io::IO, m::VMLAccuracy) = print(io, m == VML_LA ? "VML_LA" :
12-
m == VML_HA ? "VML_HA" : "VML_EP")
13-
38+
39+
"""
40+
struct VMLFastDenormal
41+
42+
See [`VML_DENORMAL_FAST`](@ref), [`VML_DENORMAL_ACCURATE`](@ref).
43+
"""
44+
struct VMLFastDenormal
45+
mode::UInt
46+
end
47+
Base.show(io::IO, m::VMLFastDenormal) = print(io, m == VML_DENORMAL_FAST ? "VML_DENORMAL_FAST" : "VML_DENORMAL_ACCURATE")
48+
# mkl\include\mkl_vml_defines.h
49+
# FTZ & DAZ mode macros
50+
# VML_FTZDAZ_ON - FTZ & DAZ MXCSR mode enabled
51+
# for faster (sub)denormal values processing
52+
# VML_FTZDAZ_OFF - FTZ & DAZ MXCSR mode disabled
53+
# for accurate (sub)denormal values processing
54+
"""
55+
VML_DENORMAL_FAST :: VMLFastDenormal
56+
57+
Designed to improve the performance of computations that involve denormalized numbers at the cost of reasonable accuracy loss.
58+
This mode changes the numeric behavior of the functions: denormalized input values are treated as zeros and denormalized results
59+
are flushed to zero. Accuracy loss may occur if input and/or output values are close to denormal range.
60+
"""
61+
const VML_DENORMAL_FAST = VMLFastDenormal(0x00280000)
62+
"""
63+
VML_DENORMAL_ACCURATE :: VMLFastDenormal
64+
65+
Standard handling of computations that involve denormalized numbers.
66+
"""
67+
const VML_DENORMAL_ACCURATE = VMLFastDenormal(0x00140000)
68+
69+
70+
struct VMLFpuMode
71+
mode::UInt
72+
end
73+
Base.show(io::IO, m::VMLFpuMode) = print(io, m == VML_FPU_DEFAULT ? "VML_FPU_DEFAULT" :
74+
m == VML_FPU_FLOAT32 ? "VML_FPU_FLOAT32" :
75+
m == VML_FPU_FLOAT64 ? "VML_FPU_FLOAT64" : "VML_FPU_RESTORE")
76+
# mkl\include\mkl_vml_defines.h
77+
# SETTING OPTIMAL FLOATING-POINT PRECISION AND ROUNDING MODE
78+
# Definitions below are to set optimal floating-point control word
79+
# (precision and rounding mode).
80+
#
81+
# For their correct work, VML functions change floating-point precision and
82+
# rounding mode (if necessary). Since control word changing is typically
83+
# expensive operation, it is recommended to set precision and rounding mode
84+
# to optimal values before VML function calls.
85+
#
86+
# VML_FLOAT_CONSISTENT - use this value if the calls are typically to single
87+
# precision VML functions
88+
# VML_DOUBLE_CONSISTENT - use this value if the calls are typically to double
89+
# precision VML functions
90+
# VML_RESTORE - restore original floating-point precision and
91+
# rounding mode
92+
# VML_DEFAULT_PRECISION - use default (current) floating-point precision and
93+
# rounding mode
94+
# NOTE: VML_FLOAT_CONSISTENT, VML_DOUBLE_CONSISTENT, VML_RESTORE and
95+
# VML_DEFAULT_PRECISION must not be used in combination
96+
const VML_FPU_DEFAULT = VMLFpuMode(0x00000000) # VML_DEFAULT_PRECISION
97+
const VML_FPU_FLOAT32 = VMLFpuMode(0x00000010) # VML_FLOAT_CONSISTENT
98+
const VML_FPU_FLOAT64 = VMLFpuMode(0x00000020) # VML_DOUBLE_CONSISTENT
99+
const VML_FPU_RESTORE = VMLFpuMode(0x00000030) # VML_RESTORE
100+
101+
# mkl\include\mkl_vml_defines.h
102+
# ACCURACY, FLOATING-POINT CONTROL, FTZDAZ AND ERROR HANDLING MASKS
103+
# Accuracy, floating-point and error handling control are packed in
104+
# the VML mode variable. Macros below are useful to extract accuracy and/or
105+
# floating-point control and/or error handling control settings.
106+
#
107+
# VML_ACCURACY_MASK - extract accuracy bits
108+
# VML_FPUMODE_MASK - extract floating-point control bits
109+
# VML_ERRMODE_MASK - extract error handling control bits
110+
# (including error callback bits)
111+
# VML_ERRMODE_STDHANDLER_MASK - extract error handling control bits
112+
# (not including error callback bits)
113+
# VML_ERRMODE_CALLBACK_MASK - extract error callback bits
114+
# VML_NUM_THREADS_OMP_MASK - extract OpenMP(R) number of threads mode bits
115+
# VML_FTZDAZ_MASK - extract FTZ & DAZ bits
116+
# VML_TRAP_EXCEPTIONS_MASK - extract exception trap bits
117+
const VML_ACCURACY_MASK = 0x0000000F
118+
const VML_FPUMODE_MASK = 0x000000F0
119+
const VML_ERRMODE_MASK = 0x0000FF00
120+
const VML_ERRMODE_STDHANDLER_MASK = 0x00002F00
121+
const VML_ERRMODE_CALLBACK_MASK = 0x00001000
122+
const VML_NUM_THREADS_OMP_MASK = 0x00030000
123+
const VML_FTZDAZ_MASK = 0x003C0000
124+
const VML_TRAP_EXCEPTIONS_MASK = 0x0F000000
125+
126+
# https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/vector-mathematical-functions/vm-service-functions.html
14127
vml_get_mode() = ccall((:vmlGetMode, MKL_jll.libmkl_rt), Cuint, ())
15128
vml_set_mode(mode::Integer) = (ccall((:vmlSetMode, MKL_jll.libmkl_rt), Cuint, (UInt,), mode); nothing)
16129

17-
vml_set_accuracy(m::VMLAccuracy) = vml_set_mode((vml_get_mode() & ~0x03) | m.mode)
18-
vml_get_accuracy() = VMLAccuracy(vml_get_mode() & 0x3)
130+
"""
131+
vml_set_accuracy(m::VMLAccuracy)
132+
133+
Set the current accuracy mode. See [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
134+
"""
135+
vml_set_accuracy(m::VMLAccuracy) = vml_set_mode((vml_get_mode() & ~VML_ACCURACY_MASK) | m.mode)
136+
"""
137+
vml_get_accuracy() :: VMLAccuracy
138+
139+
Get the current accuracy mode. See [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
140+
"""
141+
vml_get_accuracy() = VMLAccuracy(vml_get_mode() & VML_ACCURACY_MASK)
142+
143+
"""
144+
vml_set_denormalmode(m::VMLFastDenormal)
145+
146+
Set the current mode of denormal handling. See [`VML_DENORMAL_FAST`](@ref), [`VML_DENORMAL_ACCURATE`](@ref).
147+
"""
148+
vml_set_denormalmode(m::VMLFastDenormal) = vml_set_mode((vml_get_mode() & ~VML_FTZDAZ_MASK) | m.mode)
149+
"""
150+
vml_get_denormalmode() :: VMLFastDenormal
151+
152+
Get the current mode of denormal handling. See [`VML_DENORMAL_FAST`](@ref), [`VML_DENORMAL_ACCURATE`](@ref).
153+
"""
154+
vml_get_denormalmode() = VMLFastDenormal(vml_get_mode() & VML_FTZDAZ_MASK)
155+
156+
# Ignored with MKL 2022 on i7-5930k, was useful once upon a time.
157+
vml_set_fpumode(m::VMLFpuMode) = vml_set_mode((vml_get_mode() & ~VML_FPUMODE_MASK) | m.mode)
158+
vml_get_fpumode() = VMLFpuMode(vml_get_mode() & VML_FPUMODE_MASK)
159+
160+
# -----------------------------------------------------------------------------------------------
161+
162+
# https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/support-functions/threading-control.html
163+
#
164+
# See: mkl\include\mkl_service.h
165+
# _Mkl_Api(int,MKL_Domain_Set_Num_Threads,(int nth, int MKL_DOMAIN))
166+
# _Mkl_Api(int,MKL_Domain_Get_Max_Threads,(int MKL_DOMAIN))
167+
# #define mkl_domain_set_num_threads MKL_Domain_Set_Num_Threads
168+
# #define mkl_domain_get_max_threads MKL_Domain_Get_Max_Threads
169+
#
170+
# See: mkl\include\mkl_types.h
171+
# define MKL_DOMAIN_ALL 0
172+
# define MKL_DOMAIN_BLAS 1
173+
# define MKL_DOMAIN_FFT 2
174+
const MKL_DOMAIN_VML = 0x3
175+
# define MKL_DOMAIN_PARDISO 4
176+
177+
"""
178+
vml_get_max_threads() :: Int
179+
180+
Maximum number of threads that VML may use. By default, or after a call to `vml_set_num_threads(0)`,
181+
should return the number of cores available to VML.
182+
"""
183+
vml_get_max_threads() = Int(ccall((:MKL_Domain_Get_Max_Threads, MKL_jll.libmkl_rt), Cint, (Cint,), MKL_DOMAIN_VML))
184+
"""
185+
vml_set_num_threads(numthreads::Int) :: Bool
186+
187+
Set the maximum number of threads that VML may use. Use `numthreads=0` to restore the default.
188+
Return `true` if the operation completed successfully.
189+
"""
190+
function vml_set_num_threads(numthreads::Int)
    # mkl_service.h declares MKL_Domain_Set_Num_Threads as returning a signed `int`,
    # so the result is read as Cint rather than Cuint.
    status = ccall((:MKL_Domain_Set_Num_Threads, MKL_jll.libmkl_rt), Cint, (Cint, Cint), numthreads, MKL_DOMAIN_VML)
    # Compare against zero instead of converting with Bool(): Bool(x) throws
    # InexactError for any value other than 0 or 1.
    return status != 0
end
191+
192+
# See: mkl\include\mkl_service.h
193+
# _Mkl_Api(double,MKL_Get_Cpu_Frequency,(void)) /* Gets CPU frequency in GHz */
194+
# _Mkl_Api(double,MKL_Get_Max_Cpu_Frequency,(void)) /* Gets max CPU frequency in GHz */
195+
# #define mkl_get_cpu_frequency MKL_Get_Cpu_Frequency
196+
# #define mkl_get_max_cpu_frequency MKL_Get_Max_Cpu_Frequency
197+
#
198+
# _Mkl_Api(void,MKL_Get_Cpu_Clocks,(unsigned MKL_INT64 *)) /* Gets CPU clocks */
199+
# _Mkl_Api(double,MKL_Get_Clocks_Frequency,(void)) /* Gets clocks frequency in GHz */
200+
# #define mkl_get_cpu_clocks MKL_Get_Cpu_Clocks
201+
# #define mkl_get_clocks_frequency MKL_Get_Clocks_Frequency
202+
203+
"""
204+
vml_get_cpu_frequency() :: Float64
205+
206+
Current CPU frequency in GHz, may be less or more than [`vml_get_max_cpu_frequency`](@ref).
207+
"""
208+
vml_get_cpu_frequency() = ccall((:MKL_Get_Cpu_Frequency, MKL_jll.libmkl_rt), Cdouble, ())
209+
"""
210+
vml_get_max_cpu_frequency() :: Float64
211+
212+
Official CPU frequency in GHz, as per package specification. See also [`vml_get_cpu_frequency`](@ref).
213+
"""
214+
vml_get_max_cpu_frequency() = ccall((:MKL_Get_Max_Cpu_Frequency, MKL_jll.libmkl_rt), Cdouble, ())
215+
216+
# -----------------------------------------------------------------------------------------------
217+
218+
# mkl\include\mkl_vml_defines.h
219+
# ERROR STATUS MACROS
220+
# VML_STATUS_OK - no errors
221+
# VML_STATUS_BADSIZE - array dimension is not positive
222+
# VML_STATUS_BADMEM - invalid pointer passed
223+
# VML_STATUS_ERRDOM - at least one of arguments is out of function domain
224+
# VML_STATUS_SING - at least one of arguments caused singularity
225+
# VML_STATUS_OVERFLOW - at least one of arguments caused overflow
226+
# VML_STATUS_UNDERFLOW - at least one of arguments caused underflow
227+
# VML_STATUS_ACCURACYWARNING - function doesn't support set accuracy mode,
228+
# lower accuracy mode was used instead
229+
const VML_STATUS_OK = 0
230+
const VML_STATUS_BADSIZE = -1
231+
const VML_STATUS_BADMEM = -2
232+
const VML_STATUS_ERRDOM = 1
233+
const VML_STATUS_SING = 2
234+
const VML_STATUS_OVERFLOW = 3
235+
const VML_STATUS_UNDERFLOW = 4
236+
const VML_STATUS_ACCURACYWARNING = 1000
19237

20238
function vml_check_error()
21239
vml_error = ccall((:vmlClearErrStatus, MKL_jll.libmkl_rt), Cint, ())
22-
if vml_error != 0
23-
if vml_error == 1
240+
if vml_error != VML_STATUS_OK
241+
if vml_error == VML_STATUS_ERRDOM
24242
throw(DomainError(-1, "This function does not support arguments outside its domain"))
25-
elseif vml_error == 2 || vml_error == 3 || vml_error == 4
243+
elseif vml_error == VML_STATUS_SING || vml_error == VML_STATUS_OVERFLOW || vml_error == VML_STATUS_UNDERFLOW
26244
# Singularity, overflow, or underflow
27245
# I don't think Base throws on these
28-
elseif vml_error == 1000
246+
elseif vml_error == VML_STATUS_ACCURACYWARNING
29247
# `warn` was removed in Julia 1.0; use the `@warn` logging macro. The accuracy getter must be
# called (note the parentheses) so the message shows the current mode, not the function object.
@warn "IntelVectorMath does not support $(vml_get_accuracy()); lower accuracy used instead"
30-
else
248+
else # VML_STATUS_BADSIZE or VML_STATUS_BADMEM
31249
error("an unexpected error occurred in IntelVectorMath ($vml_error)")
32250
end
33251
end

test/real.jl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,19 @@ end
5858
vml_set_accuracy(VML_EP)
5959
Test.@test vml_get_accuracy() == VML_EP
6060

61+
# Setting denormal
62+
vml_set_denormalmode(VML_DENORMAL_FAST)
63+
Test.@test vml_get_denormalmode() == VML_DENORMAL_FAST
64+
65+
vml_set_denormalmode(VML_DENORMAL_ACCURATE)
66+
Test.@test vml_get_denormalmode() == VML_DENORMAL_ACCURATE
67+
68+
# Setting number of threads (should have at least 1)
69+
Test.@test vml_set_num_threads(1)
70+
Test.@test !vml_set_num_threads(-1)
71+
Test.@test vml_get_max_threads() == 1
72+
73+
Test.@test vml_set_num_threads(0)
74+
Test.@test vml_get_max_threads() >= 1
75+
6176
end

0 commit comments

Comments
 (0)