Skip to content

Commit 10a16c7

Browse files
committed
Better support for macOS, added Apple M1 Pro benchmarks
1 parent 5e25397 commit 10a16c7

File tree

4 files changed

+43
-34
lines changed

4 files changed

+43
-34
lines changed

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ In consequence, the arithmetic intensity of this implementation is 2.13 (FP32/FP
198198
| Nvidia GeForce GTX 960M | 1.51 | 4 | 80 | 442 (84%) | 872 (84%) | 627 (60%) |
199199
| Nvidia Quadro K2000 | 0.73 | 2 | 64 | 312 (75%) | 444 (53%) | 171 (21%) |
200200
| Nvidia GeForce GT 630 (OEM) | 0.46 | 2 | 29 | 151 (81%) | 185 (50%) | 78 (21%) |
201+
| Apple M1 Pro 16-Core 16GB | 4.10 | 11 | 200 | 1204 (92%) | 2329 (90%) | 1855 (71%) |
201202
| AMD Radeon Vega 8 Graphics | 1.23 | 7 | 38 | 157 (63%) | 282 (57%) | 288 (58%) |
202203
| Intel UHD Graphics 630 | 0.46 | 7 | 51 | 151 (45%) | 301 (45%) | 187 (28%) |
203204
| Intel HD Graphics 5500 | 0.35 | 3 | 26 | 75 (45%) | 192 (58%) | 108 (32%) |
@@ -293,9 +294,9 @@ In consequence, the arithmetic intensity of this implementation is 2.13 (FP32/FP
293294

294295
- <details><summary>I am a scientist/teacher with a paid position at a public institution. Can I use FluidX3D for my research/teaching?</summary><br>Yes, you can use FluidX3D free of charge. This is considered research/education, not commercial use. To give credit, the <a href="https://github.com/ProjectPhysX/FluidX3D#references">references</a> listed below should be cited. If you publish data/results generated by altered source versions, the altered source code must be published as well.<br><br></details>
295296

296-
- <details><summary>I work for a company/startup in CFD/consulting/R&D or related fields. Can I use FluidX3D for work?</summary><br>No. Such commercial use is not allowed with the current license.<br><br></details>
297+
- <details><summary>I work at a company in CFD/consulting/R&D or related fields. Can I use FluidX3D commercially?</summary><br>No. Commercial use is not allowed with the current license.<br><br></details>
297298

298-
- <details><summary>Is FluidX3D open-source?</summary><br>No. "Open-source" as a technical term is defined as freely available without any restriction on use, but I am not comfortable with that. I have written FluidX3D in my spare time and no one should milk it for profits while I remain uncompensated, especially considering what other CFD software sells for. The technical term for the type of license I choose is "source-available". The source code is freely available, and you are free to use, to alter and to redistribute it, as long as you do not sell it or make a profit from derived products/services, and as long as you do not use it for any military purposes (see the <a href="https://github.com/ProjectPhysX/FluidX3D/blob/master/LICENSE.md">license</a> for details).<br><br></details>
299+
- <details><summary>Is FluidX3D open-source?</summary><br>No. "Open-source" as a technical term is defined as freely available without any restriction on use, but I am not comfortable with that. I have written FluidX3D in my spare time and no one should milk it for profits while I remain uncompensated, especially considering what other CFD software sells for. The technical term for the type of license I chose is "source-available no-cost non-commercial". The source code is freely available, and you are free to use, to alter and to redistribute it, as long as you do not sell it or make a profit from derived products/services, and as long as you do not use it for any military purposes (see the <a href="https://github.com/ProjectPhysX/FluidX3D/blob/master/LICENSE.md">license</a> for details).<br><br></details>
299300

300301
- <details><summary>Will FluidX3D at some point be available with a commercial license?</summary><br>Maybe I will add the option for a second, commercial license later on. If you are interested in commercial use, let me know. For non-commercial use in science and education, FluidX3D is and will always be free.<br><br></details>
301302

make.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@
22
mkdir -p bin # create directory for executable
33
rm -f ./bin/FluidX3D.exe # prevent execution of old version if compiling fails
44
g++ ./src/*.cpp -o ./bin/FluidX3D.exe -std=c++17 -pthread -I./src/OpenCL/include -L./src/OpenCL/lib -lOpenCL # compile on Linux
5+
#g++ ./src/*.cpp -o ./bin/FluidX3D.exe -std=c++17 -pthread -I./src/OpenCL/include -framework OpenCL # compile on macOS
56
#g++ ./src/*.cpp -o ./bin/FluidX3D.exe -std=c++17 -pthread -I./src/OpenCL/include -L/system/vendor/lib64 -lOpenCL # compile on Android
67
./bin/FluidX3D.exe $1 # run FluidX3D

src/opencl.hpp

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,9 @@ struct Device_Info {
5757
const float nvidia = (float)(contains(to_lower(vendor), "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere) or 64 cores/CU (P100, Volta, Turing, A100)
5858
const float amd = (float)(contains_any(to_lower(vendor), {"amd", "advanced"}))*(is_gpu?(amd_128_cores_per_dualcu?128.0f:64.0f):0.5f); // AMD GPUs have 64 cores/CU (GCN, CDNA) or 128 cores/dualCU (RDNA, RDNA2), AMD CPUs (with SMT) have 1/2 core/CU
5959
const float intel = (float)(contains(to_lower(vendor), "intel"))*(is_gpu?8.0f:0.5f); // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs (with HT) have 1/2 core/CU
60+
const float apple = (float)(contains(to_lower(vendor), "apple"))*(128.0f); // Apple ARM GPUs usually have 128 cores/CU
6061
const float arm = (float)(contains(to_lower(vendor), "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU
61-
cores = to_uint((float)compute_units*(nvidia+amd+intel+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
62+
cores = to_uint((float)compute_units*(nvidia+amd+intel+apple+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
6263
tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
6364
}
6465
inline Device_Info() {}; // default constructor
@@ -77,7 +78,7 @@ inline void print_device_info(const Device_Info& d, const int id=-1) { // print
7778
println("| Buffer Limits | "+alignl(58, to_string(d.max_global_buffer)+" MB global, "+to_string(d.max_constant_buffer)+" KB constant")+" |");
7879
println("|----------------'------------------------------------------------------------|");
7980
}
80-
inline vector<Device_Info> get_devices() { // returns a vector of all available OpenCL devices
81+
inline vector<Device_Info> get_devices(const bool print_info=true) { // returns a vector of all available OpenCL devices
8182
vector<Device_Info> devices; // get all devices of all platforms
8283
vector<cl::Platform> cl_platforms; // get all platforms (drivers)
8384
cl::Platform::get(&cl_platforms);
@@ -91,14 +92,14 @@ inline vector<Device_Info> get_devices() { // returns a vector of all available
9192
if((uint)cl_platforms.size()==0u||(uint)devices.size()==0u) {
9293
print_error("There are no OpenCL devices available. Make sure that the OpenCL 1.2 Runtime for your device is installed. For GPUs it comes by default with the graphics driver, for CPUs it has to be installed separately.");
9394
}
94-
println("\r|----------------.------------------------------------------------------------|");
95-
for(uint i=0u; i<(uint)devices.size(); i++) {
96-
println("| Device ID "+alignr(4u, i)+" | "+alignl(58u, devices[i].name)+" |");
95+
if(print_info) {
96+
println("\r|----------------.------------------------------------------------------------|");
97+
for(uint i=0u; i<(uint)devices.size(); i++) println("| Device ID "+alignr(4u, i)+" | "+alignl(58u, devices[i].name)+" |");
98+
println("|----------------'------------------------------------------------------------|");
9799
}
98-
println("|----------------'------------------------------------------------------------|");
99100
return devices;
100101
}
101-
inline Device_Info select_device_with_most_flops(const vector<Device_Info>& devices=get_devices()) { // returns device with best floating-point performance
102+
inline Device_Info select_device_with_most_flops(const vector<Device_Info>& devices=get_devices(), const bool print_info=true) { // returns device with best floating-point performance
102103
float best_value = 0.0f;
103104
uint best_i = 0u;
104105
for(uint i=0u; i<(uint)devices.size(); i++) { // find device with highest (estimated) floating point performance
@@ -107,10 +108,10 @@ inline Device_Info select_device_with_most_flops(const vector<Device_Info>& devi
107108
best_i = i;
108109
}
109110
}
110-
print_device_info(devices[best_i], best_i);
111+
if(print_info) print_device_info(devices[best_i], best_i);
111112
return devices[best_i];
112113
}
113-
inline Device_Info select_device_with_most_memory(const vector<Device_Info>& devices=get_devices()) { // returns device with largest memory capacity
114+
inline Device_Info select_device_with_most_memory(const vector<Device_Info>& devices=get_devices(), const bool print_info=true) { // returns device with largest memory capacity
114115
uint best_value = 0u;
115116
uint best_i = 0u;
116117
for(uint i=0u; i<(uint)devices.size(); i++) { // find device with most memory
@@ -119,12 +120,12 @@ inline Device_Info select_device_with_most_memory(const vector<Device_Info>& dev
119120
best_i = i;
120121
}
121122
}
122-
print_device_info(devices[best_i], best_i);
123+
if(print_info) print_device_info(devices[best_i], best_i);
123124
return devices[best_i];
124125
}
125-
inline Device_Info select_device_with_id(const uint id, const vector<Device_Info>& devices=get_devices()) { // returns device with specified ID
126+
inline Device_Info select_device_with_id(const uint id, const vector<Device_Info>& devices=get_devices(), const bool print_info=true) { // returns device with specified ID
126127
if(id<(uint)devices.size()) {
127-
print_device_info(devices[id], id);
128+
if(print_info) print_device_info(devices[id], id);
128129
return devices[id];
129130
} else {
130131
print_error("Your selected Device ID ("+to_string(id)+") is wrong.");
@@ -471,6 +472,12 @@ template<typename T> class Memory {
471472
inline void enqueue_write_to_device() {
472473
write_to_device(false);
473474
}
475+
inline void enqueue_read_from_device(const ulong offset, const ulong length) {
476+
read_from_device(offset, length, false);
477+
}
478+
inline void enqueue_write_to_device(const ulong offset, const ulong length) {
479+
write_to_device(offset, length, false);
480+
}
474481
inline void finish_queue() {
475482
cl_queue.finish();
476483
}

src/utilities.hpp

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,9 @@ class Clock {
6060
typedef std::chrono::high_resolution_clock clock;
6161
std::chrono::time_point<clock> t;
6262
public:
63-
Clock() { start(); }
64-
void start() { t = clock::now(); }
65-
double stop() const { return std::chrono::duration_cast<std::chrono::duration<double>>(clock::now()-t).count(); }
63+
inline Clock() { start(); }
64+
inline void start() { t = clock::now(); }
65+
inline double stop() const { return std::chrono::duration_cast<std::chrono::duration<double>>(clock::now()-t).count(); }
6666
};
6767
inline void sleep(const double t) {
6868
if(t>0.0) std::this_thread::sleep_for(std::chrono::milliseconds((int)(1E3*t+0.5)));
@@ -3010,10 +3010,10 @@ inline uint hsv_to_rgb(const float3& hsv) {
30103010
#include <Windows.h> // for displaying colors and getting console size
30113011
#undef min
30123012
#undef max
3013-
#elif defined(__linux__)
3013+
#elif defined(__linux__)||defined(__APPLE__)
30143014
#include <sys/ioctl.h> // for getting console size
30153015
#include <unistd.h> // for getting path of executable
3016-
#else // Windows/Linux
3016+
#else // neither Windows nor Linux/macOS
30173017
#undef UTILITIES_CONSOLE_COLOR
30183018
#endif // Windows/Linux
30193019
#endif // UTILITIES_CONSOLE_COLOR
@@ -3042,7 +3042,7 @@ inline string get_exe_path() { // returns path where executable is located, ends
30423042
std::wstring ws(wc);
30433043
transform(ws.begin(), ws.end(), back_inserter(path), [](wchar_t c) { return (char)c; });
30443044
path = replace(path, "\\", "/");
3045-
#elif defined(__linux__)
3045+
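#else // Linux/macOS; NOTE(review): /proc/self/exe is Linux-specific, so on macOS the readlink below presumably fails and yields an empty path - confirm, and consider _NSGetExecutablePath there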
30463046
char c[260];
30473047
int length = (int)readlink("/proc/self/exe", c, 260);
30483048
path = string(c, length>0 ? length : 0);
@@ -3056,7 +3056,7 @@ inline void get_console_size(uint& width, uint& height) {
30563056
GetConsoleScreenBufferInfo(handle, &csbi);
30573057
width = (uint)(csbi.srWindow.Right-csbi.srWindow.Left+1); // (uint)(csbi.dwSize.X); gives size of screen buffer
30583058
height = (uint)(csbi.srWindow.Bottom-csbi.srWindow.Top+1); // (uint)(csbi.dwSize.Y); gives size of screen buffer
3059-
#elif defined(__linux__)
3059+
#else // Linux
30603060
struct winsize w;
30613061
ioctl(fileno(stdout), TIOCGWINSZ, &w);
30623062
width = (uint)(w.ws_col);
@@ -3070,7 +3070,7 @@ inline void get_console_font_size(uint& width, uint& height) {
30703070
GetCurrentConsoleFont(handle, false, &cfi);
30713071
width = (uint)(cfi.dwFontSize.X);
30723072
height = (uint)(cfi.dwFontSize.Y);
3073-
#elif defined(__linux__)
3073+
#else // Linux
30743074
//struct winsize w;
30753075
//ioctl(fileno(stdout), TIOCGWINSZ, &w);
30763076
width = 8u;//(uint)(w.ws_xpixel/w.ws_col);
@@ -3081,7 +3081,7 @@ inline void set_console_cursor(const uint x, const uint y) {
30813081
#if defined(_WIN32)
30823082
static const HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
30833083
SetConsoleCursorPosition(handle, {(short)x, (short)y});
3084-
#elif defined(__linux__)
3084+
#else // Linux
30853085
std::cout << "\033["+to_string(y+1u)+";"+to_string(x+1u)+"f";
30863086
#endif // Windows/Linux
30873087
}
@@ -3092,7 +3092,7 @@ inline void show_console_cursor(const bool show) {
30923092
GetConsoleCursorInfo(handle, &cci);
30933093
cci.bVisible = show; // show/hide cursor
30943094
SetConsoleCursorInfo(handle, &cci);
3095-
#elif defined(__linux__)
3095+
#else // Linux
30963096
std::cout << (show ? "\033[?25h" : "\033[?25l"); // show/hide cursor
30973097
#endif // Windows/Linux
30983098
}
@@ -3108,7 +3108,7 @@ inline void clear_console() {
31083108
FillConsoleOutputCharacter(handle, TEXT(' '), length, topLeft, &written); // flood-fill the console with spaces to clear it
31093109
FillConsoleOutputAttribute(handle, csbi.wAttributes, length, topLeft, &written); // reset attributes of every character to default, this clears all background colour formatting
31103110
SetConsoleCursorPosition(handle, topLeft); // move the cursor back to the top left for the next sequence of writes
3111-
#elif defined(__linux__)
3111+
#else // Linux
31123112
std::cout << "\033[2J";
31133113
#endif // Windows/Linux
31143114
}
@@ -3226,23 +3226,23 @@ inline void print_color(const int textcolor) {
32263226
#if defined(_WIN32)
32273227
static const HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
32283228
SetConsoleTextAttribute(handle, textcolor);
3229-
#elif defined(__linux__)
3229+
#else // Linux
32303230
std::cout << get_print_color(textcolor);
32313231
#endif // Windows/Linux
32323232
}
32333233
inline void print_color(const int textcolor, const int backgroundcolor) {
32343234
#if defined(_WIN32)
32353235
static const HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
32363236
SetConsoleTextAttribute(handle, backgroundcolor<<4|textcolor);
3237-
#elif defined(__linux__)
3237+
#else // Linux
32383238
std::cout << get_print_color(textcolor, backgroundcolor);
32393239
#endif // Windows/Linux
32403240
}
32413241
inline void print_color_reset() {
32423242
#if defined(_WIN32)
32433243
static const HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
32443244
SetConsoleTextAttribute(handle, 7); // reset color
3245-
#elif defined(__linux__)
3245+
#else // Linux
32463246
std::cout << "\033[0m"; // reset color
32473247
#endif // Windows/Linux
32483248
}
@@ -3299,7 +3299,7 @@ inline void print_image(const Image* image, const uint textwidth=0u, const uint
32993299
print(segment, ltc, lbc); // print last segment, then reset color
33003300
println();
33013301
}
3302-
#elif defined(__linux__)
3302+
#else // Linux
33033303
const string s = "\u2580"; // trick to double vertical resolution: use graphic character
33043304
string r = ""; // append color changes to a string, print string in the end (much faster)
33053305
for(uint y=0u; y<newheight-1u; y+=2u) {
@@ -3338,7 +3338,7 @@ inline void print_image_bw(const Image* image, const uint textwidth=0u, const ui
33383338
const string ww = string("")+(char)219; // trick to double vertical resolution: use graphic characters
33393339
const string bw = string("")+(char)220;
33403340
const string wb = string("")+(char)223;
3341-
#elif defined(__linux__)
3341+
#else // Linux
33423342
const string ww = "\u2588"; // trick to double vertical resolution: use graphic characters
33433343
const string bw = "\u2584";
33443344
const string wb = "\u2580";
@@ -3418,7 +3418,7 @@ inline void print_image_dither(const Image* image, const uint textwidth=0u, cons
34183418
print(segment, ltc, lbc); // print last segment, then reset color
34193419
println();
34203420
}
3421-
#elif defined(__linux__)
3421+
#else // Linux
34223422
const string s[3] = { "\u2591", "\u2592", "\u2593" };
34233423
string r = ""; // append color changes to a string, print string in the end (much faster)
34243424
for(uint y=0u; y<newheight; y++) {
@@ -3507,7 +3507,7 @@ inline void print_video_dither(const Image* image, const uint textwidth=0u, cons
35073507
ly = y;
35083508
}
35093509
print(segment, ltc, lbc); // print last segment, then reset color
3510-
#elif defined(__linux__)
3510+
#else // Linux
35113511
const string s[3] = { "\u2591", "\u2592", "\u2593" };
35123512
string r = ""; // append color changes to a string, print string in the end (much faster)
35133513
uint lx=max_uint, ly=max_uint;
@@ -3570,7 +3570,7 @@ inline Image* screenshot(Image* image=nullptr) {
35703570
inline void print_color_test() {
35713571
#ifdef _WIN32
35723572
const string s = string("")+(char)223; // trick to double vertical resolution: use graphic character
3573-
#elif defined(__linux__)
3573+
#else // Linux
35743574
const string s = "\u2580"; // trick to double vertical resolution: use graphic character
35753575
#endif // Windows/Linux
35763576
print(s, color_magenta , color_black );
@@ -3646,7 +3646,7 @@ inline int key_press() { // not working: F11 (-122, toggles fullscreen)
36463646
}
36473647
}
36483648
}
3649-
#elif defined(__linux__)
3649+
#else // Linux
36503650
#include <termios.h>
36513651
inline int key_press() { // not working: ¹ (251), num lock (-144), caps lock (-20), windows key (-91), context menu key (-93)
36523652
struct termios term;

0 commit comments

Comments
 (0)