Better support for macOS, added Apple M1 Pro benchmarks

ProjectPhysX · ProjectPhysX · commit 10a16c727ef8 · 2022-09-19T18:33:21.000+02:00
diff --git a/README.md b/README.md
@@ -198,6 +198,7 @@ In consequence, the arithmetic intensity of this implementation is 2.13 (FP32/FP
 | Nvidia GeForce GTX 960M       |               1.51 |           4 |           80 |              442 (84%) |               872 (84%) |               627 (60%) |
 | Nvidia Quadro K2000           |               0.73 |           2 |           64 |              312 (75%) |               444 (53%) |               171 (21%) |
 | Nvidia GeForce GT 630 (OEM)   |               0.46 |           2 |           29 |              151 (81%) |               185 (50%) |                78 (21%) |
+| Apple M1 Pro 16-Core 16GB     |               4.10 |          11 |          200 |             1204 (92%) |              2329 (90%) |              1855 (71%) |
 | AMD Radeon Vega 8 Graphics    |               1.23 |           7 |           38 |              157 (63%) |               282 (57%) |               288 (58%) |
 | Intel UHD Graphics 630        |               0.46 |           7 |           51 |              151 (45%) |               301 (45%) |               187 (28%) |
 | Intel HD Graphics 5500        |               0.35 |           3 |           26 |               75 (45%) |               192 (58%) |               108 (32%) |
@@ -293,9 +294,9 @@ In consequence, the arithmetic intensity of this implementation is 2.13 (FP32/FP
 
 - <details><summary>I am a scientist/teacher with a paid position at a public institution. Can I use FluidX3D for my research/teaching?</summary><br>Yes, you can use FluidX3D free of charge. This is considered research/education, not commercial use. To give credit, the <a href="https://github.com/ProjectPhysX/FluidX3D#references">references</a> listed below should be cited. If you publish data/results generated by altered source versions, the altered source code must be published as well.<br><br></details>
 
-- <details><summary>I work for a company/startup in CFD/consulting/R&D or related fields. Can I use FluidX3D for work?</summary><br>No. Such commercial use is not allowed with the current license.<br><br></details>
+- <details><summary>I work at a company in CFD/consulting/R&D or related fields. Can I use FluidX3D commercially?</summary><br>No. Commercial use is not allowed with the current license.<br><br></details>
 
-- <details><summary>Is FluidX3D open-source?</summary><br>No. "Open-source" as a technical term is defined as freely available without any restriction on use, but I am not comfortable with that. I have written FluidX3D in my spare time and no one should milk it for profits while I remain uncompensated, especially considering what other CFD software sells for. The technical term for the type of license I choose is "source-available". The source code is freely available, and you are free to use, to alter and to redistribute it, as long as you do not sell it or make a profit from derived products/services, and as long as you do not use it for any military purposes (see the <a href="https://github.com/ProjectPhysX/FluidX3D/blob/master/LICENSE.md">license</a> for details).<br><br></details>
+- <details><summary>Is FluidX3D open-source?</summary><br>No. "Open-source" as a technical term is defined as freely available without any restriction on use, but I am not comfortable with that. I have written FluidX3D in my spare time and no one should milk it for profits while I remain uncompensated, especially considering what other CFD software sells for. The technical term for the type of license I choose is "source-available no-cost non-commercial". The source code is freely available, and you are free to use, to alter and to redistribute it, as long as you do not sell it or make a profit from derived products/services, and as long as you do not use it for any military purposes (see the <a href="https://github.com/ProjectPhysX/FluidX3D/blob/master/LICENSE.md">license</a> for details).<br><br></details>
 
 - <details><summary>Will FluidX3D at some point be available with a commercial license?</summary><br>Maybe I will add the option for a second, commercial license later on. If you are interested in commercial use, let me know. For non-commercial use in science and education, FluidX3D is and will always be free.<br><br></details>
 
diff --git a/make.sh b/make.sh
@@ -2,5 +2,6 @@
 mkdir -p bin # create directory for executable
 rm -f ./bin/FluidX3D.exe # prevent execution of old version if compiling fails
 g++ ./src/*.cpp -o ./bin/FluidX3D.exe -std=c++17 -pthread -I./src/OpenCL/include -L./src/OpenCL/lib -lOpenCL # compile on Linux
+#g++ ./src/*.cpp -o ./bin/FluidX3D.exe -std=c++17 -pthread -I./src/OpenCL/include -framework OpenCL # compile on macOS
 #g++ ./src/*.cpp -o ./bin/FluidX3D.exe -std=c++17 -pthread -I./src/OpenCL/include -L/system/vendor/lib64 -lOpenCL # compile on Android
 ./bin/FluidX3D.exe $1 # run FluidX3D
diff --git a/src/opencl.hpp b/src/opencl.hpp
@@ -57,8 +57,9 @@ struct Device_Info {
 		const float nvidia = (float)(contains(to_lower(vendor), "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere) or 64 cores/CU (P100, Volta, Turing, A100)
 		const float amd = (float)(contains_any(to_lower(vendor), {"amd", "advanced"}))*(is_gpu?(amd_128_cores_per_dualcu?128.0f:64.0f):0.5f); // AMD GPUs have 64 cores/CU (GCN, CDNA) or 128 cores/dualCU (RDNA, RDNA2), AMD CPUs (with SMT) have 1/2 core/CU
 		const float intel = (float)(contains(to_lower(vendor), "intel"))*(is_gpu?8.0f:0.5f); // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs (with HT) have 1/2 core/CU
+		const float apple = (float)(contains(to_lower(vendor), "apple"))*(128.0f); // Apple ARM GPUs usually have 128 cores/CU
 		const float arm = (float)(contains(to_lower(vendor), "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU
-		cores = to_uint((float)compute_units*(nvidia+amd+intel+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
+		cores = to_uint((float)compute_units*(nvidia+amd+intel+apple+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 		tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
 	}
 	inline Device_Info() {}; // default constructor
@@ -77,7 +78,7 @@ inline void print_device_info(const Device_Info& d, const int id=-1) { // print
 	println("| Buffer Limits  | "+alignl(58, to_string(d.max_global_buffer)+" MB global, "+to_string(d.max_constant_buffer)+" KB constant")+" |");
 	println("|----------------'------------------------------------------------------------|");
 }
-inline vector<Device_Info> get_devices() { // returns a vector of all available OpenCL devices
+inline vector<Device_Info> get_devices(const bool print_info=true) { // returns a vector of all available OpenCL devices
 	vector<Device_Info> devices; // get all devices of all platforms
 	vector<cl::Platform> cl_platforms; // get all platforms (drivers)
 	cl::Platform::get(&cl_platforms);
@@ -91,14 +92,14 @@ inline vector<Device_Info> get_devices() { // returns a vector of all available
 	if((uint)cl_platforms.size()==0u||(uint)devices.size()==0u) {
 		print_error("There are no OpenCL devices available. Make sure that the OpenCL 1.2 Runtime for your device is installed. For GPUs it comes by default with the graphics driver, for CPUs it has to be installed separately.");
 	}
-	println("\r|----------------.------------------------------------------------------------|");
-	for(uint i=0u; i<(uint)devices.size(); i++) {
-		println("| Device ID "+alignr(4u, i)+" | "+alignl(58u, devices[i].name)+" |");
+	if(print_info) {
+		println("\r|----------------.------------------------------------------------------------|");
+		for(uint i=0u; i<(uint)devices.size(); i++) println("| Device ID "+alignr(4u, i)+" | "+alignl(58u, devices[i].name)+" |");
+		println("|----------------'------------------------------------------------------------|");
 	}
-	println("|----------------'------------------------------------------------------------|");
 	return devices;
 }
-inline Device_Info select_device_with_most_flops(const vector<Device_Info>& devices=get_devices()) { // returns device with best floating-point performance
+inline Device_Info select_device_with_most_flops(const vector<Device_Info>& devices=get_devices(), const bool print_info=true) { // returns device with best floating-point performance
 	float best_value = 0.0f;
 	uint best_i = 0u;
 	for(uint i=0u; i<(uint)devices.size(); i++) { // find device with highest (estimated) floating point performance
@@ -107,10 +108,10 @@ inline Device_Info select_device_with_most_flops(const vector<Device_Info>& devi
 			best_i = i;
 		}
 	}
-	print_device_info(devices[best_i], best_i);
+	if(print_info) print_device_info(devices[best_i], best_i);
 	return devices[best_i];
 }
-inline Device_Info select_device_with_most_memory(const vector<Device_Info>& devices=get_devices()) { // returns device with largest memory capacity
+inline Device_Info select_device_with_most_memory(const vector<Device_Info>& devices=get_devices(), const bool print_info=true) { // returns device with largest memory capacity
 	uint best_value = 0u;
 	uint best_i = 0u;
 	for(uint i=0u; i<(uint)devices.size(); i++) { // find device with most memory
@@ -119,12 +120,12 @@ inline Device_Info select_device_with_most_memory(const vector<Device_Info>& dev
 			best_i = i;
 		}
 	}
-	print_device_info(devices[best_i], best_i);
+	if(print_info) print_device_info(devices[best_i], best_i);
 	return devices[best_i];
 }
-inline Device_Info select_device_with_id(const uint id, const vector<Device_Info>& devices=get_devices()) { // returns device with specified ID
+inline Device_Info select_device_with_id(const uint id, const vector<Device_Info>& devices=get_devices(), const bool print_info=true) { // returns device with specified ID
 	if(id<(uint)devices.size()) {
-		print_device_info(devices[id], id);
+		if(print_info) print_device_info(devices[id], id);
 		return devices[id];
 	} else {
 		print_error("Your selected Device ID ("+to_string(id)+") is wrong.");
@@ -471,6 +472,12 @@ template<typename T> class Memory {
 	inline void enqueue_write_to_device() {
 		write_to_device(false);
 	}
+	inline void enqueue_read_from_device(const ulong offset, const ulong length) {
+		read_from_device(offset, length, false);
+	}
+	inline void enqueue_write_to_device(const ulong offset, const ulong length) {
+		write_to_device(offset, length, false);
+	}
 	inline void finish_queue() {
 		cl_queue.finish();
 	}
diff --git a/src/utilities.hpp b/src/utilities.hpp
@@ -60,9 +60,9 @@ class Clock {
 	typedef std::chrono::high_resolution_clock clock;
 	std::chrono::time_point<clock> t;
 public:
-	Clock() { start(); }
-	void start() { t = clock::now(); }
-	double stop() const { return std::chrono::duration_cast<std::chrono::duration<double>>(clock::now()-t).count(); }
+	inline Clock() { start(); }
+	inline void start() { t = clock::now(); }
+	inline double stop() const { return std::chrono::duration_cast<std::chrono::duration<double>>(clock::now()-t).count(); }
 };
 inline void sleep(const double t) {
 	if(t>0.0) std::this_thread::sleep_for(std::chrono::milliseconds((int)(1E3*t+0.5)));
@@ -3010,10 +3010,10 @@ inline uint hsv_to_rgb(const float3& hsv) {
 #include <Windows.h> // for displaying colors and getting console size
 #undef min
 #undef max
-#elif defined(__linux__)
+#elif defined(__linux__)||defined(__APPLE__)
 #include <sys/ioctl.h> // for getting console size
 #include <unistd.h> // for getting path of executable
-#else // Windows/Linux
+#else // Linux
 #undef UTILITIES_CONSOLE_COLOR
 #endif // Windows/Linux
 #endif // UTILITIES_CONSOLE_COLOR
@@ -3042,7 +3042,7 @@ inline string get_exe_path() { // returns path where executable is located, ends
 	std::wstring ws(wc);
 	transform(ws.begin(), ws.end(), back_inserter(path), [](wchar_t c) { return (char)c; });
 	path = replace(path, "\\", "/");
-#elif defined(__linux__)
+#else // Linux
 	char c[260];
 	int length = (int)readlink("/proc/self/exe", c, 260);
 	path = string(c, length>0 ? length : 0);
@@ -3056,7 +3056,7 @@ inline void get_console_size(uint& width, uint& height) {
 	GetConsoleScreenBufferInfo(handle, &csbi);
 	width = (uint)(csbi.srWindow.Right-csbi.srWindow.Left+1); // (uint)(csbi.dwSize.X); gives size of screen buffer
 	height = (uint)(csbi.srWindow.Bottom-csbi.srWindow.Top+1); // (uint)(csbi.dwSize.Y); gives size of screen buffer
-#elif defined(__linux__)
+#else // Linux
 	struct winsize w;
 	ioctl(fileno(stdout), TIOCGWINSZ, &w);
 	width = (uint)(w.ws_col);
@@ -3070,7 +3070,7 @@ inline void get_console_font_size(uint& width, uint& height) {
 	GetCurrentConsoleFont(handle, false, &cfi);
 	width = (uint)(cfi.dwFontSize.X);
 	height = (uint)(cfi.dwFontSize.Y);
-#elif defined(__linux__)
+#else // Linux
 	//struct winsize w;
 	//ioctl(fileno(stdout), TIOCGWINSZ, &w);
 	width = 8u;//(uint)(w.ws_xpixel/w.ws_col);
@@ -3081,7 +3081,7 @@ inline void set_console_cursor(const uint x, const uint y) {
 #if defined(_WIN32)
 	static const HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
 	SetConsoleCursorPosition(handle, {(short)x, (short)y});
-#elif defined(__linux__)
+#else // Linux
 	std::cout << "\033["+to_string(y+1u)+";"+to_string(x+1u)+"f";
 #endif // Windows/Linux
 }
@@ -3092,7 +3092,7 @@ inline void show_console_cursor(const bool show) {
 	GetConsoleCursorInfo(handle, &cci);
 	cci.bVisible = show; // show/hide cursor
 	SetConsoleCursorInfo(handle, &cci);
-#elif defined(__linux__)
+#else // Linux
 	std::cout << (show ? "\033[?25h" : "\033[?25l"); // show/hide cursor
 #endif // Windows/Linux
 }
@@ -3108,7 +3108,7 @@ inline void clear_console() {
 	FillConsoleOutputCharacter(handle, TEXT(' '), length, topLeft, &written); // flood-fill the console with spaces to clear it
 	FillConsoleOutputAttribute(handle, csbi.wAttributes, length, topLeft, &written); // reset attributes of every character to default, this clears all background colour formatting
 	SetConsoleCursorPosition(handle, topLeft); // move the cursor back to the top left for the next sequence of writes
-#elif defined(__linux__)
+#else // Linux
 	std::cout << "\033[2J";
 #endif // Windows/Linux
 }
@@ -3226,23 +3226,23 @@ inline void print_color(const int textcolor) {
 #if defined(_WIN32)
 	static const HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
 	SetConsoleTextAttribute(handle, textcolor);
-#elif defined(__linux__)
+#else // Linux
 	std::cout << get_print_color(textcolor);
 #endif // Windows/Linux
 }
 inline void print_color(const int textcolor, const int backgroundcolor) {
 #if defined(_WIN32)
 	static const HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
 	SetConsoleTextAttribute(handle, backgroundcolor<<4|textcolor);
-#elif defined(__linux__)
+#else // Linux
 	std::cout << get_print_color(textcolor, backgroundcolor);
 #endif // Windows/Linux
 }
 inline void print_color_reset() {
 #if defined(_WIN32)
 	static const HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
 	SetConsoleTextAttribute(handle, 7); // reset color
-#elif defined(__linux__)
+#else // Linux
 	std::cout << "\033[0m"; // reset color
 #endif // Windows/Linux
 }
@@ -3299,7 +3299,7 @@ inline void print_image(const Image* image, const uint textwidth=0u, const uint
 		print(segment, ltc, lbc); // print last segment, then reset color
 		println();
 	}
-#elif defined(__linux__)
+#else // Linux
 	const string s = "\u2580"; // trick to double vertical resolution: use graphic character
 	string r = ""; // append color changes to a string, print string in the end (much faster)
 	for(uint y=0u; y<newheight-1u; y+=2u) {
@@ -3338,7 +3338,7 @@ inline void print_image_bw(const Image* image, const uint textwidth=0u, const ui
 	const string ww = string("")+(char)219; // trick to double vertical resolution: use graphic characters
 	const string bw = string("")+(char)220;
 	const string wb = string("")+(char)223;
-#elif defined(__linux__)
+#else // Linux
 	const string ww = "\u2588"; // trick to double vertical resolution: use graphic characters
 	const string bw = "\u2584";
 	const string wb = "\u2580";
@@ -3418,7 +3418,7 @@ inline void print_image_dither(const Image* image, const uint textwidth=0u, cons
 		print(segment, ltc, lbc); // print last segment, then reset color
 		println();
 	}
-#elif defined(__linux__)
+#else // Linux
 	const string s[3] = { "\u2591", "\u2592", "\u2593" };
 	string r = ""; // append color changes to a string, print string in the end (much faster)
 	for(uint y=0u; y<newheight; y++) {
@@ -3507,7 +3507,7 @@ inline void print_video_dither(const Image* image, const uint textwidth=0u, cons
 			ly = y;
 		}
 		print(segment, ltc, lbc); // print last segment, then reset color
-#elif defined(__linux__)
+#else // Linux
 		const string s[3] = { "\u2591", "\u2592", "\u2593" };
 		string r = ""; // append color changes to a string, print string in the end (much faster)
 		uint lx=max_uint, ly=max_uint;
@@ -3570,7 +3570,7 @@ inline Image* screenshot(Image* image=nullptr) {
 inline void print_color_test() {
 #ifdef _WIN32
 	const string s = string("")+(char)223; // trick to double vertical resolution: use graphic character
-#elif defined(__linux__)
+#else // Linux
 	const string s = "\u2580"; // trick to double vertical resolution: use graphic character
 #endif // Windows/Linux
 	print(s, color_magenta   , color_black     );
@@ -3646,7 +3646,7 @@ inline int key_press() { // not working: F11 (-122, toggles fullscreen)
 		}
 	}
 }
-#elif defined(__linux__)
+#else // Linux
 #include <termios.h>
 inline int key_press() { // not working: � (251), num lock (-144), caps lock (-20), windows key (-91), kontext menu key (-93)
 	struct termios term;