diff --git a/.gitignore b/.gitignore index 51f2395..26ffa26 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ node_modules/ .DS_Store build/ .docusaurus - +target/ diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock new file mode 100644 index 0000000..af27d11 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock @@ -0,0 +1,2189 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "ahash" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "ar" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d67af77d68a931ecd5cbd8a3b5987d63a1d1d1278f7f6a60ae33db485cdebb69" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "ash" +version = "0.37.3+1.3.251" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e9c3835d686b0a6084ab4234fcd1b07dbf6e4767dce60874b12356a25ecd4a" +dependencies = [ + "libloading 0.7.4", +] + +[[package]] +name = "ash" +version = "0.38.0+1.3.281" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bb44936d800fea8f016d7f2311c6a4f97aebd5dc86f09906139ec848cf3a46f" +dependencies = [ + "libloading 0.8.5", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "benches" +version = "0.1.0" +dependencies = [ + "criterion", + "futures", + "matmul", + "rand", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "block" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a" + +[[package]] +name = "blog" +version = "0.1.0" +dependencies = [ + "futures", + "matmul", + "settings", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "bytemuck" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cfg_aliases" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" + +[[package]] +name = "codespan-reporting" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", + "unicode-width", +] + +[[package]] +name = "compiled_isomorphic" +version = "0.1.0" +dependencies = [ + "spirv-builder", +] + +[[package]] +name = "compiled_naive" +version = "0.1.0" +dependencies = [ + "spirv-builder", +] + +[[package]] +name = "compiled_tiling_1d" +version = "0.1.0" +dependencies = [ + "spirv-builder", +] + +[[package]] +name = "compiled_tiling_2d_simd" +version = "0.1.0" +dependencies = [ + "spirv-builder", +] + +[[package]] +name = "compiled_workgroup_256" +version = "0.1.0" +dependencies = [ + "spirv-builder", +] + +[[package]] +name = "compiled_workgroup_2d" +version = "0.1.0" +dependencies = [ + "spirv-builder", +] + +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "core-graphics-types" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45390e6114f68f718cc7a830514a96f903cccd70d02a8f6d9f643ac4ba45afaf" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "libc", +] + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "derive_more" +version = "0.99.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.87", +] + +[[package]] +name = "document-features" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb6969eaabd2421f8a2775cfd2471a2b634372b4a25d41e3bd647b79912850a0" +dependencies = [ + "litrs", +] + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "elsa" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98e71ae4df57d214182a2e5cb90230c0192c6ddfcaa05c36453d46a54713e10" +dependencies = [ + "indexmap 2.6.0", + "stable_deref_trait", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "foreign-types" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" +dependencies = [ + "foreign-types-macros", + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-macros" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "foreign-types-shared" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b" + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gl_generator" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a95dfc23a2b4a9a2f5ab41d194f8bfda3cabec42af4e39f08c339eb2a0c124d" +dependencies = [ + "khronos_api", + "log", + "xml-rs", +] + +[[package]] +name = "glam" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc46dd3ec48fdd8e693a98d2b8bafae273a2d54c1de02a2a7e3d57d501f39677" +dependencies = [ + "bytemuck", + "libm", +] + +[[package]] +name = "glow" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d51fa363f025f5c111e03f13eda21162faeacb6911fe8caa0c0349f9cf0c4483" +dependencies = [ + "js-sys", + "slotmap", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "glutin_wgl_sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a4e1951bbd9434a81aa496fe59ccc2235af3820d27b85f9314e279609211e2c" +dependencies = [ + "gl_generator", +] + +[[package]] +name = "gpu-alloc" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbcd2dba93594b227a1f57ee09b8b9da8892c34d55aa332e034a228d0fe6a171" +dependencies = [ + "bitflags 2.6.0", + "gpu-alloc-types", +] + +[[package]] +name = "gpu-alloc-types" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98ff03b468aa837d70984d55f5d3f846f6ec31fe34bbb97c4f85219caeee1ca4" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "gpu-allocator" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c151a2a5ef800297b4e79efa4f4bec035c5f51d5ae587287c9b952bdf734cacd" +dependencies = [ + "log", + "presser", + "thiserror", + "windows", +] + +[[package]] +name = "gpu-descriptor" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c08c1f623a8d0b722b8b99f821eb0ba672a1618f0d3b16ddbee1cedd2dd8557" +dependencies = [ + "bitflags 2.6.0", + "gpu-descriptor-types", + "hashbrown 0.14.5", +] + +[[package]] +name = "gpu-descriptor-types" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdf242682df893b86f33a73828fb09ca4b2d3bb6cc95249707fc684d27484b91" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +dependencies = [ + "ahash 0.7.8", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.11", + "allocator-api2", +] + +[[package]] +name = "hashbrown" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" + +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + +[[package]] +name = "hexf-parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfa686283ad6dd069f105e5ab091b04c62850d3e4cf5d67debad1933f55023df" + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown 0.15.1", +] + +[[package]] +name = "internal-iterator" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "969ee3fc68ec2e88eb21434ce4d9b7e1600d1ce92ff974560a6c4a304f5124b9" + +[[package]] +name = "is-terminal" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "isomorphic" +version = "0.1.0" +dependencies = [ + "glam", + "settings", + "spirv-std", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "khronos-egl" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6aae1df220ece3c0ada96b8153459b67eebe9ae9212258bb0134ae60416fdf76" +dependencies = [ + "libc", + "libloading 0.8.5", + "pkg-config", +] + +[[package]] +name = "khronos_api" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2db585e1d738fc771bf08a151420d3ed193d9d895a36df7f6f8a9456b911ddc" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.164" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "libloading" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" +dependencies = [ + "cfg-if", + "windows-targets", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "litrs" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ce301924b7887e9d637144fdade93f9dfff9b60981d4ac161db09720d39aa5" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "longest-increasing-subsequence" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3bd0dd2cd90571056fdb71f6275fada10131182f84899f4b2a916e565d81d86" + +[[package]] +name = "malloc_buf" +version = "0.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62bb907fe88d54d8d9ce32a3cceab4218ed2f6b7d35617cafe9adf84e43919cb" +dependencies = [ + "libc", +] + +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + +[[package]] +name = "matmul" +version = "0.1.0" +dependencies = [ + "ash 0.37.3+1.3.251", + "bytemuck", + "compiled_isomorphic", + "compiled_naive", + "compiled_tiling_1d", + "compiled_tiling_2d_simd", + "compiled_workgroup_256", + "compiled_workgroup_2d", + "futures", + "glam", + "isomorphic", + "rayon", + "settings", + "tracing", + "wgpu", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "metal" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ecfd3296f8c56b7c1f6fbac3c71cefa9d78ce009850c45000015f206dc7fa21" +dependencies = [ + "bitflags 2.6.0", + "block", + "core-graphics-types", + "foreign-types", + "log", + "objc", + "paste", +] + +[[package]] +name = "naga" +version = "23.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d5941e45a15b53aad4375eedf02033adb7a28931eedc31117faffa52e6a857e" +dependencies = [ + "arrayvec", + "bit-set", + "bitflags 2.6.0", + "cfg_aliases", + "codespan-reporting", + "hexf-parse", + "indexmap 2.6.0", + "log", + "petgraph", + "rustc-hash", + "spirv", + "termcolor", + "thiserror", + "unicode-xid", +] + +[[package]] +name = "naive" +version = "0.1.0" +dependencies = [ + "settings", + "spirv-std", +] + +[[package]] +name = "ndk-sys" +version = "0.5.0+25.2.9519653" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c196769dd60fd4f363e11d948139556a344e79d451aeb2fa2fd040738ef7691" +dependencies = [ + "jni-sys", +] + +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "objc" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1" +dependencies = [ + "malloc_buf", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap 2.6.0", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "presser" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8cf8e6a8aa66ce33f63993ffc4ea4271eb5b0530a9002db8455ea6050c77bfa" + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "profiling" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afbdc74edc00b6f6a218ca6a5364d6226a259d4b8ea1af4a0ea063f27e179f4d" + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "range-alloc" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8a99fddc9f0ba0a85884b8d14e3592853e787d581ca1816c91349b10e4eeab" + +[[package]] +name = "raw-string" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0501e134c6905fee1f10fed25b0a7e1261bf676cffac9543a7d0730dec01af2" + +[[package]] +name = "raw-window-handle" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20675572f6f24e9e76ef639bc5552774ed45f1c30e2951e1e99c59888861c539" + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "renderdoc-sys" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b30a45b0cd0bcca8037f3d0dc3421eaf95327a17cad11964fb8179b4fc4832" + +[[package]] +name = "rspirv" +version = "0.12.0+sdk-1.3.268.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cf3a93856b6e5946537278df0d3075596371b1950ccff012f02b0f7eafec8d" +dependencies = [ + "rustc-hash", + "spirv", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_codegen_spirv" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "ar", + "either", + "hashbrown 0.11.2", + "indexmap 1.9.3", + "itertools", + "lazy_static", + "libc", + "num-traits", + "once_cell", + "regex", + "rspirv", + "rustc-demangle", + "rustc_codegen_spirv-types", + "sanitize-filename", + "smallvec", + "spirt", + "spirv-tools", + "syn 1.0.109", +] + +[[package]] +name = "rustc_codegen_spirv-types" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "rspirv", + "serde", + "serde_json", +] + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "sanitize-filename" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c502bdb638f1396509467cb0580ef3b29aa2a45c5d43e5d84928241280296c" +dependencies = [ + "lazy_static", + "regex", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + +[[package]] +name = "serde" +version = "1.0.215" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.215" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "serde_json" +version = "1.0.133" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "settings" +version = "0.1.0" +dependencies = [ + "bytemuck", + "glam", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "slotmap" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbff4acf519f630b3a3ddcfaea6c06b42174d9a44bc70c620e9ed1649d58b82a" +dependencies = [ + "version_check", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +dependencies = [ + "serde", +] + +[[package]] +name = "spirt" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2d5968bd2a36466468aac637b355776f080edfb0c6f769b2b99b9708260c42a" +dependencies = [ + "arrayvec", + "bytemuck", + "derive_more", + "elsa", + "indexmap 2.6.0", + "internal-iterator", + "itertools", + "lazy_static", + "longest-increasing-subsequence", + "rustc-hash", + "serde", + "serde_json", + "smallvec", +] + +[[package]] +name = "spirv" +version = "0.3.0+sdk-1.3.268.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eda41003dc44290527a59b13432d4a0379379fa074b70174882adfbdfd917844" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "spirv-builder" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "memchr", + "raw-string", + "rustc_codegen_spirv", + "rustc_codegen_spirv-types", + "serde", + "serde_json", +] + +[[package]] +name = "spirv-std" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "bitflags 1.3.2", + "glam", + "num-traits", + "spirv-std-macros", + "spirv-std-types", +] + +[[package]] +name = "spirv-std-macros" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "proc-macro2", + "quote", + "spirv-std-types", + "syn 1.0.109", +] + +[[package]] +name = "spirv-std-types" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" + +[[package]] +name = "spirv-tools" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcb3b0832881834994b7ec82b709ec5491043ceb4bf8101e27da6b5234b24261" +dependencies = [ + "spirv-tools-sys", +] + +[[package]] +name = "spirv-tools-sys" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e68b55a97aa6856e010a6f2477425875a97873e147bb0232160e73c45bdae7" +dependencies = [ + "cc", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "tiling_1d" +version = "0.1.0" +dependencies = [ + "settings", + "spirv-std", +] + +[[package]] +name = "tiling_2d_simd" +version = "0.1.0" +dependencies = [ + "settings", + "spirv-std", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.87", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" + +[[package]] +name = "web-sys" +version = "0.3.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wgpu" +version = "23.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ab52f2d3d18b70d5ab8dd270a1cff3ebe6dbe4a7d13c1cc2557138a9777fdc" +dependencies = [ + "arrayvec", + "cfg_aliases", + "document-features", + "js-sys", + "log", + "naga", + "parking_lot", + "profiling", + "raw-window-handle", + "smallvec", + "static_assertions", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "wgpu-core", + "wgpu-hal", + "wgpu-types", +] + +[[package]] +name = "wgpu-core" +version = "23.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e0c68e7b6322a03ee5b83fcd92caeac5c2a932f6457818179f4652ad2a9c065" +dependencies = [ + "arrayvec", + "bit-vec", + "bitflags 2.6.0", + "bytemuck", + "cfg_aliases", + "document-features", + "indexmap 2.6.0", + "log", + "naga", + "once_cell", + "parking_lot", + "profiling", + "raw-window-handle", + "rustc-hash", + "smallvec", + "thiserror", + "wgpu-hal", + "wgpu-types", +] + +[[package]] +name = "wgpu-hal" +version = "23.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de6e7266b869de56c7e3ed72a954899f71d14fec6cc81c102b7530b92947601b" +dependencies = [ + "android_system_properties", + "arrayvec", + "ash 0.38.0+1.3.281", + "bit-set", + "bitflags 2.6.0", + "block", + "bytemuck", + "cfg_aliases", + "core-graphics-types", + "glow", + "glutin_wgl_sys", + "gpu-alloc", + "gpu-allocator", + "gpu-descriptor", + "js-sys", + "khronos-egl", + "libc", + "libloading 0.8.5", + "log", + "metal", + "naga", + "ndk-sys", + "objc", + "once_cell", + "parking_lot", + "profiling", + "range-alloc", + "raw-window-handle", + "renderdoc-sys", + "rustc-hash", + "smallvec", + "thiserror", + "wasm-bindgen", + "web-sys", + "wgpu-types", + "windows", + "windows-core", +] + +[[package]] +name = "wgpu-types" +version = "23.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "610f6ff27778148c31093f3b03abc4840f9636d58d597ca2f5977433acfe0068" +dependencies = [ + "bitflags 2.6.0", + "js-sys", + "web-sys", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd04d41d93c4992d421894c18c8b43496aa748dd4c081bac0dc93eb0489272b6" +dependencies = [ + "windows-core", + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba6d44ec8c2591c134257ce647b7ea6b20335bf6379a27dac5f1641fcf59f99" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-strings", + "windows-targets", +] + +[[package]] +name = "windows-implement" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "windows-interface" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "workgroup_256" +version = "0.1.0" +dependencies = [ + "settings", + "spirv-std", +] + +[[package]] +name = "workgroup_2d" +version = "0.1.0" +dependencies = [ + "settings", + "spirv-std", +] + +[[package]] +name = "xml-rs" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af310deaae937e48a26602b730250b4949e125f468f11e6990be3e5304ddd96f" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml new file mode 100644 index 0000000..115f79b --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml @@ -0,0 +1,66 @@ +[workspace] +members = [ + # + # ---- The rust code that runs on the GPU. ---- + # Often called "shaders" (in graphics) or "kernels" (in compute). + # + "crates/gpu/naive", + "crates/gpu/workgroup_256", + "crates/gpu/workgroup_2d", + "crates/gpu/tiling_1d", + "crates/gpu/tiling_2d_simd", + # + # ---- The rust code that runs both on the GPU and the CPU. ---- + # It "knows" what platform it is being compiled for and can conditionally change + # logic and dependencies using standard rust idioms. + # + # 1) Shared constants and settins used by both the CPU and GPU. + "crates/shared/settings", + # 2) An example of a program that can run unmodified on both the CPU and the GPU. + "crates/shared/isomorphic", + # + # ---- The rust code that runs on the CPU. ---- + # + # 1) The CPU library that contains the matrix multiplication implementation. It + # loads the compiled GPU program, sends it to the GPU, pushes data to the GPU, + # tells the GPU to execute, then reads the results back. + "crates/cpu/matmul", + # 2) The compiled GPU program that the CPU loads and sends to the GPU to execute. + "crates/cpu/compiled_for_gpu/naive", + "crates/cpu/compiled_for_gpu/workgroup_256", + "crates/cpu/compiled_for_gpu/workgroup_2d", + "crates/cpu/compiled_for_gpu/tiling_1d", + "crates/cpu/compiled_for_gpu/tiling_2d_simd", + "crates/cpu/compiled_for_gpu/isomorphic", + # 3) A binary that runs on the CPU. It configures the `matmul` library on the CPU + # and then tells it to run the matrix multiplication. + "bin/blog", + # 4) A binary that runs on the CPU. It compiles other libraries that run on the CPU + # and benchmarks them. + "benches" +] +resolver = "2" + +[workspace.dependencies] +spirv-std = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" } +futures = "0.3" +glam = { version = "0.29.2", features = ["cuda", "bytemuck"] } +tracing = "0.1.40" + +# Enable incremental by default in release mode. +[profile.release] +incremental = true +# This is the default but without explicitly specifying it, Cargo +# will treat the identical settings in `[profile.release.build-override]` below +# as different sets of `rustc` flags and will not reuse artifacts between them. +codegen-units = 256 + +# Compile build-dependencies in release mode with the same settings as regular +# dependencies (including the incremental enabled above). +# +# We need this to templorarily work around +# https://github.com/Rust-GPU/rust-gpu/issues/29. +[profile.release.build-override] +opt-level = 3 +incremental = true +codegen-units = 256 diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/README.md b/blog/2024-11-21-optimizing-matrix-mul/code/README.md new file mode 100644 index 0000000..1ac1da3 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/README.md @@ -0,0 +1,21 @@ +The Rust code that accompanies the blog post. + + + +You will find: + +1. A binary (`blog`) that you can run with `cargo run` +2. Benchmarks that you can run with `cargo bench` +3. GPU shaders/kernels written in Rust +4. CPU code that takes the shaders and runs it on the GPU (via `wgpu`) or the CPU with a + simulated harness +5. Some tests that you can run with `cargo test` + +A good place to start to get the lay of the land is the workspace's `Cargo.toml` in this +directory. + +**Any changes to these files should ensure that the blog post is still correct as it +uses line numbers to embed code snippets.** + +Note: Everything needs to be run with `--release` to work around +https://github.com/Rust-GPU/rust-gpu/issues/29. diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/benches/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/benches/Cargo.toml new file mode 100644 index 0000000..23dd9e7 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/benches/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "benches" +version = "0.1.0" +edition = "2021" + +[dependencies] +matmul = { path = "../crates/cpu/matmul" } +criterion = { version = "0.5.1", features = ["html_reports"] } +rand = "0.8" +futures.workspace = true + +[[bench]] +name = "gpu" +harness = false +path = "gpu_bench.rs" + +[[bench]] +name = "isomorphic" +harness = false +path = "isomorphic_bench.rs" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs b/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs new file mode 100644 index 0000000..31f1998 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs @@ -0,0 +1,171 @@ +use criterion::{ + black_box, criterion_group, criterion_main, BenchmarkId, Criterion, SamplingMode, Throughput, +}; +use matmul::MatrixMultiply; +use rand::Rng; +use std::time::Duration; + +const WARMUP_TIME: Duration = Duration::from_secs(2); +const SAMPLE_SIZE: usize = 10; + +/// Matrix sizes to benchmark +const SIZES: &[(u32, u32, u32)] = &[ + // Square matrices + (2, 2, 2), + (4, 4, 4), + (16, 16, 16), + (64, 64, 64), + (128, 128, 128), + (256, 256, 256), + (512, 512, 512), + (1024, 1024, 1024), + (2048, 2048, 2048), + (4096, 4096, 4096), + // Non-square matrices + (4, 2, 8), // A: 4x2, B: 2x8, Result: 4x8 + (8, 4, 2), // A: 8x4, B: 4x2, Result: 8x2 + (16, 8, 32), // A: 16x8, B: 8x32, Result: 16x32 + (32, 16, 8), // A: 32x16, B: 16x8, Result: 32x8 + (64, 32, 128), // A: 64x32, B: 32x128, Result: 64x128 + (1024, 512, 2048), // A: 1024x512, B: 512x2048, Result: 1024x2048 + (2048, 1024, 4096), // A: 2048x1024, B: 1024x4096, Result: 2048x4096 +]; + +fn bench_all_variants(c: &mut Criterion) { + // Initialize all variants outside the loop + let multiplier_naive = matmul::naive::wgpu(); + let multiplier_workgroup_256 = matmul::workgroup_256::wgpu(); + let multiplier_workgroup_2d = matmul::workgroup_2d::wgpu(); + let multiplier_tiling_1d = matmul::tiling_1d::wgpu(); + let multiplier_tiling_2d_simd = matmul::tiling_2d_simd::wgpu(); + let multiplier_isomorphic_gpu = matmul::isomorphic::wgpu(); + + for &(m, k, n) in SIZES { + // Calculate FLOPs for this size + let flops = 2.0 * (m as f64 * n as f64 * k as f64); + let mut group = c.benchmark_group(format!("matmul{}x{}x{}", m, k, n)); + group.sampling_mode(SamplingMode::Auto); + group.warm_up_time(WARMUP_TIME); + group.sample_size(SAMPLE_SIZE); + group.throughput(Throughput::Elements(flops as u64)); + + // Create matrices for the given size + let (a, b) = create_test_matrices(m, k, n); + + // Benchmark each variant within the same size group + + if m <= 128 && n <= 128 { + group.bench_with_input( + BenchmarkId::new("naive:wgpu", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_naive.multiply(black_box(&a), black_box(&b), m, k, n)) + }); + }, + ); + } + + group.bench_with_input( + BenchmarkId::new("workgroup_256:wgpu", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_workgroup_256.multiply( + black_box(&a), + black_box(&b), + m, + k, + n, + )) + }); + }, + ); + + group.bench_with_input( + BenchmarkId::new("workgroup_2d:wgpu", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_workgroup_2d.multiply( + black_box(&a), + black_box(&b), + m, + k, + n, + )) + }); + }, + ); + + group.bench_with_input( + BenchmarkId::new("tiling_1d:wgpu", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_tiling_1d.multiply(black_box(&a), black_box(&b), m, k, n)) + }); + }, + ); + + group.bench_with_input( + BenchmarkId::new("tiling_2d_simd:wgpu", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_tiling_2d_simd.multiply( + black_box(&a), + black_box(&b), + m, + k, + n, + )) + }); + }, + ); + + group.bench_with_input( + BenchmarkId::new("isomorphic:wgpu", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_isomorphic_gpu.multiply( + black_box(&a), + black_box(&b), + m, + k, + n, + )) + }); + }, + ); + } +} + +criterion_group! { + name = gpu; + config = Criterion::default() + .with_plots() + .significance_level(0.01) + .noise_threshold(0.02); + targets = bench_all_variants +} + +criterion_main!(gpu); + +pub fn validate_dimensions(a_dims: (u32, u32), b_dims: (u32, u32)) -> bool { + a_dims.1 == b_dims.0 +} + +fn generate_random_matrix(rows: u32, cols: u32) -> Vec { + let mut rng = rand::thread_rng(); + (0..rows * cols).map(|_| rng.gen::()).collect() +} + +fn create_test_matrices(m: u32, k: u32, n: u32) -> (Vec, Vec) { + assert!( + validate_dimensions((m, k), (k, n)), + "Invalid matrix dimensions" + ); + (generate_random_matrix(m, k), generate_random_matrix(k, n)) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/benches/isomorphic_bench.rs b/blog/2024-11-21-optimizing-matrix-mul/code/benches/isomorphic_bench.rs new file mode 100644 index 0000000..66740ed --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/benches/isomorphic_bench.rs @@ -0,0 +1,133 @@ +use criterion::{ + black_box, criterion_group, criterion_main, BenchmarkId, Criterion, SamplingMode, Throughput, +}; +use matmul::MatrixMultiply; +use rand::Rng; +use std::time::Duration; + +const WARMUP_TIME: Duration = Duration::from_secs(2); +const MEASUREMENT_TIME: Duration = Duration::from_secs(5 * 60); +const SAMPLE_SIZE: usize = 10; + +/// Matrix sizes to benchmark +const SIZES: &[(u32, u32, u32)] = &[ + // Square matrices + (2, 2, 2), + (4, 4, 4), + (16, 16, 16), + (64, 64, 64), + (128, 128, 128), + (256, 256, 256), + (512, 512, 512), + (1024, 1024, 1024), + (2048, 2048, 2048), + (4096, 4096, 4096), + // Non-square matrices + (4, 2, 8), // A: 4x2, B: 2x8, Result: 4x8 + (8, 4, 2), // A: 8x4, B: 4x2, Result: 8x2 + (16, 8, 32), // A: 16x8, B: 8x32, Result: 16x32 + (32, 16, 8), // A: 32x16, B: 16x8, Result: 32x8 + (64, 32, 128), // A: 64x32, B: 32x128, Result: 64x128 + (1024, 512, 2048), // A: 1024x512, B: 512x2048, Result: 1024x2048 + (2048, 1024, 4096), // A: 2048x1024, B: 1024x4096, Result: 2048x4096 +]; + +fn bench_isomorphic_variants(c: &mut Criterion) { + // Initialize isomorphic variants + let multiplier_isomorphic_gpu = matmul::isomorphic::wgpu(); + let multiplier_isomorphic_cpu_single = matmul::isomorphic::cpu::single_threaded(); + let multiplier_isomorphic_cpu_multi = matmul::isomorphic::cpu::multi_threaded(); + + for &(m, k, n) in SIZES { + // Calculate FLOPs for this size + let flops = 2.0 * (m as f64 * n as f64 * k as f64); + + let mut group = c.benchmark_group(format!("isomorphic_matmul{}x{}x{}", m, k, n)); + group.sampling_mode(SamplingMode::Flat); + group.warm_up_time(WARMUP_TIME); + //group.measurement_time(MEASUREMENT_TIME); + group.sample_size(SAMPLE_SIZE); + group.throughput(Throughput::Elements(flops as u64)); + + // Create matrices for the given size + let (a, b) = create_test_matrices(m, k, n); + + group.bench_with_input( + BenchmarkId::new("isomorphic:wgpu", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_isomorphic_gpu.multiply( + black_box(&a), + black_box(&b), + m, + k, + n, + )) + }); + }, + ); + + group.bench_with_input( + BenchmarkId::new("isomorphic:cpu:single", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_isomorphic_cpu_single.multiply( + black_box(&a), + black_box(&b), + m, + k, + n, + )) + }); + }, + ); + + group.bench_with_input( + BenchmarkId::new("isomorphic:cpu:multi", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_isomorphic_cpu_multi.multiply( + black_box(&a), + black_box(&b), + m, + k, + n, + )) + }); + }, + ); + + group.finish(); + } +} + +criterion_group! { + name = isomorphic; + config = Criterion::default() + .with_plots() + .significance_level(0.01) + .noise_threshold(0.02); + targets = bench_isomorphic_variants +} + +criterion_main!(isomorphic); + +pub fn validate_dimensions(a_dims: (u32, u32), b_dims: (u32, u32)) -> bool { + a_dims.1 == b_dims.0 +} + +fn generate_random_matrix(rows: u32, cols: u32) -> Vec { + let mut rng = rand::thread_rng(); + (0..rows * cols).map(|_| rng.gen::()).collect() +} + +fn create_test_matrices(m: u32, k: u32, n: u32) -> (Vec, Vec) { + assert!( + validate_dimensions((m, k), (k, n)), + "Invalid matrix dimensions" + ); + (generate_random_matrix(m, k), generate_random_matrix(k, n)) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/Cargo.toml new file mode 100644 index 0000000..0d72391 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "blog" +version = "0.1.0" +edition = "2021" + +[[bin]] +name = "blog" +path = "src/bin.rs" + +[dependencies] +matmul = { path = "../../crates/cpu/matmul" } +settings = { path = "../../crates/shared/settings" } +futures.workspace = true +tracing.workspace = true +tracing-subscriber = { version = "0.3.18", features = ["env-filter", "std"] } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs new file mode 100644 index 0000000..57850d0 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs @@ -0,0 +1,107 @@ +use matmul::MatrixMultiply; +use std::fmt::Display; +use std::time::Instant; +use tracing::{debug, info, instrument, span, trace, Level}; +use tracing_subscriber::{fmt, prelude::*, EnvFilter}; + +fn main() { + tracing_subscriber::registry() + .with(fmt::Layer::default()) + .with(EnvFilter::from_default_env()) + .init(); + + let sizes = [ + // Square matrices + (2, 2, 2), + (4, 4, 4), + (8, 8, 8), + (16, 16, 16), + (32, 32, 32), + (64, 64, 64), + (128, 128, 128), + // Non-square matrices + (4, 2, 8), // A: 4x2, B: 2x8, Result: 4x8 + (8, 4, 2), // A: 8x4, B: 4x2, Result: 8x2 + (16, 8, 32), // A: 16x8, B: 8x32, Result: 16x32 + (32, 16, 8), // A: 32x16, B: 16x8, Result: 32x8 + (64, 32, 128), // A: 64x32, B: 32x128, Result: 64x128 + ]; + + run_tests(matmul::naive::wgpu(), &sizes); + run_tests(matmul::workgroup_256::wgpu(), &sizes); + run_tests(matmul::workgroup_2d::wgpu(), &sizes); + //run_tests(matmul::tiling_1d::wgpu(), &sizes); + run_tests(matmul::tiling_2d_simd::wgpu(), &sizes); + + run_tests(matmul::isomorphic::wgpu(), &sizes); + run_tests(matmul::isomorphic::cpu::single_threaded(), &sizes); + run_tests(matmul::isomorphic::cpu::multi_threaded(), &sizes); +} + +#[instrument(skip(multiplier, sizes), fields(algorithm = %multiplier))] +fn run_tests>(multiplier: U, sizes: &[(u32, u32, u32)]) { + debug!(algorithm = %multiplier, "Starting tests"); + + for &(m, k, n) in sizes { + let span = tracing::span!(Level::INFO, "matrix_test", algorithm = %multiplier, m, k, n); + let _enter = span.enter(); + + info!("Testing size: {}x{}x{}", m, k, n); + + // Setup phase + let setup_span = span!(Level::INFO, "setup_phase"); + let _setup_enter = setup_span.enter(); + let a: Vec = (0..m * k).map(|i| i as f32).collect(); + let b: Vec = (0..k * n).map(|i| i as f32).collect(); + drop(_setup_enter); + + // Compute phase + let compute_span = span!(Level::INFO, "compute_phase"); + let compute_start = Instant::now(); + let _compute_enter = compute_span.enter(); + let result = multiplier.multiply(&a, &b, m, k, n); + let compute_time = compute_start.elapsed(); + drop(_compute_enter); + + // Calculate GFLOPS + let gflop_span = span!(Level::INFO, "calculate_gflops"); + let _gflop_enter = gflop_span.enter(); + let ops = 2.0 * (m * n * k) as f64; + let flops = ops / compute_time.as_secs_f64() / 1e9; + info!("Flops: {}", flops); + drop(_gflop_enter); + + // Verification phase + let verify_span = span!(Level::INFO, "verification_phase"); + let _verify_enter = verify_span.enter(); + verify_results(&a, &b, &result, m, k, n); + drop(_verify_enter); + } +} + +#[instrument(skip(a, b, result), fields(rows = m, cols = n))] +fn verify_results(a: &[f32], b: &[f32], result: &[f32], m: u32, k: u32, n: u32) { + let verify_rows = std::cmp::min(m, 2); + let verify_cols = std::cmp::min(n, 2); + + for i in 0..verify_rows { + for j in 0..verify_cols { + let mut expected = 0.0; + for x in 0..k { + expected += a[(i * k + x) as usize] * b[(x * n + j) as usize]; + } + let actual = result[(i * n + j) as usize]; + let diff = (actual - expected).abs(); + assert!( + diff < 1e-3, + "Mismatch at [{}, {}]: expected {}, got {}", + i, + j, + expected, + actual + ); + } + } + + trace!("Verification passed"); +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/Cargo.toml new file mode 100644 index 0000000..1eb2ceb --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "compiled_isomorphic" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["lib", "cdylib"] + +[build-dependencies] +spirv-builder = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/build.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/build.rs new file mode 100644 index 0000000..4d016e8 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/build.rs @@ -0,0 +1,37 @@ +use spirv_builder::{MetadataPrintout, SpirvBuilder}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +fn main() -> Result<(), Box> { + let gpu_crate_path = Path::new("../../../gpu/tiling_2d_simd"); + + // Compile the shader crate with SpirvBuilder. + let result = SpirvBuilder::new(gpu_crate_path, "spirv-unknown-vulkan1.2") + .print_metadata(MetadataPrintout::Full) + .build()?; + + // Get the compiled shader as a PathBuf and read its binary content. + let shader_path = result.module.unwrap_single(); + let shader_binary = fs::read(&shader_path)?; + + // Generate Rust code with a constant holding the shader binary content. + let shader_binary_literal = shader_binary + .iter() + .map(|byte| format!("0x{:02X}", byte)) + .collect::>() + .join(", "); + let generated_code = format!( + "/// Compiled SPIR-V shader binary\n\ + pub const SHADER_BINARY: &[u8] = &[{}];", + shader_binary_literal + ); + + // Write this generated code to `OUT_DIR` as `shader_binary.rs`. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + let shader_binary_rs = out_dir.join("shader_binary.rs"); + fs::write(&shader_binary_rs, generated_code)?; + + println!("Generated shader binary constant at {:?}", shader_binary_rs); + Ok(()) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/src/lib.rs new file mode 100644 index 0000000..c4ca963 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/isomorphic/src/lib.rs @@ -0,0 +1,4 @@ +// Including the raw bytes generated shader binary in our rust code. This "bloats" the +// binary, but it also means you don't have to worry about the shader file being +// misplaced or deleted. +include!(concat!(env!("OUT_DIR"), "/shader_binary.rs")); diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/Cargo.toml new file mode 100644 index 0000000..2620079 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "compiled_naive" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["lib", "cdylib"] + +[build-dependencies] +spirv-builder = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/build.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/build.rs new file mode 100644 index 0000000..9350a1f --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/build.rs @@ -0,0 +1,37 @@ +use spirv_builder::{MetadataPrintout, SpirvBuilder}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +fn main() -> Result<(), Box> { + let gpu_crate_path = Path::new("../../../gpu/naive"); + + // Compile the shader crate with SpirvBuilder. + let result = SpirvBuilder::new(gpu_crate_path, "spirv-unknown-vulkan1.2") + .print_metadata(MetadataPrintout::Full) + .build()?; + + // Get the compiled shader as a PathBuf and read its binary content. + let shader_path = result.module.unwrap_single(); + let shader_binary = fs::read(&shader_path)?; + + // Generate Rust code with a constant holding the shader binary content. + let shader_binary_literal = shader_binary + .iter() + .map(|byte| format!("0x{:02X}", byte)) + .collect::>() + .join(", "); + let generated_code = format!( + "/// Compiled SPIR-V shader binary\n\ + pub const SHADER_BINARY: &[u8] = &[{}];", + shader_binary_literal + ); + + // Write this generated code to `OUT_DIR` as `shader_binary.rs`. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + let shader_binary_rs = out_dir.join("shader_binary.rs"); + fs::write(&shader_binary_rs, generated_code)?; + + println!("Generated shader binary constant at {:?}", shader_binary_rs); + Ok(()) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/src/lib.rs new file mode 100644 index 0000000..c4ca963 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/naive/src/lib.rs @@ -0,0 +1,4 @@ +// Including the raw bytes generated shader binary in our rust code. This "bloats" the +// binary, but it also means you don't have to worry about the shader file being +// misplaced or deleted. +include!(concat!(env!("OUT_DIR"), "/shader_binary.rs")); diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/Cargo.toml new file mode 100644 index 0000000..b8af28f --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "compiled_tiling_1d" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["lib", "cdylib"] + +[build-dependencies] +spirv-builder = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/build.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/build.rs new file mode 100644 index 0000000..6912537 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/build.rs @@ -0,0 +1,37 @@ +use spirv_builder::{MetadataPrintout, SpirvBuilder}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +fn main() -> Result<(), Box> { + let gpu_crate_path = Path::new("../../../gpu/tiling_1d"); + + // Compile the shader crate with SpirvBuilder. + let result = SpirvBuilder::new(gpu_crate_path, "spirv-unknown-vulkan1.2") + .print_metadata(MetadataPrintout::Full) + .build()?; + + // Get the compiled shader as a PathBuf and read its binary content. + let shader_path = result.module.unwrap_single(); + let shader_binary = fs::read(&shader_path)?; + + // Generate Rust code with a constant holding the shader binary content. + let shader_binary_literal = shader_binary + .iter() + .map(|byte| format!("0x{:02X}", byte)) + .collect::>() + .join(", "); + let generated_code = format!( + "/// Compiled SPIR-V shader binary\n\ + pub const SHADER_BINARY: &[u8] = &[{}];", + shader_binary_literal + ); + + // Write this generated code to `OUT_DIR` as `shader_binary.rs`. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + let shader_binary_rs = out_dir.join("shader_binary.rs"); + fs::write(&shader_binary_rs, generated_code)?; + + println!("Generated shader binary constant at {:?}", shader_binary_rs); + Ok(()) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/src/lib.rs new file mode 100644 index 0000000..c4ca963 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d/src/lib.rs @@ -0,0 +1,4 @@ +// Including the raw bytes generated shader binary in our rust code. This "bloats" the +// binary, but it also means you don't have to worry about the shader file being +// misplaced or deleted. +include!(concat!(env!("OUT_DIR"), "/shader_binary.rs")); diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/Cargo.toml new file mode 100644 index 0000000..12809ac --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "compiled_tiling_2d_simd" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["lib", "cdylib"] + +[build-dependencies] +spirv-builder = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/build.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/build.rs new file mode 100644 index 0000000..4d016e8 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/build.rs @@ -0,0 +1,37 @@ +use spirv_builder::{MetadataPrintout, SpirvBuilder}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +fn main() -> Result<(), Box> { + let gpu_crate_path = Path::new("../../../gpu/tiling_2d_simd"); + + // Compile the shader crate with SpirvBuilder. + let result = SpirvBuilder::new(gpu_crate_path, "spirv-unknown-vulkan1.2") + .print_metadata(MetadataPrintout::Full) + .build()?; + + // Get the compiled shader as a PathBuf and read its binary content. + let shader_path = result.module.unwrap_single(); + let shader_binary = fs::read(&shader_path)?; + + // Generate Rust code with a constant holding the shader binary content. + let shader_binary_literal = shader_binary + .iter() + .map(|byte| format!("0x{:02X}", byte)) + .collect::>() + .join(", "); + let generated_code = format!( + "/// Compiled SPIR-V shader binary\n\ + pub const SHADER_BINARY: &[u8] = &[{}];", + shader_binary_literal + ); + + // Write this generated code to `OUT_DIR` as `shader_binary.rs`. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + let shader_binary_rs = out_dir.join("shader_binary.rs"); + fs::write(&shader_binary_rs, generated_code)?; + + println!("Generated shader binary constant at {:?}", shader_binary_rs); + Ok(()) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/src/lib.rs new file mode 100644 index 0000000..c4ca963 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_2d_simd/src/lib.rs @@ -0,0 +1,4 @@ +// Including the raw bytes generated shader binary in our rust code. This "bloats" the +// binary, but it also means you don't have to worry about the shader file being +// misplaced or deleted. +include!(concat!(env!("OUT_DIR"), "/shader_binary.rs")); diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/Cargo.toml new file mode 100644 index 0000000..4354037 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "compiled_workgroup_256" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["lib", "cdylib"] + +[build-dependencies] +spirv-builder = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/build.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/build.rs new file mode 100644 index 0000000..ed09395 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/build.rs @@ -0,0 +1,37 @@ +use spirv_builder::{MetadataPrintout, SpirvBuilder}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +fn main() -> Result<(), Box> { + let gpu_crate_path = Path::new("../../../gpu/workgroup_256"); + + // Compile the shader crate with SpirvBuilder. + let result = SpirvBuilder::new(gpu_crate_path, "spirv-unknown-vulkan1.2") + .print_metadata(MetadataPrintout::Full) + .build()?; + + // Get the compiled shader as a PathBuf and read its binary content. + let shader_path = result.module.unwrap_single(); + let shader_binary = fs::read(&shader_path)?; + + // Generate Rust code with a constant holding the shader binary content. + let shader_binary_literal = shader_binary + .iter() + .map(|byte| format!("0x{:02X}", byte)) + .collect::>() + .join(", "); + let generated_code = format!( + "/// Compiled SPIR-V shader binary\n\ + pub const SHADER_BINARY: &[u8] = &[{}];", + shader_binary_literal + ); + + // Write this generated code to `OUT_DIR` as `shader_binary.rs`. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + let shader_binary_rs = out_dir.join("shader_binary.rs"); + fs::write(&shader_binary_rs, generated_code)?; + + println!("Generated shader binary constant at {:?}", shader_binary_rs); + Ok(()) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/src/lib.rs new file mode 100644 index 0000000..c4ca963 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_256/src/lib.rs @@ -0,0 +1,4 @@ +// Including the raw bytes generated shader binary in our rust code. This "bloats" the +// binary, but it also means you don't have to worry about the shader file being +// misplaced or deleted. +include!(concat!(env!("OUT_DIR"), "/shader_binary.rs")); diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/Cargo.toml new file mode 100644 index 0000000..4e44998 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "compiled_workgroup_2d" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["lib", "cdylib"] + +[build-dependencies] +spirv-builder = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/build.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/build.rs new file mode 100644 index 0000000..ea582b7 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/build.rs @@ -0,0 +1,37 @@ +use spirv_builder::{MetadataPrintout, SpirvBuilder}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +fn main() -> Result<(), Box> { + let gpu_crate_path = Path::new("../../../gpu/workgroup_2d"); + + // Compile the shader crate with SpirvBuilder. + let result = SpirvBuilder::new(gpu_crate_path, "spirv-unknown-vulkan1.2") + .print_metadata(MetadataPrintout::Full) + .build()?; + + // Get the compiled shader as a PathBuf and read its binary content. + let shader_path = result.module.unwrap_single(); + let shader_binary = fs::read(&shader_path)?; + + // Generate Rust code with a constant holding the shader binary content. + let shader_binary_literal = shader_binary + .iter() + .map(|byte| format!("0x{:02X}", byte)) + .collect::>() + .join(", "); + let generated_code = format!( + "/// Compiled SPIR-V shader binary\n\ + pub const SHADER_BINARY: &[u8] = &[{}];", + shader_binary_literal + ); + + // Write this generated code to `OUT_DIR` as `shader_binary.rs`. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + let shader_binary_rs = out_dir.join("shader_binary.rs"); + fs::write(&shader_binary_rs, generated_code)?; + + println!("Generated shader binary constant at {:?}", shader_binary_rs); + Ok(()) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/src/lib.rs new file mode 100644 index 0000000..c4ca963 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/workgroup_2d/src/lib.rs @@ -0,0 +1,4 @@ +// Including the raw bytes generated shader binary in our rust code. This "bloats" the +// binary, but it also means you don't have to worry about the shader file being +// misplaced or deleted. +include!(concat!(env!("OUT_DIR"), "/shader_binary.rs")); diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml new file mode 100644 index 0000000..be36071 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "matmul" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["lib", "cdylib"] + +[dependencies] +settings = { path = "../../shared/settings" } +bytemuck = { version = "1.9", features = ["derive"] } +wgpu = { version = "23.0", features = ["spirv"] } +ash = { version = "0.37" } +rayon = "1.10" +futures.workspace = true +tracing.workspace = true +glam.workspace = true + +# The following dependencies are used to link to the compiled shaders. +compiled_naive = { path = "../compiled_for_gpu/naive" } +compiled_workgroup_256 = { path = "../compiled_for_gpu/workgroup_256" } +compiled_workgroup_2d = { path = "../compiled_for_gpu/workgroup_2d" } +compiled_tiling_1d = { path = "../compiled_for_gpu/tiling_1d" } +compiled_tiling_2d_simd = { path = "../compiled_for_gpu/tiling_2d_simd" } +compiled_isomorphic = { path = "../compiled_for_gpu/isomorphic" } +# The CPU side of the isomophic implementation. +isomorphic = { path = "../../shared/isomorphic" } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/cpu.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/cpu.rs new file mode 100644 index 0000000..80a2e23 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/cpu.rs @@ -0,0 +1,192 @@ +use crate::{Cpu, GridComputation, MatrixMultiply}; +use glam::UVec3; +use rayon::prelude::*; +use settings::Dimensions; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use std::future::Future; +use std::sync::Mutex; + +/// Run matrix multiplication on the CPU with a single thread. +pub struct SingleThreadedMatMul { + variant: T, +} + +impl Display for SingleThreadedMatMul { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{} (cpu, single)", self.variant) + } +} + +impl MatrixMultiply for SingleThreadedMatMul +where + T: Cpu + GridComputation + Display + Send + Sync, +{ + fn new(variant: T) -> impl Future + Send { + async move { SingleThreadedMatMul { variant } } + } + + fn multiply(&self, a: &[f32], b: &[f32], m: u32, k: u32, n: u32) -> Vec { + // Initialize the result vector with zeros as that is what the GPU does. + let mut result = vec![0.0; (m * n) as usize]; + + // Retrieve workgroup and dispatch configurations. These tell us how to iterate. + let workgroup = ::workgroup(&self.variant); + let dispatch = ::dispatch_count(&self.variant, m, n); + + // Define dimensions as (m, k, n) + let dimensions = Dimensions::new(m, k, n); + + // Iterate over the dispatch grid + for gwx in 0..dispatch.x { + for gwy in 0..dispatch.y { + for wx in 0..workgroup.x { + for wy in 0..workgroup.y { + // Calculate global indices + let x = gwx * workgroup.x + wx; + let y = gwy * workgroup.y + wy; + + if x < m && y < n { + // Define global id + let global_id = UVec3::new(x, y, 1); + + // Perform the matmul operation for element (x, y). NOTE: + // This is the EXACT SAME CODE THAT RUNS ON THE GPU, RUNNING + // ON THE CPU. This is the power of rust-gpu. + ::call( + &self.variant, + global_id, + &dimensions, + &a, + &b, + &mut result, + ); + } + } + } + } + } + + result + } +} + +/// Run matrix multiplication on the CPU with multiple threads. +pub struct MultiThreadedMatMul { + variant: T, +} + +impl Display for MultiThreadedMatMul { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{} (cpu, multi)", self.variant) + } +} + +impl MatrixMultiply for MultiThreadedMatMul +where + T: Cpu + GridComputation + Display + Send + Sync, +{ + fn new(variant: T) -> impl Future + Send { + async move { MultiThreadedMatMul { variant } } + } + + fn multiply(&self, a: &[f32], b: &[f32], m: u32, k: u32, n: u32) -> Vec { + // Initialize the result vector with zeros + let result = vec![0.0; (m * n) as usize]; + let result = Mutex::new(result); + + // Retrieve workgroup and dispatch configurations. These tell us how to iterate. + let workgroup = ::workgroup(&self.variant); + let dispatch = ::dispatch_count(&self.variant, m, n); + + // Define dimensions as (m, k, n) + let dimensions = Dimensions::new(m, k, n); + + // Precompute all (x, y) indices that need to be processed + let tasks: Vec<(usize, usize)> = (0..dispatch.x) + .flat_map(|gwx| { + (0..dispatch.y).flat_map(move |gwy| { + (0..workgroup.x).flat_map(move |wx| { + (0..workgroup.y).filter_map(move |wy| { + let x = gwx * workgroup.x + wx; + let y = gwy * workgroup.y + wy; + if x < m && y < n { + Some((x as usize, y as usize)) + } else { + None + } + }) + }) + }) + }) + .collect(); + + // Process each (x, y) pair in parallel + tasks.par_iter().for_each(|&(x, y)| { + // Define global_id (adjust z if necessary) + let global_id = UVec3::new(x as u32, y as u32, 0); // Changed z to 0 for consistency + + // Lock the mutex to get mutable access to the result vector + let mut result_lock = result.lock().unwrap(); + + // Perform the matmul operation for element (x, y) + ::call( + &self.variant, + global_id, + &dimensions, + &a, + &b, + &mut result_lock, + ); + }); + + // Extract the result vector from the Mutex + let result = Mutex::into_inner(result).unwrap(); + + result + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_single_threaded_matmul_2x1x1() { + let m = 2; + let k = 1; + let n = 1; + + let a = vec![1.0, 2.0]; + let b = vec![3.0]; + + let expected = vec![3.0, 6.0]; + + let variant = crate::variants::Isomorphic; + let matrix_multiplier = futures::executor::block_on(SingleThreadedMatMul::new(variant)); + + let result = matrix_multiplier.multiply(&a, &b, m, k, n); + + assert_eq!(result, expected); + } + + #[test] + fn test_multithreaded_matmul_2x1x1() { + let m = 2; + let k = 1; + let n = 1; + + let a = vec![1.0, 2.0]; + let b = vec![3.0]; + + let expected = vec![3.0, 6.0]; + + let variant = crate::variants::Isomorphic; + let matrix_multiplier = futures::executor::block_on(MultiThreadedMatMul::new(variant)); + + let result = matrix_multiplier.multiply(&a, &b, m, k, n); + + assert_eq!(result, expected); + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/mod.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/mod.rs new file mode 100644 index 0000000..b33c465 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/mod.rs @@ -0,0 +1,2 @@ +pub mod cpu; +pub mod wgpu; diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/wgpu.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/wgpu.rs new file mode 100644 index 0000000..30f1bde --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/wgpu.rs @@ -0,0 +1,403 @@ +use crate::{Gpu, GridComputation, MatrixMultiply}; +use bytemuck; +use futures::channel::oneshot; +use futures::executor::block_on; +use settings::{BufferLayout, Dimensions, SHADER_ENTRY_POINT}; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use tracing::trace; +use wgpu::{self, util::DeviceExt}; + +/// Struct responsible for performing matrix multiplication on the GPU. +pub struct MatrixMultiplier { + device: wgpu::Device, + queue: wgpu::Queue, + pipeline: wgpu::ComputePipeline, + bind_group_layout: wgpu::BindGroupLayout, + variant: T, +} + +impl Display for MatrixMultiplier { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{} (wgpu)", self.variant) + } +} + +impl MatrixMultiply for MatrixMultiplier +where + T: Gpu + GridComputation + Display + Send, +{ + /// Initializes a new `MatrixMultiplier` with necessary GPU resources. + async fn new(variant: T) -> Self { + // Set up WGPU to talk to the system's GPUs and manage rendering or compute tasks. + let instance = create_instance().await; + + // Find a GPU. + let adapter = request_adapter(&instance).await; + + // Get access to the GPU and its command system for sending tasks. + let (device, queue) = request_device_and_queue(&adapter).await; + + // Load the compiled code that we will run on the GPU. + let shader = create_shader_module(&device, ::compiled_shader(&variant)); + + // Define how the GPU will connect data and resources to the GPU program. + let bind_group_layout = create_bind_group_layout(&device); + + // Specify how the GPU pipeline organizes its resources and GPU programs. + let pipeline_layout = create_pipeline_layout(&device, &bind_group_layout); + + // Build the actual GPU pipeline to run the GPU program and manage execution. + let pipeline = create_compute_pipeline(&device, &pipeline_layout, &shader); + + Self { + device, + queue, + pipeline, + bind_group_layout, + variant, + } + } + + /// Executes matrix multiplication for given input matrices. + /// + /// Uploads the input matrices to the GPU, dispatches the compute shader, + /// and retrieves the result. + fn multiply(&self, a: &[f32], b: &[f32], m: u32, k: u32, n: u32) -> Vec { + trace!(?a, ?b, "Starting matrix multiplication"); + + let result_size = (m * n * std::mem::size_of::() as u32) as u64; + + // Create a memory buffer on the GPU to store matrix `a`, initialized with data + // copied from the CPU. + let a_buffer = create_buffer_init( + &self.device, + "Matrix A Buffer", + a, + wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, + ); + + // Create a memory buffer on the GPU to store matrix `b`, initialized with data + // copied from the CPU. + let b_buffer = create_buffer_init( + &self.device, + "Matrix B Buffer", + b, + wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, + ); + + // Allocate GPU memory for storing the result. + let result_buffer = create_buffer( + &self.device, + "Result Buffer", + result_size, + wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC, + ); + + // Create a memory buffer on the GPU to store the dimensions of the matrices, + // initialized with data copied from the CPU. + // + // This is a `uniform` buffer instead of `storage` buffer because the data is + // the same for all workgroups, it is read-only, and it is small enough to fit + // in a single buffer (`uniform` buffers are limited to to 64 KB on most GPUs + // and often less on older GPUs). + let dimensions = Dimensions::new(m, k, n); + let dimensions_buffer = create_buffer_init( + &self.device, + "Dimensions Buffer", + &[dimensions], + wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + ); + + // Group all related buffers for use in the compute pipeline. + let bind_group = create_bind_group( + &self.device, + &self.bind_group_layout, + &a_buffer, + &b_buffer, + &result_buffer, + &dimensions_buffer, + ); + + // Create a buffer to retrieve computation results back from the GPU. + let staging_buffer = create_staging_buffer(&self.device, result_size); + + // Set up commands to perform the computation on the GPU. + let mut encoder = self + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("Matrix Multiply Encoder"), + }); + + { + // Define the compute pass, specifying which GPU program to run and what + // buffers should be involved. + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("Matrix Multiply Compute Pass"), + timestamp_writes: Default::default(), + }); + + compute_pass.set_pipeline(&self.pipeline); + compute_pass.set_bind_group(0, &bind_group, &[]); + + // Dispatch workgroups to perform the matrix multiplication. + let dispatch_count = ::dispatch_count(&self.variant, m, n); + tracing::trace!("Dispatch counts: {:?}", dispatch_count); + compute_pass.dispatch_workgroups(dispatch_count.x, dispatch_count.y, dispatch_count.z); + } + + // Copy the GPU's result into a buffer for CPU access. + encoder.copy_buffer_to_buffer(&result_buffer, 0, &staging_buffer, 0, result_size); + self.queue.submit(Some(encoder.finish())); + + // Make the staging buffer's data available to the CPU. + let slice = staging_buffer.slice(..); + let (sender, receiver) = oneshot::channel(); + + slice.map_async(wgpu::MapMode::Read, move |result| { + let _ = sender.send(result); + }); + + self.device.poll(wgpu::Maintain::Wait); + + // Wait for the mapping to complete and verify success. + block_on(receiver) + .expect("Failed to receive data") + .expect("Map async failed"); + + // Read and convert the result data into a typed vector instead of raw bytes. + let data = slice.get_mapped_range(); + let result: Vec = bytemuck::cast_slice(&data).to_vec(); + drop(data); + staging_buffer.unmap(); + + trace!(?result, "Matrix multiplication result"); + result + } +} + +/// Creates a new WGPU instance with specified backends. +async fn create_instance() -> wgpu::Instance { + let backends = wgpu::util::backend_bits_from_env() + .unwrap_or(wgpu::Backends::VULKAN | wgpu::Backends::METAL); + wgpu::Instance::new(wgpu::InstanceDescriptor { + backends, + dx12_shader_compiler: wgpu::util::dx12_shader_compiler_from_env().unwrap_or_default(), + ..Default::default() + }) +} + +/// Requests a suitable GPU adapter based on the instance. +async fn request_adapter(instance: &wgpu::Instance) -> wgpu::Adapter { + instance + .request_adapter(&wgpu::RequestAdapterOptions { + power_preference: wgpu::PowerPreference::HighPerformance, + force_fallback_adapter: false, + compatible_surface: None, + }) + .await + .expect("Failed to find appropriate adapter") +} + +/// Requests the GPU device and queue from the adapter. +async fn request_device_and_queue(adapter: &wgpu::Adapter) -> (wgpu::Device, wgpu::Queue) { + adapter + .request_device( + &wgpu::DeviceDescriptor { + label: Some("Matrix Multiply Device"), + required_features: wgpu::Features::empty(), + required_limits: wgpu::Limits::default(), + memory_hints: wgpu::MemoryHints::default(), + }, + None, + ) + .await + .expect("Failed to create device") +} + +/// Compiles and creates the shader module from SPIR-V bytes. +fn create_shader_module(device: &wgpu::Device, spirv_bytes: &[u8]) -> wgpu::ShaderModule { + device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("SPIR-V Shader Module"), + source: wgpu::ShaderSource::SpirV(std::borrow::Cow::Borrowed(&pad_and_cast_spirv( + spirv_bytes, + ))), + }) +} + +/// Defines the bind group layout for the compute pipeline. +fn create_bind_group_layout(device: &wgpu::Device) -> wgpu::BindGroupLayout { + device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("Matrix Multiply Bind Group Layout"), + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: BufferLayout::DIMENSIONS.binding, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: BufferLayout::A_MATRIX.binding, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { + read_only: BufferLayout::A_MATRIX.readonly, + }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: BufferLayout::B_MATRIX.binding, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { + read_only: BufferLayout::B_MATRIX.readonly, + }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: BufferLayout::RESULT.binding, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { + read_only: BufferLayout::RESULT.readonly, + }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }) +} + +/// Sets up the pipeline layout using the bind group layout. +fn create_pipeline_layout( + device: &wgpu::Device, + bind_group_layout: &wgpu::BindGroupLayout, +) -> wgpu::PipelineLayout { + device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("Matrix Multiply Pipeline Layout"), + bind_group_layouts: &[bind_group_layout], + push_constant_ranges: &[], + }) +} + +/// Creates the compute pipeline with the shader and pipeline layout. +fn create_compute_pipeline( + device: &wgpu::Device, + pipeline_layout: &wgpu::PipelineLayout, + shader: &wgpu::ShaderModule, +) -> wgpu::ComputePipeline { + device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("Matrix Multiply Pipeline"), + layout: Some(pipeline_layout), + module: shader, + entry_point: Some(SHADER_ENTRY_POINT), + compilation_options: Default::default(), + cache: Default::default(), + }) +} + +/// Binds the allocated buffers to the shader's bindings. +fn create_bind_group( + device: &wgpu::Device, + layout: &wgpu::BindGroupLayout, + a_buffer: &wgpu::Buffer, + b_buffer: &wgpu::Buffer, + result_buffer: &wgpu::Buffer, + dimensions_buffer: &wgpu::Buffer, +) -> wgpu::BindGroup { + device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("Matrix Multiply Bind Group"), + layout, + entries: &[ + wgpu::BindGroupEntry { + binding: BufferLayout::A_MATRIX.binding, + resource: a_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: BufferLayout::B_MATRIX.binding, + resource: b_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: BufferLayout::RESULT.binding, + resource: result_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: BufferLayout::DIMENSIONS.binding, + resource: dimensions_buffer.as_entire_binding(), + }, + ], + }) +} + +/// Creates a staging buffer for reading results back to the CPU. +fn create_staging_buffer(device: &wgpu::Device, size: u64) -> wgpu::Buffer { + device.create_buffer(&wgpu::BufferDescriptor { + label: Some("Staging Buffer"), + size, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }) +} + +/// Initializes a GPU buffer with provided data. +fn create_buffer_init( + device: &wgpu::Device, + label: &str, + data: &[T], + usage: wgpu::BufferUsages, +) -> wgpu::Buffer { + device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some(label), + contents: bytemuck::cast_slice(data), + usage, + }) +} + +/// Creates an empty GPU buffer with specified size and usage. +fn create_buffer( + device: &wgpu::Device, + label: &str, + size: u64, + usage: wgpu::BufferUsages, +) -> wgpu::Buffer { + device.create_buffer(&wgpu::BufferDescriptor { + label: Some(label), + size, + usage, + mapped_at_creation: false, + }) +} + +/// Safely casts a byte slice to a `u32` slice, padding with zeros if necessary. +/// +/// Panics if the byte slice cannot be aligned even after padding. +fn pad_and_cast_spirv(bytes: &[u8]) -> Vec { + use bytemuck::cast_slice; + let mut padded = bytes.to_vec(); + + // Pad with zeros to make the length a multiple of 4 + while padded.len() % 4 != 0 { + padded.push(0); + } + + // Ensure the starting pointer is aligned to 4 bytes + if padded.as_ptr() as usize % std::mem::align_of::() != 0 { + panic!("Shader binary is not 4-byte aligned even after padding."); + } + + // Safe to cast since we've ensured alignment and length + cast_slice::(&padded).to_vec() +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs new file mode 100644 index 0000000..19de411 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs @@ -0,0 +1,109 @@ +#![allow(opaque_hidden_inferred_bound)] + +use glam::UVec3; +use settings::Dimensions; +use std::fmt::Display; +use std::future::Future; + +mod backends; +pub mod variants; + +pub trait MatrixMultiply: Display { + fn new(variant: T) -> impl Future + Send; + fn multiply(&self, a: &[f32], b: &[f32], m: u32, k: u32, n: u32) -> Vec; +} + +/// Matrix multiplication logic that can be run on the CPU. +pub trait Cpu { + fn call( + &self, + global_id: UVec3, + dimensions: &Dimensions, + a: &[f32], + b: &[f32], + results: &mut [f32], + ); +} + +/// Matrix multiplication logic that can be run on the CPU. +pub trait Gpu { + fn compiled_shader(&self) -> &[u8]; + fn entry_point(&self) -> &'static str { + settings::SHADER_ENTRY_POINT + } +} + +/// How to dispatch work. +pub trait GridComputation { + fn workgroup(&self) -> UVec3; + fn dispatch_count(&self, m: u32, n: u32) -> UVec3; +} + +pub mod naive { + use super::*; + use crate::backends::wgpu::MatrixMultiplier; + + pub fn wgpu() -> MatrixMultiplier { + futures::executor::block_on(backends::wgpu::MatrixMultiplier::new(variants::Naive)) + } +} + +pub mod workgroup_256 { + use super::*; + use crate::backends::wgpu::MatrixMultiplier; + + pub fn wgpu() -> MatrixMultiplier { + futures::executor::block_on(backends::wgpu::MatrixMultiplier::new( + variants::Workgroup256, + )) + } +} + +pub mod workgroup_2d { + use super::*; + use crate::backends::wgpu::MatrixMultiplier; + + pub fn wgpu() -> MatrixMultiplier { + futures::executor::block_on(MatrixMultiplier::new(variants::Workgroup2d)) + } +} + +pub mod tiling_1d { + use super::*; + use crate::backends::wgpu::MatrixMultiplier; + + pub fn wgpu() -> MatrixMultiplier { + futures::executor::block_on(MatrixMultiplier::new(variants::Tiling1d)) + } +} + +pub mod tiling_2d_simd { + use super::*; + use crate::backends::wgpu::MatrixMultiplier; + + pub fn wgpu() -> MatrixMultiplier { + futures::executor::block_on(MatrixMultiplier::new(variants::Tiling2dSimd)) + } +} + +pub mod isomorphic { + use super::*; + use crate::backends::wgpu::MatrixMultiplier; + + pub fn wgpu() -> MatrixMultiplier { + futures::executor::block_on(MatrixMultiplier::new(variants::Isomorphic)) + } + + pub mod cpu { + use super::*; + use crate::backends::cpu::{MultiThreadedMatMul, SingleThreadedMatMul}; + + pub fn single_threaded() -> SingleThreadedMatMul { + futures::executor::block_on(SingleThreadedMatMul::new(variants::Isomorphic)) + } + + pub fn multi_threaded() -> MultiThreadedMatMul { + futures::executor::block_on(MultiThreadedMatMul::new(variants::Isomorphic)) + } + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs new file mode 100644 index 0000000..357d73b --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs @@ -0,0 +1,197 @@ +//! Different implementations of matrix multiplication and the metadata that defines how +//! they run. + +use crate::{Cpu, Gpu, GridComputation}; +use glam::UVec3; +use settings::Dimensions; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; + +/// Naive GPU implementation of matrix multiplication. +pub struct Naive; + +impl Display for Naive { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "naive") + } +} + +impl Gpu for Naive { + fn compiled_shader(&self) -> &[u8] { + compiled_naive::SHADER_BINARY + } +} + +impl GridComputation for Naive { + fn workgroup(&self) -> UVec3 { + UVec3::new(1, 1, 1) + } + + fn dispatch_count(&self, m: u32, n: u32) -> UVec3 { + UVec3::new(m * n, 1, 1) + } +} + +/// GPU implementation of matrix multiplication with a workgroup of 256. +pub struct Workgroup256; + +impl Display for Workgroup256 { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "workgroup_256") + } +} + +impl Gpu for Workgroup256 { + fn compiled_shader(&self) -> &[u8] { + compiled_workgroup_256::SHADER_BINARY + } +} + +impl GridComputation for Workgroup256 { + fn workgroup(&self) -> UVec3 { + UVec3::new(256, 1, 1) + } + + fn dispatch_count(&self, m: u32, n: u32) -> UVec3 { + let workgroup = self.workgroup(); + let threads_needed = m * n; + // This ceil division is needed because Rust handles truncation differently than + // Typescript/Javascript so we might get 0. + // We'll also cap the value to a maximum of 65,535 to comply with hardware limits. + let x = ((threads_needed as f32 / workgroup.x as f32).ceil() as u32).min(65_535); + UVec3::new(x, 1, 1) + } +} + +/// GPU implementation of matrix multiplication with a two-dimensional workgroup. +pub struct Workgroup2d; + +impl Display for Workgroup2d { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "workgroup_2d") + } +} + +impl Gpu for Workgroup2d { + fn compiled_shader(&self) -> &[u8] { + compiled_workgroup_2d::SHADER_BINARY + } +} + +impl GridComputation for Workgroup2d { + fn workgroup(&self) -> UVec3 { + UVec3::new(8, 8, 1) + } + + fn dispatch_count(&self, m: u32, n: u32) -> UVec3 { + let w = self.workgroup(); + let workgroup_size = w.x + w.y; + let x = ((m as f32) / (workgroup_size as f32)).ceil() as u32; + let y = ((n as f32) / (workgroup_size as f32)).ceil() as u32; + UVec3::new(x, y, 1) + } +} + +/// GPU implementation of matrix multiplication with one-dimensional tiling. +pub struct Tiling1d; + +impl Display for Tiling1d { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "tiling_1d") + } +} + +impl Gpu for Tiling1d { + fn compiled_shader(&self) -> &[u8] { + compiled_tiling_1d::SHADER_BINARY + } +} + +impl GridComputation for Tiling1d { + fn workgroup(&self) -> UVec3 { + UVec3::new(16, 16, 1) + } + + fn dispatch_count(&self, m: u32, n: u32) -> UVec3 { + let workgroup = self.workgroup(); + UVec3::new( + (m + workgroup.x - 1) / workgroup.x, + (n + workgroup.y - 1) / workgroup.y, + 1, + ) + } +} + +/// GPU implementation of matrix multiplication with two-dimensional tiling. +pub struct Tiling2dSimd; + +impl Display for Tiling2dSimd { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "tiling_2d_simd") + } +} + +impl Gpu for Tiling2dSimd { + fn compiled_shader(&self) -> &[u8] { + compiled_tiling_2d_simd::SHADER_BINARY + } +} + +impl GridComputation for Tiling2dSimd { + fn workgroup(&self) -> UVec3 { + UVec3::new(16, 16, 1) + } + + fn dispatch_count(&self, m: u32, n: u32) -> UVec3 { + let workgroup = self.workgroup(); + UVec3::new( + (m + workgroup.x - 1) / workgroup.x, + (n + workgroup.y - 1) / workgroup.y, + 1, + ) + } +} + +/// GPU implementation of matrix multiplication that runs on both the CPU and GPU. +pub struct Isomorphic; + +impl Display for Isomorphic { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "isomorphic") + } +} + +impl Gpu for Isomorphic { + fn compiled_shader(&self) -> &[u8] { + compiled_isomorphic::SHADER_BINARY + } +} + +impl Cpu for Isomorphic { + fn call( + &self, + global_id: UVec3, + dimensions: &Dimensions, + a: &[f32], + b: &[f32], + results: &mut [f32], + ) { + ::isomorphic::matmul(global_id, &dimensions, &a, &b, results); + } +} + +impl GridComputation for Isomorphic { + fn workgroup(&self) -> UVec3 { + UVec3::new(16, 16, 1) + } + + fn dispatch_count(&self, m: u32, n: u32) -> UVec3 { + let workgroup = self.workgroup(); + UVec3::new( + (m + workgroup.x - 1) / workgroup.x, + (n + workgroup.y - 1) / workgroup.y, + 1, + ) + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/Cargo.lock new file mode 100644 index 0000000..77f6ff7 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/Cargo.lock @@ -0,0 +1,149 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytemuck" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "glam" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc46dd3ec48fdd8e693a98d2b8bafae273a2d54c1de02a2a7e3d57d501f39677" +dependencies = [ + "libm", +] + +[[package]] +name = "gpu" +version = "0.1.0" +dependencies = [ + "shared", + "spirv-std", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "shared" +version = "0.1.0" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "spirv-std" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "bitflags", + "glam", + "num-traits", + "spirv-std-macros", + "spirv-std-types", +] + +[[package]] +name = "spirv-std-macros" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "proc-macro2", + "quote", + "spirv-std-types", + "syn 1.0.109", +] + +[[package]] +name = "spirv-std-types" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/Cargo.toml new file mode 100644 index 0000000..ea4567e --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "naive" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["dylib", "lib"] + +[dependencies] +settings = { path = "../../shared/settings"} +spirv-std.workspace = true diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/src/lib.rs new file mode 100644 index 0000000..9dad2d6 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/naive/src/lib.rs @@ -0,0 +1,30 @@ +#![no_std] + +use settings::Dimensions; +use spirv_std::glam::UVec3; +use spirv_std::spirv; + +#[spirv(compute(threads(1)))] +pub fn matmul( + #[spirv(global_invocation_id)] global_id: UVec3, + #[spirv(uniform, descriptor_set = 0, binding = 0)] dimensions: &Dimensions, + #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] a: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 2)] b: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 3)] result: &mut [f32], +) { + let index = global_id.x; + let row = index / dimensions.n; + let col = index % dimensions.n; + + if index < dimensions.m * dimensions.n { + let mut sum = 0.0; + + for i in 0..dimensions.k { + let a_val = a[(row * dimensions.k + i) as usize]; + let b_val = b[(i * dimensions.n + col) as usize]; + sum += a_val * b_val; + } + + result[(row * dimensions.n + col) as usize] = sum; + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/Cargo.lock new file mode 100644 index 0000000..77f6ff7 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/Cargo.lock @@ -0,0 +1,149 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytemuck" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "glam" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc46dd3ec48fdd8e693a98d2b8bafae273a2d54c1de02a2a7e3d57d501f39677" +dependencies = [ + "libm", +] + +[[package]] +name = "gpu" +version = "0.1.0" +dependencies = [ + "shared", + "spirv-std", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "shared" +version = "0.1.0" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "spirv-std" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "bitflags", + "glam", + "num-traits", + "spirv-std-macros", + "spirv-std-types", +] + +[[package]] +name = "spirv-std-macros" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "proc-macro2", + "quote", + "spirv-std-types", + "syn 1.0.109", +] + +[[package]] +name = "spirv-std-types" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/Cargo.toml new file mode 100644 index 0000000..19e8eff --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "tiling_1d" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["dylib", "lib"] + +[dependencies] +settings = { path = "../../shared/settings"} +spirv-std.workspace = true diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/src/lib.rs new file mode 100644 index 0000000..766a4fa --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d/src/lib.rs @@ -0,0 +1,40 @@ +#![no_std] + +use settings::Dimensions; +use settings::TILE_SIZE; +use spirv_std::glam::UVec3; +use spirv_std::spirv; + +#[spirv(compute(threads(16, 16)))] +pub fn matmul( + #[spirv(global_invocation_id)] global_id: UVec3, + #[spirv(uniform, descriptor_set = 0, binding = 0)] dimensions: &Dimensions, + #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] a: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 2)] b: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 3)] result: &mut [f32], +) { + let row = global_id.y as usize; + let col = (global_id.x * TILE_SIZE) as usize; + + if row >= dimensions.m as usize || col >= dimensions.n as usize { + return; + } + + let mut sum00: f32 = 0.0; + let mut sum01: f32 = 0.0; + let mut sum02: f32 = 0.0; + let mut sum03: f32 = 0.0; + + for i in 0..dimensions.k as usize { + let a_elem = a[row * dimensions.k as usize + i]; + sum00 += a_elem * b[i * dimensions.n as usize + col]; + sum01 += a_elem * b[i * dimensions.n as usize + col + 1]; + sum02 += a_elem * b[i * dimensions.n as usize + col + 2]; + sum03 += a_elem * b[i * dimensions.n as usize + col + 3]; + } + + result[row * dimensions.n as usize + col] = sum00; + result[row * dimensions.n as usize + col + 1] = sum01; + result[row * dimensions.n as usize + col + 2] = sum02; + result[row * dimensions.n as usize + col + 3] = sum03; +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/Cargo.lock new file mode 100644 index 0000000..77f6ff7 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/Cargo.lock @@ -0,0 +1,149 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytemuck" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "glam" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc46dd3ec48fdd8e693a98d2b8bafae273a2d54c1de02a2a7e3d57d501f39677" +dependencies = [ + "libm", +] + +[[package]] +name = "gpu" +version = "0.1.0" +dependencies = [ + "shared", + "spirv-std", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "shared" +version = "0.1.0" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "spirv-std" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "bitflags", + "glam", + "num-traits", + "spirv-std-macros", + "spirv-std-types", +] + +[[package]] +name = "spirv-std-macros" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "proc-macro2", + "quote", + "spirv-std-types", + "syn 1.0.109", +] + +[[package]] +name = "spirv-std-types" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/Cargo.toml new file mode 100644 index 0000000..6ea7ef4 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "tiling_2d_simd" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["dylib", "lib"] + +[dependencies] +settings = { path = "../../shared/settings" } +spirv-std.workspace = true diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/src/lib.rs new file mode 100644 index 0000000..4f8d4fc --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_2d_simd/src/lib.rs @@ -0,0 +1,57 @@ +#![no_std] + +use settings::Dimensions; +use settings::{TILE_M, TILE_N}; + +use spirv_std::glam::UVec3; +use spirv_std::spirv; + +#[spirv(compute(threads(16, 16)))] +pub fn matmul( + #[spirv(global_invocation_id)] global_id: UVec3, + #[spirv(uniform, descriptor_set = 0, binding = 0)] dimensions: &Dimensions, + #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] a: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 2)] b: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 3)] result: &mut [f32], +) { + let row = (global_id.y * TILE_M as u32) as usize; + let col = (global_id.x * TILE_N as u32) as usize; + + // Initialize sums array to zeros + // Note: This is uglier than it needs to be to work around + // https://github.com/Rust-GPU/rust-gpu/issues/46 + let mut sums: [[f32; TILE_N as usize]; TILE_M as usize] = Default::default(); + + // Compute the 2D tile + for k in 0..dimensions.k as usize { + for i in 0..TILE_M as usize { + let a_element = if row + i < dimensions.m as usize { + a[(row + i) * dimensions.k as usize + k] + } else { + 0.0 + }; + + for j in 0..TILE_N as usize { + let b_element = if col + j < dimensions.n as usize { + b[k * dimensions.n as usize + (col + j as usize)] + } else { + 0.0 + }; + + sums[i][j] += a_element * b_element; + } + } + } + + // Write results + for i in 0..TILE_M as usize { + for j in 0..TILE_N as usize { + let output_row = row + i as usize; + let output_col = col + j as usize; + + if output_row < dimensions.m as usize && output_col < dimensions.n as usize { + result[output_row * dimensions.n as usize + output_col] = sums[i][j]; + } + } + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/Cargo.lock new file mode 100644 index 0000000..77f6ff7 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/Cargo.lock @@ -0,0 +1,149 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytemuck" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "glam" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc46dd3ec48fdd8e693a98d2b8bafae273a2d54c1de02a2a7e3d57d501f39677" +dependencies = [ + "libm", +] + +[[package]] +name = "gpu" +version = "0.1.0" +dependencies = [ + "shared", + "spirv-std", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "shared" +version = "0.1.0" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "spirv-std" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "bitflags", + "glam", + "num-traits", + "spirv-std-macros", + "spirv-std-types", +] + +[[package]] +name = "spirv-std-macros" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "proc-macro2", + "quote", + "spirv-std-types", + "syn 1.0.109", +] + +[[package]] +name = "spirv-std-types" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/Cargo.toml new file mode 100644 index 0000000..9552c09 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "workgroup_256" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["dylib", "lib"] + +[dependencies] +settings = { path = "../../shared/settings" } +spirv-std.workspace = true diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/src/lib.rs new file mode 100644 index 0000000..7a6af3d --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_256/src/lib.rs @@ -0,0 +1,30 @@ +#![no_std] + +use settings::Dimensions; +use spirv_std::glam::UVec3; +use spirv_std::spirv; + +#[spirv(compute(threads(256)))] +pub fn matmul( + #[spirv(global_invocation_id)] global_id: UVec3, + #[spirv(uniform, descriptor_set = 0, binding = 0)] dimensions: &Dimensions, + #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] a: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 2)] b: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 3)] result: &mut [f32], +) { + let index = global_id.x; + let row = index / dimensions.n; + let col = index % dimensions.n; + + if index < dimensions.m * dimensions.n { + let mut sum = 0.0; + + for i in 0..dimensions.k { + let a_val = a[(row * dimensions.k + i) as usize]; + let b_val = b[(i * dimensions.n + col) as usize]; + sum += a_val * b_val; + } + + result[(row * dimensions.n + col) as usize] = sum; + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/Cargo.lock new file mode 100644 index 0000000..77f6ff7 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/Cargo.lock @@ -0,0 +1,149 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytemuck" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "glam" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc46dd3ec48fdd8e693a98d2b8bafae273a2d54c1de02a2a7e3d57d501f39677" +dependencies = [ + "libm", +] + +[[package]] +name = "gpu" +version = "0.1.0" +dependencies = [ + "shared", + "spirv-std", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "shared" +version = "0.1.0" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "spirv-std" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "bitflags", + "glam", + "num-traits", + "spirv-std-macros", + "spirv-std-types", +] + +[[package]] +name = "spirv-std-macros" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "proc-macro2", + "quote", + "spirv-std-types", + "syn 1.0.109", +] + +[[package]] +name = "spirv-std-types" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/Cargo.toml new file mode 100644 index 0000000..00ee2fb --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "workgroup_2d" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["dylib", "lib"] + +[dependencies] +settings = { path = "../../shared/settings" } +spirv-std.workspace = true diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/src/lib.rs new file mode 100644 index 0000000..54e5021 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/workgroup_2d/src/lib.rs @@ -0,0 +1,25 @@ +#![no_std] + +use settings::Dimensions; +use spirv_std::glam::UVec3; +use spirv_std::spirv; + +#[spirv(compute(threads(8, 8)))] +pub fn matmul( + #[spirv(global_invocation_id)] global_id: UVec3, + #[spirv(uniform, descriptor_set = 0, binding = 0)] dimensions: &Dimensions, + #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] a: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 2)] b: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 3)] result: &mut [f32], +) { + let row = global_id.x as usize; + let col = global_id.y as usize; + + if row < dimensions.m as usize && col < dimensions.n as usize { + let mut sum = 0.0; + for i in 0..dimensions.k as usize { + sum += a[row * dimensions.k as usize + i] * b[i * dimensions.n as usize + col]; + } + result[row * dimensions.n as usize + col] = sum; + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/isomorphic/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/isomorphic/Cargo.toml new file mode 100644 index 0000000..d15a18a --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/isomorphic/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "isomorphic" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["dylib", "lib"] + +# Dependencies when run on either the CPU or GPU +[dependencies] +settings = { path = "../../shared/settings" } + +# Dependencies when run on the CPU +[target.'cfg(not(target_arch = "spirv"))'.dependencies] +glam.workspace = true + +# Dependencies when run on the GPU +[target.'cfg(target_arch = "spirv")'.dependencies] +spirv-std.workspace = true diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/isomorphic/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/isomorphic/src/lib.rs new file mode 100644 index 0000000..88f23c6 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/isomorphic/src/lib.rs @@ -0,0 +1,82 @@ +//! This shader can run on both the CPU and the GPU. +//! +//! The GPU-specific attributes are only used when compiling for the GPU, otherwise they +//! are stripped away and the shader entrypoint becomes a normal function that can be +//! called from the CPU. + +#![no_std] + +use settings::Dimensions; +use settings::{TILE_M, TILE_N}; + +#[cfg(target_arch = "spirv")] +use spirv_std::spirv; + +#[cfg(target_arch = "spirv")] +use spirv_std::glam; + +#[cfg(not(target_arch = "spirv"))] +use glam; + +use glam::UVec3; + +#[cfg_attr(target_arch = "spirv", spirv(compute(threads(16, 16))))] +pub fn matmul( + #[cfg_attr(target_arch = "spirv", spirv(global_invocation_id))] global_id: UVec3, + #[cfg_attr(target_arch = "spirv", spirv(uniform, descriptor_set = 0, binding = 0))] + dimensions: &Dimensions, + #[cfg_attr( + target_arch = "spirv", + spirv(storage_buffer, descriptor_set = 0, binding = 1) + )] + a: &[f32], + #[cfg_attr( + target_arch = "spirv", + spirv(storage_buffer, descriptor_set = 0, binding = 2) + )] + b: &[f32], + #[cfg_attr( + target_arch = "spirv", + spirv(storage_buffer, descriptor_set = 0, binding = 3) + )] + result: &mut [f32], +) { + let row = (global_id.y * TILE_M as u32) as usize; + let col = (global_id.x * TILE_N as u32) as usize; + + // Initialize sums array to zeros + let mut sums: [[f32; TILE_N as usize]; TILE_M as usize] = Default::default(); + + // Compute the 2D tile + for k in 0..dimensions.k as usize { + for i in 0..TILE_M as usize { + let a_element = if row + i < dimensions.m as usize { + a[(row + i) * dimensions.k as usize + k] + } else { + 0.0 + }; + + for j in 0..TILE_N as usize { + let b_element = if col + j < dimensions.n as usize { + b[k * dimensions.n as usize + (col + j as usize)] + } else { + 0.0 + }; + + sums[i][j] += a_element * b_element; + } + } + } + + // Write results + for i in 0..TILE_M as usize { + for j in 0..TILE_N as usize { + let output_row = row + i as usize; + let output_col = col + j as usize; + + if output_row < dimensions.m as usize && output_col < dimensions.n as usize { + result[output_row * dimensions.n as usize + output_col] = sums[i][j]; + } + } + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/Cargo.lock new file mode 100644 index 0000000..5b4ed9a --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/Cargo.lock @@ -0,0 +1,65 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "bytemuck" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "shared" +version = "0.1.0" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/Cargo.toml new file mode 100644 index 0000000..55659f8 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "settings" +version = "0.1.0" +edition = "2021" + +[dependencies] +bytemuck = { version = "1.9", features = ["derive"] } + +# Conditionally include `glam` only when not on the `spirv` target. +[target.'cfg(not(target_arch = "spirv"))'.dependencies] +glam = { workspace = true } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/src/lib.rs new file mode 100644 index 0000000..0b05522 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/shared/settings/src/lib.rs @@ -0,0 +1,88 @@ +#![cfg_attr(target_arch = "spirv", no_std)] + +#[repr(C)] +#[derive(Copy, Clone, Debug, bytemuck::Pod, bytemuck::Zeroable)] +pub struct Dimensions { + pub m: u32, + pub k: u32, + pub n: u32, +} + +#[cfg(not(target_arch = "spirv"))] +use glam::UVec3; + +#[cfg(not(target_arch = "spirv"))] +impl From for Dimensions { + fn from(uvec: UVec3) -> Self { + Self { + m: uvec.x, + k: uvec.y, + n: uvec.z, + } + } +} + +impl Into<(u32, u32, u32)> for Dimensions { + fn into(self) -> (u32, u32, u32) { + (self.m, self.k, self.n) + } +} + +impl Dimensions { + pub fn new(m: u32, k: u32, n: u32) -> Self { + Self { m, k, n } + } +} + +// Tiling configurations +pub const TILE_SIZE: u32 = 4; +pub const TILE_M: u32 = 4; +pub const TILE_N: u32 = 4; + +// Buffer layout information +#[derive(Copy, Clone, Debug)] +pub struct BufferLayout { + pub binding: u32, + pub readonly: bool, +} + +impl BufferLayout { + pub const DIMENSIONS: Self = Self { + binding: 0, + readonly: true, + }; + pub const A_MATRIX: Self = Self { + binding: 1, + readonly: true, + }; + pub const B_MATRIX: Self = Self { + binding: 2, + readonly: true, + }; + pub const RESULT: Self = Self { + binding: 3, + readonly: false, + }; +} + +pub const NUM_BUFFERS: usize = 3; + +pub const SHADER_ENTRY_POINT: &str = "matmul"; + +// Helper functions for index calculations +#[inline] +pub fn get_matrix_index(row: u32, col: u32, stride: u32) -> usize { + (row * stride + col) as usize +} + +pub fn validate_dimensions(a_dims: (u32, u32), b_dims: (u32, u32)) -> bool { + a_dims.1 == b_dims.0 +} + +pub fn get_output_dimensions(a_dims: (u32, u32), b_dims: (u32, u32)) -> Option<(u32, u32)> { + if validate_dimensions(a_dims, b_dims) { + Some((a_dims.0, b_dims.1)) + } else { + None + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/rust-toolchain.toml b/blog/2024-11-21-optimizing-matrix-mul/code/rust-toolchain.toml new file mode 100644 index 0000000..b901355 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/rust-toolchain.toml @@ -0,0 +1,3 @@ + [toolchain] + channel = "nightly-2024-04-24" + components = ["rust-src", "rustc-dev", "llvm-tools"] diff --git a/blog/2024-11-21-optimizing-matrix-mul/index.md b/blog/2024-11-21-optimizing-matrix-mul/index.md new file mode 100644 index 0000000..d094696 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/index.md @@ -0,0 +1,499 @@ +--- +title: "Optimizing a Rust GPU matmul kernel for 1TFLOP+ performance" +authors: ["LegNeato"] +slug: optimizing-matmul +tags: ["demo", "code", "performance"] +draft: true +--- + +I read the excellent post [Optimizing a WebGPU Matmul Kernel for 1TFLOP+ +Performance](https://www.nuss-and-bolts.com/p/optimizing-a-webgpu-matmul-kernel) by Zach +Nussbaum and thought it might be fun to reimplement it with [Rust GPU](https://Rust-gpu.github.io/). + +We'll follow Zach's original post closely, comparing and contrasting using Rust vs the +WGSL and Typescript from his post. + +At the end, I'll show some additional fun things that are enabled by using Rust on the +GPU. + + + +:::tip + +The complete runnable code can be found on GitHub. + +::: + +## What is Rust GPU? + +[Rust GPU](https://Rust-gpu.github.io/) is a project that allows you to write code for +GPUs using the Rust programming language. GPUs are typically programmed using +specialized languages like [WGSL](https://www.w3.org/TR/WGSL/), +[GLSL](https://developer.mozilla.org/en-US/docs/Games/Techniques/3D_on_the_web/GLSL_Shaders), +[MSL](https://developer.apple.com/documentation/metal/performing_calculations_on_a_gpu), +or +[HLSL](https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl). +Rust GPU changes this by letting you use Rust to write GPU programs (often called +"shaders" or "kernels"). + +These Rust GPU programs are then compiled into [SPIR-V](https://www.khronos.org/spir/), +a low-level format that [most GPUs understand](https://vulkan.gpuinfo.org/). Since +SPIR-V is the format [Vulkan](https://www.vulkan.org/) uses, Rust GPU makes it possible +to integrate Rust-based GPU programs into any Vulkan-compatible workflow. + +For more details, check out the [Rust GPU website](https://Rust-gpu.github.io/) or the [GitHub repository](https://github.com/Rust-gpu/Rust-gpu). + +## How does Rust GPU work? + +Rust GPU focuses purely on compiling your Rust code into SPIR-V. This compiled code is +what the GPU executes. However, Rust GPU doesn't dictate how you handle CPU-to-GPU +communication or data transfer. You're free to choose a host CPU library written in +whatever language that fits your project. Some popular options in Rust include: + +- **[ash](https://github.com/ash-rs/ash)**: Low-level Vulkan bindings for Rust, providing maximum control over Vulkan operations. +- **[vulkano](https://github.com/vulkano-rs/vulkano)**: A higher-level Vulkan library that simplifies common tasks. +- **[wgpu](https://github.com/gfx-rs/wgpu)**: A cross-platform library that abstracts GPU operations across Vulkan, DirectX, Metal, and WebGPU. + +But again, you don't _have_ to use Rust for the CPU-side when using Rust on the GPU—any +language will do. + +## What will we use? + +In Zach's post, he writes his GPU programs in [WGSL](https://www.w3.org/TR/WGSL/). These +programs and their data are sent to and from the GPU via Typescript which talks to the +[WebGPU](https://en.wikipedia.org/wiki/WebGPU) CPU code built into the browser. + +We'll take a different approach: writing GPU programs in Rust via Rust GPU and managing +everything—including the CPU-side code—in Rust. This means both the GPU programs and the +code controlling them will be written in the same language. If you are familiar with web +programming, what we are doing is conceptually similar to Javascript running on both the +server and the client. + +Using Rust for both CPU and GPU has advantages, like consistent tooling and shared code. +But it also means we need to be clear about which code runs where. I've tried to make +sure this distinction is easy to follow. + +To handle communication between our code on the CPU and GPU, we'll use +[`wgpu`](https://github.com/gfx-rs/wgpu). `wgpu` is a high-level Rust library that +implements the WebGPU API. On the web, it works directly with the browser's WebGPU +implementation. On native platforms, it translates API calls to the platform's GPU API +(Vulkan, DirectX, or Metal). This lets us run the same code on a wide range of +platforms, including Windows, Linux, macOS, iOS[^1], Android, and the web[^2]. + +By using Rust GPU and `wgpu`, we have a clean, portable setup with everything written in +Rust. + +## Writing the kernel + +### Kernel 1: Naive kernel + +The simplest way to compute a dot product between matrix A and B and write +to matrix C is for each row in A (of shape M), iterate over the columns of A (of shape +K) and multiply by the corresponding value of B. + +Here, we have our first difference from Zach's post. In WGSL, you must define inputs at +the top-level scope: + +import { WebGpuInputs } from './snippets/naive.tsx'; + + + +And then write your kernel: + +import { WebGpuKernel } from './snippets/naive.tsx'; + + + +With Rust GPU, we specify the inputs as arguments to the kernel and configure them with +[procedural macros](https://doc.Rust-lang.org/reference/procedural-macros.html): + +import { RustNaiveInputs } from './snippets/naive.tsx'; + + + +This code looks like normal Rust code but _runs entirely on the GPU._ + +There are a couple of things to note about the Rust implementation: + +1. The kernel uses the regular Rust + [`#![no_std]`](https://www.reddit.com/r/Rust/comments/9eyc21/noob_what_exactly_is_no_std_and_why_is_it_so/) + attribute, which is required because GPUs do not have access to Rust's standard + library (`std`). Instead, you rely on `core` and `spirv_std` to provide `std`-like + functionality. +2. Libraries are imported via `use`. The module system works exactly the same as regular Rust. +3. We're importing a vendored copy of [`glam`](https://github.com/bitshifter/glam-rs). + This is the exact `glam` crate from [crates.io](https://crates.io/crates/glam). +4. The inner loop (`for i in 0..dimensions.k`) uses Rust's `for` syntax with a range. + This is a higher-level abstraction compared to manually iterating with an index in + other shader languages like WGSL, GLSL, or HLSL. +5. Read-only inputs are immutable references (`&Dimensions` / `&[f32]`) and writeable outputs are + mutable references (`&mut [f32]`). This feels very familiar to anyone used to writing + Rust. + +#### What's with all the `usize`? + +Rust defines `usize` as [the native pointer width of the hardware the code is running +on](https://doc.rust-lang.org/std/primitive.usize.html). This is important because Rust +uses `usize` for indexing slices to ensure that access is properly pointer-aligned. + +On most GPU hardware, `usize` is effectively equivalent to `u32`. But the Rust compiler +doesn't assume that. It can't, because doing so could introduce problems—like if you ran +this code on hardware where `usize` is actually `u64`. Rust won't let you implicitly +treat a `u32` as a `usize`. You have to explicitly cast it, essentially telling the +compiler "I know this is safe for my target hardware." + +This explicitness might seem tedious but it is one of the ways Rust prevents subtle +bugs. It forces you to think about whether your assumptions about hardware alignment and +pointer sizes are correct, making your code more portable and reliable. + +:::info + +Matrix multiplication is a pathological case with lots of indexing and row and column +calculations. Most Rust GPU code does not have nearly as many `usize` casts as these +examples. + +::: + +### Kernel 2: Moarrr threads! + +With the first kernel, we're only able to compute small square matrices due to limits on +the number of workgroups you can dispatch at once. + +Since we're launching one workgroup per entry, a 256x256 matrix is larger than our +limit! + +Remember this line? + +import { RustNaiveWorkgroup } from './snippets/naive.tsx'; + + + +We can reduce the number of dispatched workgroups by increasing the number of threads per workgroup! + +If we update our GPU code + +import { RustWorkgroup256Workgroup } from './snippets/workgroup_256.tsx'; + + + +we can reduce the number of total dispatched workgroups per dimension: + +import { RustWorkgroup256WorkgroupCount } from './snippets/workgroup_256.tsx'; + + + +The `dispatch_count()` function runs on the CPU and is used by the CPU-to-GPU API (in +our case `wgpu`) to configure and dispatch to the GPU: + +import { RustWorkgroup256WgpuDispatch } from './snippets/workgroup_256.tsx'; + + + +:::warning + +This code appears more complicated than it needs to be. I abstracted the CPU-side code +that talks to the GPU using generics and traits so I could easily slot in different +kernels and their settings while writing the blog post. + +You could just hardcode a value for simplicity. + +::: + +### Kernel 3: Calculating with 2D workgroups + +However doing all the computation in "1 dimension" limits the matrix size we can +calculate. + +Although we don't change much about our code, if we distribute our work in 2 dimensions +we're able to bypass these limits and launch more workgroups that are larger. This +allows us to calculate a 4096x4096 matmul. + +We update our `compute(threads(256)))` to `compute(threads((8, 8)))`, and make the small +change to `row` and `col` from Zach's post to increase speed: + +import { RustWorkgroup2d } from './snippets/workgroup_2d.tsx'; + + + +And we need to tweak the workgroup dispatch count calculation on the CPU as we are in 2D +now and using the `y` value: + +import { RustWorkgroup2dWorkgroupCount } from './snippets/workgroup_2d.tsx'; + + + +### Kernel 4: Kernel tiling + +Another thing to consider is how much work each thread does. + +Up to now, each thread only computes one entry. But there is some overhead to launching +each workgroup versus computing more than 1 element per thread! + +If calculating more elements per thread is faster than the overhead to launch each +workgroup, we should see a big speedup. + +To do so, we calculate 4 results per thread (e.g. a 1x4 Tile). + +import { RustTiling1d } from './snippets/tiling_1d.tsx'; + + + +The kernel looks roughly the same as before except we've unrolled the computation and +are calculating `TILE_SIZE` results per thread. + +We can take this a step further and calculate 2D results per thread! Instead of +calculating 4 elements per single row, we can calculate 4 elements for 4 rows (e.g. a 2D +tile). + +import { RustTiling2dSimd } from './snippets/tiling_2d_simd.tsx'; + + + +Each thread now calculates a 4x4 grid of the output matrix and we see a slight +improvement over the last kernel. + +## Reflections on porting to Rust GPU + +Porting to Rust GPU went quickly, as the kernels Zach used were fairly simple. Most of +the time was spent with concerns that were not specifically about writing GPU code. For +example, deciding how much to abstract vs how much to make the code easy to follow, if +everything should be available at runtime or if each kernel should be a compilation +target, etc. The code is still not _great_ as it is still blog post code! + +My background is not in GPU programming, but I do have Rust experience. I joined the +Rust GPU project because I tried to use standard GPU languages and knew there must be a +better way. Writing these GPU kernels felt like writing any other Rust code (other than +debugging, more later) which is a huge win to me. Not only the language itself, but the +entire development experience. + +## Rust-specific party tricks + +Rust lets us write code for both the CPU and GPU in ways that are often impossible—or at +least less elegant—with other languages. I'm going to highlight some benefits of Rust I +experienced while working on this blog post. + +### Shared code across GPU and CPU + +In GPU programming, we often need to pass data between the CPU and GPU. For example, our +GPU kernel expects a `Dimensions` struct as input: + +import { RustPartySettings } from './snippets/party.tsx'; + + + +We create an instance of `Dimensions` on the CPU and send it to the GPU via `wgpu`, +where the Rust kernel loads and uses it. + +import { RustWgpuDimensions } from './snippets/party.tsx'; + + + +This means the code on the CPU and GPU need to agree on the definition of +`Dimensions`! + +In many GPU programming ecosystems, this would involve manually keeping the +definitions in sync across different languages—one for the CPU, one for the GPU. This is +tedious and error-prone. + +With Rust, it's straightforward: we move the `Dimensions` struct into its own crate, and +both the CPU and GPU code depend on that crate. Now, the type definition lives in one +place and both platforms use it directly. + +This approach eliminates duplication and guarantees consistency. If we need to make +changes, those changes propagate to both the CPU and GPU automatically, reducing the +risk of mismatches and making refactoring far safer. + +This kind of consistency across CPU and GPU is something you don't often see in other +GPU programming ecosystems. Bespoke codegen solutions are often created to accomplish +the same thing Rust has built in. + +### Running and debugging shaders on the CPU + +GPU code can be notoriously hard to debug. While developing this kernel, I ran into a +bug I couldn't figure out. GPU debugging tools are limited and `printf`-style debugging +often isn't available. But what if we could run the GPU kernel _on the CPU_, where we +have access to tools like standard debuggers and good ol' `printf`/`println`? + +With Rust GPU, this was straightforward. By using `cfg()` directives I made the +GPU-specific annotations (`#[spirv(...)]`) disappear when compiling for the CPU. The +result? The kernel became a regular Rust function. On the GPU, it behaves like a shader. +On the CPU, it's just a function you can call directly. + +Here's what it looks like in practice using the 2D tiling kernel from before: + +import { RustIsomorphic } from './snippets/party.tsx'; + + + +The logic in the kernel hasn't changed, it is exactly the same as the GPU-only code from +before. + +You'll also notice that on the GPU it uses `glam` from `spirv_std` but on the CPU it +uses `glam` from crates.io: + +import { RustIsomorphicGlam } from './snippets/party.tsx'; + + + +This is enabled by the standard Rust ecosystem tooling +around dependencies: + +import { RustIsomorphicDeps } from './snippets/party.tsx'; + + + +Testing the kernel in isolation is useful, but it does not reflect how the GPU executes +it with multiple invocations across workgroups and dispatches. To test the kernel +end-to-end, I needed a test harness that simulated this behavior on the CPU. + +Building the harness was straightforward. By enforcing the same invariants as the GPU I +could validate the kernel under the same conditions the GPU would run it: + +import { RustCpuBackendHarness } from './snippets/party.tsx'; + + + +:::warning + +Again, this code appears more complicated than it needs to be. I abstracted the CPU +testing harness code using generics and traits so I could easily slot in different +kernels and their settings while writing the blog post. + +You could just call the kernel function directly in nested loops. + +::: + +### Tests + +By moving the kernel code to the CPU, I could write tests that ran quickly and entirely +on the CPU. This eliminated the need to serialize tests and offload them to the GPU +(which is a shared and limited resource). + +This approach has several benefits. First, it significantly reduced the feedback loop +during development, allowing me to catch issues faster. Second, it ensured the tests +could be run in any environment where the Rust toolchain is available—no GPU required. +This is especiallly relevant in CI environments such as Github Actions that do not have +a GPU by default. + +For example, my test for a small matrix multiplication kernel running in the harness on +the CPU looked like this: + +import { RustCpuBackendTest } from './snippets/party.tsx'; + + + +### Benchmarks + +I wanted to run benchmarks similar to those in the original blog post. Because I was +using Rust, this was simple. I used +[criterion](https://github.com/bheisler/criterion.rs) with `cargo bench`, just like any +other Rust project. + +This required no new tools or workflows. The tools I already knew worked seamlessly. +More importantly, this approach benefits anyone working on the project. Any Rust +engineer can run these benchmarks with no additional setup--`cargo bench` is a standard +part of the Rust ecosystem. + +### Lint + +Linting GPU code in Rust works the same way as for CPU code. Running `cargo clippy` +highlighted issues and enforced consistent code quality. Any custom lint configurations +are also applied to Rust GPU kernels. Lints ensure that GPU code is held to the same +high standards as the rest of the project. + +### Documentation + +Writing doc comments and running `cargo doc` generates documentation for GPU kernels, +exactly how it happens in regular Rust. While some ecosystems offer similar tools, +Rust's integration is built-in and works seamlessly for both CPU and GPU code. There's +no special setup required. + +## But wait, there's more! + +The kernel in Zach's blog post is intentionally simple. That makes it easy to follow, +but it also means the Rust code looks very similar to WGSL. While this is fine for an +introductory example, it doesn't demonstrate Rust's real strengths for GPU programming. +These strengths—reusing existing libraries, traits, enums, generics, and more—become much more important as projects +grow in complexity. + +### Leverage the existing Rust ecosystem + +Rust's `no_std` ecosystem offers a wide array of libraries that can be used in +environments without the standard library. Traditionally this has meant embedded +devices, but a lot of the same assumptions apply to GPUs! As a consequence, you can +reuse [existing `no_std` & no `alloc` libraries from +crates.io](https://crates.io/categories/no-std::no-alloc) in your GPU code _without the +authors explicitly adding GPU support_. This is uniquely enabled by Rust GPU's +implementation choices and Rust's +[registers](https://without.boats/blog/the-registers-of-rust/). Sharing and reusing code +from the greater Rust ecosystem is a superpower when writing GPU programs that will +massively compound over time. + +### Traits + +Traits are one of Rust's most powerful tools and they work with Rust GPU. Traits let you +define zero-cost reusable type-safe behavior. For example, if you have multiple kernels +for different matrix multiplication strategies, you can define a `MatrixMultiplication` +trait and implement it for each variation. This eliminates duplication and makes your +code easier to extend. + +### Enums and zero-sized types + +GPU code is notoriously hard to read, but Rust's enums and zero-sized types (ZSTs) can +make it much more understandable. Enums let you explicitly encode states or modes. For +example, you can define tiling strategies or precision levels using enums instead of +relying on constants or magic numbers. + +ZSTs take this further by encoding configurations directly into the type system. For +example, you could represent different kernel configurations as ZSTs. This approach +ensures invalid configurations are impossible, improving both readability and safety. + +### Generics + +Generics are another feature missing from this kernel but are a powerful tool in Rust +GPU. They allow you to write flexible kernels that work across different data types or +memory layouts. For instance, you can write a single function that supports both `f32` +and `f64` without duplicating code, all while maintaining type safety and performance. + +### Error handling with `Result` + +Rust GPU also supports error handling using `Result`. Encoding errors in the type system +makes it clear where things can go wrong and forces developers to handle those cases. +This is particularly useful for validating kernel inputs or handling the many edge cases +in GPU logic. + +### Iterators + +Rust's iterators don't appear in this kernel, but they're another way Rust GPU +simplifies complex logic. Instead of manual loops with indices, you can use iterators to +express your logic more clearly. + +Iterators reduce the chance of off-by-one errors and make the intent of the code much +clearer. + +Rust GPU's support for iterators is not complete but we are looking to improve it in the +future. + +### Conditional compilation + +This kernel doesn't use conditional compilation, but it's a key feature of Rust that +works with Rust GPU. With `#[cfg(...)]`, you can adapt kernels to different hardware or +configurations without duplicating code. GPU languages like WGSL or GLSL offer +preprocessor directives, but these tools lack standardization across ecosystems. Rust +GPU leverages the existing Cargo ecosystem, so conditional compilation follows the same +standards all Rust developers already know. This makes adapting kernels for different +targets easier and more maintainable. + +## Come join us! + +Rust GPU only recently became a [community managed +project](/blog/transition-announcement). We're eager to add more users and contributors! +We will be working on revamping the onboarding and documentation soon. To follow along +or get involved, check out the [`rust-gpu` repo on +GitHub](https://github.com/rust-gpu/rust-gpu). +
+ +[^1]: Via [MoltenVK](https://github.com/KhronosGroup/MoltenVK) +[^2]: + Technically `wgpu` translates SPIR-V to GLSL or WGSL via + [naga](https://github.com/gfx-rs/wgpu/tree/trunk/naga) diff --git a/blog/2024-11-21-optimizing-matrix-mul/snippets/naive.tsx b/blog/2024-11-21-optimizing-matrix-mul/snippets/naive.tsx new file mode 100644 index 0000000..90ba96f --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/snippets/naive.tsx @@ -0,0 +1,59 @@ +import React from "react"; +import CodeBlock from "@theme/CodeBlock"; +import Snippet from "@site/src/components/Snippet"; +import RustKernelSource from "!!raw-loader!../code/crates/gpu/naive/src/lib.rs"; + +export const WebGpuInputs: React.FC = () => ( + + {`struct Dimensions { + M: u32, + K: u32, + N: u32, +} + +@group(0) @binding(0) var dimensions: Dimensions; +@group(0) @binding(1) var a: array; +@group(0) @binding(2) var b: array; +@group(0) @binding(3) var result: array; +`} + +); + +export const WebGpuKernel: React.FC = () => ( + + {" "} + {`@compute @workgroup_size(1) +fn main(@builtin(global_invocation_id) global_id: vec3) { + let index = global_id.x; + let row = index / dimensions.N; + let col = index % dimensions.N; + + if (index < dimensions.M * dimensions.N) { + var sum = 0.0; + for (var i: u32 = 0u; i < dimensions.K; i = i + 1u) { + sum = sum + a[row * dimensions.K + i] * b[i * dimensions.N + col]; + } + result[row * dimensions.N + col] = sum; + } +} +`} + +); + +export const RustNaiveInputs: React.FC = () => ( + + {RustKernelSource} + +); + +export const RustNaiveWorkgroup: React.FC = () => ( + + {RustKernelSource} + +); diff --git a/blog/2024-11-21-optimizing-matrix-mul/snippets/party.tsx b/blog/2024-11-21-optimizing-matrix-mul/snippets/party.tsx new file mode 100644 index 0000000..0f3b99f --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/snippets/party.tsx @@ -0,0 +1,54 @@ +import React from "react"; +import Snippet from "@site/src/components/Snippet"; +import RustKernelSource from "!!raw-loader!../code/crates/gpu/tiling_1d/src/lib.rs"; +import RustIsomorphicSource from "!!raw-loader!../code/crates/shared/isomorphic/src/lib.rs"; +import RustIsomorphicCargoToml from "!!raw-loader!../code/crates/shared/isomorphic/Cargo.toml"; +import RustWgpuBackend from "!!raw-loader!../code/crates/cpu/matmul/src/backends/wgpu.rs"; +import RustCpuBackendSource from "!!raw-loader!../code/crates/cpu/matmul/src/backends/cpu.rs"; + +export const RustPartySettings: React.FC = () => ( + + {RustKernelSource} + +); + +export const RustIsomorphic: React.FC = () => ( + + {RustIsomorphicSource} + +); + +export const RustIsomorphicGlam: React.FC = () => ( + + {RustIsomorphicSource} + +); + +export const RustIsomorphicDeps: React.FC = () => ( + + {RustIsomorphicCargoToml} + +); + +export const RustWgpuDimensions: React.FC = () => ( + + {RustWgpuBackend} + +); + +export const RustCpuBackendHarness: React.FC = () => ( + + {RustCpuBackendSource} + +); + +export const RustCpuBackendTest: React.FC = () => ( + + {RustCpuBackendSource} + +); diff --git a/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_1d.tsx b/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_1d.tsx new file mode 100644 index 0000000..eb76f13 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_1d.tsx @@ -0,0 +1,14 @@ +import React from "react"; +import CodeBlock from "@theme/CodeBlock"; +import Snippet from "@site/src/components/Snippet"; +import RustKernelSource from "!!raw-loader!../code/crates/gpu/tiling_1d/src/lib.rs"; + +export const RustTiling1d: React.FC = () => ( + + {RustKernelSource} + +); diff --git a/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_2d_simd.tsx b/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_2d_simd.tsx new file mode 100644 index 0000000..9732d6b --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_2d_simd.tsx @@ -0,0 +1,9 @@ +import React from "react"; +import Snippet from "@site/src/components/Snippet"; +import RustKernelSource from "!!raw-loader!../code/crates/gpu/tiling_2d_simd/src/lib.rs"; + +export const RustTiling2dSimd: React.FC = () => ( + + {RustKernelSource} + + ); diff --git a/blog/2024-11-21-optimizing-matrix-mul/snippets/workgroup_256.tsx b/blog/2024-11-21-optimizing-matrix-mul/snippets/workgroup_256.tsx new file mode 100644 index 0000000..f47f5cc --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/snippets/workgroup_256.tsx @@ -0,0 +1,34 @@ +import React from "react"; +import Snippet from "@site/src/components/Snippet"; +import RustKernelSource from "!!raw-loader!../code/crates/gpu/workgroup_256/src/lib.rs"; +import VariantsSource from "!!raw-loader!../code/crates/cpu/matmul/src/variants.rs"; +import WgpuBackendSource from "!!raw-loader!../code/crates/cpu/matmul/src/backends/wgpu.rs"; + +export const RustWorkgroup256Workgroup: React.FC = () => ( + + {RustKernelSource} + +); + +export const RustWorkgroup256WorkgroupCount: React.FC = () => ( + + {VariantsSource} + +); + +export const RustWorkgroup256WgpuDispatch: React.FC = () => ( + + {WgpuBackendSource} + +); diff --git a/blog/2024-11-21-optimizing-matrix-mul/snippets/workgroup_2d.tsx b/blog/2024-11-21-optimizing-matrix-mul/snippets/workgroup_2d.tsx new file mode 100644 index 0000000..39ebf10 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/snippets/workgroup_2d.tsx @@ -0,0 +1,58 @@ +import React from "react"; +import CodeBlock from "@theme/CodeBlock"; +import Snippet from "@site/src/components/Snippet"; +import RustKernelSource from "!!raw-loader!../code/crates/gpu/workgroup_2d/src/lib.rs"; +import VariantsSource from "!!raw-loader!../code/crates/cpu/matmul/src/variants.rs"; +import WgpuBackendSource from "!!raw-loader!../code/crates/cpu/matmul/src/backends/wgpu.rs"; + +export const RustWorkgroup2d: React.FC = () => ( + + {RustKernelSource} + +); + +/* +export const RustWorkgroup2d: React.FC = () => ( + + {RustKernelSource} + +); +*/ + +export const RustWorkgroup2dWorkgroup: React.FC = () => ( + + {RustKernelSource} + +); + +export const RustWorkgroup2dWorkgroupCount: React.FC = () => ( + + {VariantsSource} + +); + +export const RustWorkgroup2dWgpuDispatch: React.FC = () => ( + + {WgpuBackendSource} + +); diff --git a/blog/tags.yml b/blog/tags.yml index c1dd211..966e61c 100644 --- a/blog/tags.yml +++ b/blog/tags.yml @@ -17,3 +17,13 @@ demo: label: demo permalink: /demos description: Demos + +code: + label: code + permalink: /code + description: Posts that include code + +performance: + label: performance + permalink: /performance + description: Performance measuring and tuning diff --git a/docusaurus.config.ts b/docusaurus.config.ts index e99a653..b01583d 100644 --- a/docusaurus.config.ts +++ b/docusaurus.config.ts @@ -74,7 +74,11 @@ const config: Config = { // src: 'img/logo.svg', // }, items: [ - { href: "https://rust-gpu.github.io/rust-gpu/book/", label: "Docs", position: "right" }, + { + href: "https://rust-gpu.github.io/rust-gpu/book/", + label: "Docs", + position: "right", + }, { to: "/blog", label: "Blog", position: "right" }, { to: "/ecosystem", label: "Ecosystem", position: "right" }, { to: "/changelog", label: "Changelog", position: "right" }, @@ -89,6 +93,14 @@ const config: Config = { theme: githubDark, darkTheme: githubDark, defaultLanguage: "rust", + additionalLanguages: ["wgsl"], + magicComments: [ + { + className: "theme-code-line", + line: "highlight-next-line", + block: { start: "highlight-start", end: "highlight-end" }, + }, + ], }, colorMode: { disableSwitch: true, diff --git a/package.json b/package.json index bb9fcd4..dfb8581 100644 --- a/package.json +++ b/package.json @@ -21,9 +21,12 @@ "@mdx-js/react": "^3.0.0", "clsx": "^2.0.0", "prism-react-renderer": "^2.3.0", + "raw-loader": "^4.0.2", "react": "^18.0.0", + "react-compare-slider": "^3.1.0", "react-dom": "^18.0.0", - "react-icons": "5.3.0" + "react-icons": "5.3.0", + "yaml": "^2.6.0" }, "devDependencies": { "@docusaurus/module-type-aliases": "3.5.2", diff --git a/src/components/CodeCompare/index.tsx b/src/components/CodeCompare/index.tsx new file mode 100644 index 0000000..90985c3 --- /dev/null +++ b/src/components/CodeCompare/index.tsx @@ -0,0 +1,48 @@ +import React from "react"; +import { ReactCompareSlider } from "react-compare-slider"; + +interface CodeCompareProps { + leftContent: React.ReactNode; // React component or JSX for the left side + rightContent: React.ReactNode; // React component or JSX for the right side + leftHeader?: React.ReactNode; // Optional React component or JSX for the left header + rightHeader?: React.ReactNode; // Optional React component or JSX for the right header +} + +const CodeCompare: React.FC = ({ + leftContent, + rightContent, + leftHeader, + rightHeader, +}) => { + return ( + + {leftHeader && ( +
+ {leftHeader} +
+ )} +
+ {leftContent} +
+ + } + itemTwo={ +
+ {rightHeader && ( +
+ {rightHeader} +
+ )} +
+ {rightContent} +
+
+ } + className="h-full w-full" + /> + ); +}; + +export default CodeCompare; diff --git a/src/components/Snippet/index.tsx b/src/components/Snippet/index.tsx new file mode 100644 index 0000000..3cc64a7 --- /dev/null +++ b/src/components/Snippet/index.tsx @@ -0,0 +1,74 @@ +import React from "react"; +import CodeBlock from "@theme/CodeBlock"; + +interface SnippetProps extends React.ComponentProps { + /** + * A metadata string for specifying lines to include, e.g., "1-3,5,8-10". + */ + lines?: string; + omitted_placeholder?: string; + strip_leading_spaces?: boolean; +} + +/** + * A component for rendering a snippet of code, optionally filtering lines, + * showing ellipses for omissions, and stripping all leading spaces. + */ +const Snippet: React.FC = ({ + children, + lines, + omitted_placeholder = "...", + strip_leading_spaces = false, + ...props +}) => { + if (typeof children !== "string") { + console.error( + "Snippet expects children to be a string containing the file content." + ); + return null; + } + + // Parse the `linesToInclude` metadata string into an array of line numbers. + const parseLineRanges = (metaString?: string): number[] => { + if (!metaString) return []; + return metaString.split(",").flatMap((range) => { + const [start, end] = range.split("-").map(Number); + if (!end) return [start]; // Single line + return Array.from({ length: end - start + 1 }, (_, i) => start + i); // Range + }); + }; + + const includedLines = parseLineRanges(lines); + + // Extract the lines to include and insert "..." for omissions. + const formatContent = (content: string, lines: number[]): string => { + const allLines = content.split("\n"); + if (lines.length === 0) return content; // If no specific lines are specified, return full content. + + const includedContent: string[] = []; + lines.forEach((line, index) => { + if (index > 0 && lines[index - 1] < line - 1) { + includedContent.push(omitted_placeholder); // Add placeholder for omitted lines + } + + const rawLine = allLines[line - 1] || ""; + const formattedLine = strip_leading_spaces + ? rawLine.trimStart() + : rawLine; + includedContent.push(formattedLine); + }); + + // Add placeholder if lines at the end are omitted + if (lines[lines.length - 1] < allLines.length) { + includedContent.push(omitted_placeholder); + } + + return includedContent.join("\n"); + }; + + const formattedContent = formatContent(children, includedLines); + + return {formattedContent}; +}; + +export default Snippet; diff --git a/src/css/custom.css b/src/css/custom.css index 4ce2a2b..1bfca2b 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -130,6 +130,10 @@ article header > h2 > a { @apply text-xl lg:text-3xl !important; } + + article code { + @apply text-xs !important; + } } @layer lists { diff --git a/yarn.lock b/yarn.lock index 9f122da..e203e33 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6991,6 +6991,14 @@ raw-body@2.5.2: iconv-lite "0.4.24" unpipe "1.0.0" +raw-loader@^4.0.2: + version "4.0.2" + resolved "https://registry.yarnpkg.com/raw-loader/-/raw-loader-4.0.2.tgz#1aac6b7d1ad1501e66efdac1522c73e59a584eb6" + integrity sha512-ZnScIV3ag9A4wPX/ZayxL/jZH+euYb6FcUinPcgiQW0+UBtEv0O6Q3lGd3cqJ+GHH+rksEv3Pj99oxJ3u3VIKA== + dependencies: + loader-utils "^2.0.0" + schema-utils "^3.0.0" + rc@1.2.8: version "1.2.8" resolved "https://registry.yarnpkg.com/rc/-/rc-1.2.8.tgz#cd924bf5200a075b83c188cd6b9e211b7fc0d3ed" @@ -7001,6 +7009,11 @@ rc@1.2.8: minimist "^1.2.0" strip-json-comments "~2.0.1" +react-compare-slider@^3.1.0: + version "3.1.0" + resolved "https://registry.yarnpkg.com/react-compare-slider/-/react-compare-slider-3.1.0.tgz#a4cf1c4db2203d894a7f05b76310f468db2d0b8f" + integrity sha512-TQVbZYmYyTIeKRmQciVXCmUwHjTThQTON7GfWfzMAOInRRG9tCiQnVXnCUd5DJ5l3Hngh4IEzOb9TG82gjoEhQ== + react-dev-utils@^12.0.1: version "12.0.1" resolved "https://registry.yarnpkg.com/react-dev-utils/-/react-dev-utils-12.0.1.tgz#ba92edb4a1f379bd46ccd6bcd4e7bc398df33e73" @@ -8609,6 +8622,11 @@ yaml@^2.3.4: resolved "https://registry.yarnpkg.com/yaml/-/yaml-2.5.1.tgz#c9772aacf62cb7494a95b0c4f1fb065b563db130" integrity sha512-bLQOjaX/ADgQ20isPJRvF0iRUHIxVhYvr53Of7wGcWlO2jvtUlH5m87DsmulFVxRpNLOnI4tB6p/oh8D7kpn9Q== +yaml@^2.6.0: + version "2.6.0" + resolved "https://registry.yarnpkg.com/yaml/-/yaml-2.6.0.tgz#14059ad9d0b1680d0f04d3a60fe00f3a857303c3" + integrity sha512-a6ae//JvKDEra2kdi1qzCyrJW/WZCgFi8ydDV+eXExl95t+5R+ijnqHJbz9tmMh8FUjx3iv2fCQ4dclAQlO2UQ== + yocto-queue@^0.1.0: version "0.1.0" resolved "https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-0.1.0.tgz#0294eb3dee05028d31ee1a5fa2c556a6aaf10a1b"