mshahneo
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/IntelGpuXe2.h renamed to mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
Lines changed: 26 additions & 25 deletions b/mlir/include/mlir/Dialect/XeGPU/Utils/IntelGpuXe2.h renamed to mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
Lines changed: 26 additions & 25 deletions
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/uArch.h renamed to mlir/include/mlir/Dialect/XeGPU/uArch/uArch.h
Lines changed: 68 additions & 59 deletions b/mlir/include/mlir/Dialect/XeGPU/Utils/uArch.h renamed to mlir/include/mlir/Dialect/XeGPU/uArch/uArch.h
Lines changed: 68 additions & 59 deletions
diff --git a/mlir/lib/Dialect/XeGPU/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
@@ -11,10 +11,10 @@
 ///
 //
 //===----------------------------------------------------------------------===//
-#ifndef MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_PVC_H
-#define MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_PVC_H
+#ifndef MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H
+#define MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H
 
-#include "mlir/Dialect/XeGPU/Utils/uArch.h"
+#include "mlir/Dialect/XeGPU/uArch/uArch.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/TypeUtilities.h"
 #include <map>
@@ -26,14 +26,14 @@ namespace xegpu {
 namespace uArch {
 namespace Xe2Plus {
 struct XeCoreInfo {
-  uint num_threads;
+  uint32_t num_threads;
   SharedMemory shared_memory;
-  uint num_vector_units;
-  uint num_matrix_units;
+  uint32_t num_vector_units;
+  uint32_t num_matrix_units;
 
   // Constructor
-  XeCoreInfo(uint num_threads, const SharedMemory &shared_memory,
-             uint num_vector_units, uint num_matrix_units)
+  XeCoreInfo(uint32_t num_threads, const SharedMemory &shared_memory,
+             uint32_t num_vector_units, uint32_t num_matrix_units)
       : num_threads(num_threads), shared_memory(shared_memory),
         num_vector_units(num_vector_units), num_matrix_units(num_matrix_units) {
   }
@@ -58,7 +58,7 @@ struct DPASInstruction : public Instruction, public MatrixOpInterface {
   // Range systolic_depth;
   // Range repreat_count;
   // Range execution_size;
-  // std::map<std::string, uint> ops_per_channel;
+  // std::map<std::string, uint32_t> ops_per_channel;
   // std::vector<std::vector<std::string>> supported_types;
   // std::map<std::string, std::map<std::string, std::vector<std::string>>>
   //     matrix_size;
@@ -80,34 +80,34 @@ struct DPASInstruction : public Instruction, public MatrixOpInterface {
   virtual bool checkSupportedMMATypes(mlir::Type AType, mlir::Type BType,
                                       mlir::Type CType,
                                       mlir::Type DType) override;
-  virtual std::vector<uint> getSupportedM(mlir::Type type) override;
-  virtual std::vector<uint> getSupportedK(mlir::Type type) override;
-  virtual std::vector<uint> getSupportedN(mlir::Type type) override;
+  virtual std::vector<uint32_t> getSupportedM(mlir::Type type) override;
+  virtual std::vector<uint32_t> getSupportedK(mlir::Type type) override;
+  virtual std::vector<uint32_t> getSupportedN(mlir::Type type) override;
   virtual std::vector<std::pair<unsigned, unsigned>>
   getSupportedMatrix(mlir::Type type, MatrixType matrixType) override;
 };
 
 struct LoadStore2DTileInfo : public RangeTile {
-  std::vector<uint> array_len;
+  std::vector<uint32_t> array_len;
 };
 
 // struct to represent Load2D/Store2D/Prefetch instruction
 struct LoadStorePrefetch2DInstruction : public Instruction {
   MemoryType memory_type;
   MemoryAccessType memory_access_type;
   //   std::vector<std::string> supported_types;
-  std::vector<uint> supported_types_bitwidth;
-  std::map<std::string, uint> alignment;
+  std::vector<uint32_t> supported_types_bitwidth;
+  std::map<std::string, uint32_t> alignment;
   LoadStore2DTileInfo supported_tile_sizes;
-  uint min_surface_pitch;
+  uint32_t min_surface_pitch;
 
   // Validate Array length restriction on a given tile
-  bool validateArrayLenRestriction(Tile tile, uint array_len,
+  bool validateArrayLenRestriction(Tile tile, uint32_t array_len,
                                    mlir::Type dataType) {
 
-    Restriction<Tile, uint, mlir::Type> width_array_len_restriction(
+    Restriction<Tile, uint32_t, mlir::Type> width_array_len_restriction(
         tile, array_len, dataType,
-        [](Tile tile, uint array_len, mlir::Type dataType) {
+        [](Tile tile, uint32_t array_len, mlir::Type dataType) {
           assert(tile.no_of_dims == 2);
           return tile.dims[1] * array_len *
                      (dataType.getIntOrFloatBitWidth() / 8) <=
@@ -118,9 +118,9 @@ struct LoadStorePrefetch2DInstruction : public Instruction {
 
   // Validate Surface Pitch restriction on a given tile
   bool validateSurfacePitchRestriction(Tile tile,
-                                       uint surfacePitch /*in bytes*/) {
-    Restriction<Tile, uint> surface_pitch_restriction(
-        tile, surfacePitch, [](Tile tile, uint surfacePitch) {
+                                       uint32_t surfacePitch /*in bytes*/) {
+    Restriction<Tile, uint32_t> surface_pitch_restriction(
+        tile, surfacePitch, [](Tile tile, uint32_t surfacePitch) {
           assert(tile.no_of_dims == 2);
           return surfacePitch >= 64;
         });
@@ -149,13 +149,14 @@ struct PVCuArch : public Xe2Plus {
     this->uArch_hierarchy.push_back(uArchHierarchyComponent("gpu", 2));
     // Intialize register file info
     // GRF
-    this->register_file_info["GRF"] =
+    this->register_file_info.emplace(
+        "GRF",
         RegisterFileInfo(64 * 1024,          // size in bits
                          {"small", "large"}, // GRF modes
                          {128, 256},         // registers per thread per mode
                          0,                  // number of banks
                          0                   // bank size
-        );
+                         ));
     // Initialize cache info
     // L1 cache, XeCore level
     this->cache_info.push_back(
@@ -221,4 +222,4 @@ struct BMGuArch : public Xe2Plus {
 } // namespace xegpu
 } // namespace mlir
 
-#endif // MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_PVC_H
+#endif // MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H
@@ -14,12 +14,16 @@
 #ifndef MLIR_DIALECT_XEGPU_UTILS_UARCH_H
 #define MLIR_DIALECT_XEGPU_UTILS_UARCH_H
 
+#include <any>
 #include <functional>
 #include <iostream>
+#include <map>
 #include <mutex>
 #include <shared_mutex>
 #include <tuple>
 
+#include "mlir/IR/Types.h"
+
 namespace mlir {
 namespace xegpu {
 namespace uArch {
@@ -37,8 +41,8 @@ struct Range {
 //   dim: [2, 2]
 // This represents a 2x2 tile
 struct Tile {
-  uint no_of_dims;
-  std::vector<uint> dims;
+  uint32_t no_of_dims;
+  std::vector<uint32_t> dims;
 };
 
 // RangeTile represents a range of tiles instead of a single tile
@@ -52,7 +56,7 @@ struct Tile {
 // This represents a 2x2 RangeTile where the first dimension can have values
 // from 1 to 32 and the second dimension can have values from 2 to 16
 struct RangeTile {
-  uint no_of_dims;
+  uint32_t no_of_dims;
   std::vector<Range> dims;
 };
 
@@ -68,8 +72,8 @@ struct RangeTile {
 // This represents a 2x2 DiscreteTile where the first dimension can have values
 // 1, 2, 4, 8, 16, 32 and the second dimension can have values 2, 4, 8, 16
 struct DiscreteTile {
-  uint no_of_dims;
-  std::vector<std::vector<uint>> dims;
+  uint32_t no_of_dims;
+  std::vector<std::vector<uint32_t>> dims;
 };
 
 // Restriction struct
@@ -93,9 +97,9 @@ struct DiscreteTile {
 template <typename... Args>
 struct Restriction {
   std::tuple<Args...> data;
-  std::function<void(Args...)> func;
+  std::function<bool(Args...)> func;
 
-  Restriction(Args... args, std::function<void(Args...)> f)
+  Restriction(Args... args, std::function<bool(Args...)> f)
       : data(args...), func(f) {}
 
   bool validate() { return std::apply(func, data); }
@@ -107,9 +111,9 @@ struct uArchHierarchyComponent {
   std::string name = ""; // optional name of the hierarchy component
   // no. of lower hierarchy component it contains, e.g., for PVC XeCore it
   // contains 8 threads, so no_of_component=8
-  uint no_of_component;
+  uint32_t no_of_component;
   // Constructor
-  uArchHierarchyComponent(const std::string &name, uint no_of_component)
+  uArchHierarchyComponent(const std::string &name, uint32_t no_of_component)
       : name(name), no_of_component(no_of_component) {}
 };
 
@@ -203,35 +207,37 @@ struct Instruction {
 
 // A struct to represent register file information
 struct RegisterFileInfo {
-  uint size;                     // size per register in bits
+  uint32_t size = 0;             // size per register in bits
   std::vector<std::string> mode; // e.g., "small", "large" GRF modes
-  std::vector<uint>
+  std::vector<uint32_t>
       num_regs_per_thread_per_mode; // number of registers per thread per mode
-  uint num_banks;
-  uint bank_size;
+  uint32_t num_banks = 0; // zero unless set by the full constructor
+  uint32_t bank_size = 0; // zero unless set by the full constructor
 
   // Constructor
-  RegisterFileInfo(uint size, const std::vector<std::string> &mode,
-                   const std::vector<uint> &numRegs, uint num_banks,
-                   uint bank_size)
+  RegisterFileInfo() = default; // scalars take their = 0 member initializers
+  RegisterFileInfo(uint32_t size, const std::vector<std::string> &mode,
+                   const std::vector<uint32_t> &numRegs, uint32_t num_banks,
+                   uint32_t bank_size)
       : size(size), mode(mode), num_regs_per_thread_per_mode(numRegs),
         num_banks(num_banks), bank_size(bank_size) {}
 };
 
 // A struct to represent cache information
 struct CacheInfo {
-  uint size;
-  uint line_size;
+  uint32_t size;
+  uint32_t line_size;
   // At which component level the cache is shared
   uArchHierarchyComponent component;
-  // uint associativity;
-  // uint num_banks;
-  // uint bank_size;
-  // uint num_ports;
-  // uint port_width;
-  // uint bank_conflicts;
+  // uint32_t associativity;
+  // uint32_t num_banks;
+  // uint32_t bank_size;
+  // uint32_t num_ports;
+  // uint32_t port_width;
+  // uint32_t bank_conflicts;
   // Constructor
-  CacheInfo(uint size, uint line_size, const uArchHierarchyComponent &component)
+  CacheInfo(uint32_t size, uint32_t line_size,
+            const uArchHierarchyComponent &component)
       : size(size), line_size(line_size), component(component) {}
 };
 
@@ -274,6 +280,7 @@ struct uArch {
   std::vector<Restriction<> *> restrictions;
 
   // Constructor
+  uArch() = default;
   uArch(const std::string &name, const std::string &description,
         const std::vector<uArchHierarchyComponent> &uArch_hierarchy = {},
         const std::map<std::string, RegisterFileInfo> &register_file_info = {},
@@ -287,48 +294,49 @@ struct uArch {
 
 // A struct to represent shared memory information
 struct SharedMemory {
-  uint size;      // in bytes
-  uint alignment; // in bytes
+  uint32_t size;      // in bytes
+  uint32_t alignment; // in bytes
   // @TODO: Add more fields as needed
-  // uint latency;
-  // uint throughput;
-  // uint bandwidth;
-  // uint num_ports;
-  // uint port_width;
-  // uint bank_size;
-  // uint bank_conflicts;
-  // uint num_banks;
+  // uint32_t latency;
+  // uint32_t throughput;
+  // uint32_t bandwidth;
+  // uint32_t num_ports;
+  // uint32_t port_width;
+  // uint32_t bank_size;
+  // uint32_t bank_conflicts;
+  // uint32_t num_banks;
 
   // Constructor
-  SharedMemory(uint size, uint alignment) : size(size), alignment(alignment) {}
+  SharedMemory(uint32_t size, uint32_t alignment)
+      : size(size), alignment(alignment) {}
 };
 
 // For future use case in Xe4+
 
 // struct EUInfo {
-//     uint num_eu_threads;
+//     uint32_t num_eu_threads;
 //     SharedMemory shared_memory;
 // };
 
-//     uint num_simd_units;
-//     uint num_spus;
-//     uint num_smt;
-//     uint num_hardware_threads;
-//     uint num_threads_per_spu;
-//     uint num_threads_per_simd_unit;
-//     uint num_threads_per_hardware_thread;
-//     uint num_threads_per_smt;
+//     uint32_t num_simd_units;
+//     uint32_t num_spus;
+//     uint32_t num_smt;
+//     uint32_t num_hardware_threads;
+//     uint32_t num_threads_per_spu;
+//     uint32_t num_threads_per_simd_unit;
+//     uint32_t num_threads_per_hardware_thread;
+//     uint32_t num_threads_per_smt;
 //     SharedMemory shared_memory;
 // };
 
 // A struct to represent a GPU uArch
 // This struct is used to represent the GPU microarchitecture of a target device
 // struct GPUuArch : public uArch {
-//     uint num_compute_units;
-//     uint num_vector_units;
-//     uint num_scalar_units;
-//     uint num_tensor_units;
-//     uint num_matrix_units;
+//     uint32_t num_compute_units;
+//     uint32_t num_vector_units;
+//     uint32_t num_scalar_units;
+//     uint32_t num_tensor_units;
+//     uint32_t num_matrix_units;
 //     SharedMemory shared_memory;
 // };
 
@@ -346,17 +354,17 @@ struct TileOpInterface {
   // @param surface_pitch, suface pitch
   // @param array_len, array length
   virtual bool validate(Tile tile, Tile surface, mlir::Type dataType,
-                        uint surface_pitch, uint array_len = 1) = 0;
+                        uint32_t surface_pitch, uint32_t array_len = 1) = 0;
   virtual ~TileOpInterface() = default;
 };
 
 enum class MatrixType { MatrixA, MatrixB, MatrixC, MatrixD };
 struct MatrixOpInterface {
   virtual bool checkSupportedMMATypes(mlir::Type AType, mlir::Type BType,
                                       mlir::Type CType, mlir::Type DType) = 0;
-  virtual std::vector<uint> getSupportedM(mlir::Type type) = 0;
-  virtual std::vector<uint> getSupportedK(mlir::Type type) = 0;
-  virtual std::vector<uint> getSupportedN(mlir::Type type) = 0;
+  virtual std::vector<uint32_t> getSupportedM(mlir::Type type) = 0;
+  virtual std::vector<uint32_t> getSupportedK(mlir::Type type) = 0;
+  virtual std::vector<uint32_t> getSupportedN(mlir::Type type) = 0;
   virtual std::vector<std::pair<unsigned, unsigned>>
   getSupportedMatrix(mlir::Type type, MatrixType matrixType) = 0;
 
@@ -373,13 +381,14 @@ struct uArchMap {
 
   // Insert or update a key-value pair
   void insert(const std::string &key, uArch value) {
-    std::unique_lock lock(mutex_);
-    map_[key] = value;
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    // Overwrite existing keys like map_[key] = value; emplace would not.
+    map_.insert_or_assign(key, value);
   }
 
   // Get a value by key (concurrent safe read)
   std::optional<uArch> get(const std::string &key) const {
-    std::shared_lock lock(mutex_);
+    std::shared_lock<std::shared_mutex> lock(mutex_);
     auto it = map_.find(key);
     if (it != map_.end())
       return it->second;
@@ -388,13 +397,13 @@ struct uArchMap {
 
   // Check if a key exists
   bool contains(const std::string &key) const {
-    std::shared_lock lock(mutex_);
+    std::shared_lock<std::shared_mutex> lock(mutex_);
     return map_.find(key) != map_.end();
   }
 
   // Remove a key
   bool erase(const std::string &key) {
-    std::unique_lock lock(mutex_);
+    std::unique_lock<std::shared_mutex> lock(mutex_);
     return map_.erase(key) > 0;
   }
 
 
@@ -1,3 +1,4 @@
 add_subdirectory(IR)
 add_subdirectory(Transforms)
+add_subdirectory(uArch)
 add_subdirectory(Utils)