-
Notifications
You must be signed in to change notification settings - Fork 2
[XeGPU] uArch definition (PR 1/N) #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
988bf66
f853e5a
44ea3a4
6c45d97
ffd9d69
af7098b
6a61603
9d4dfca
78677f4
b05de85
f4e3372
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Am I correct that this file is supposed to replace current https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potentially, yes. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
//===--- uArch.h ---------------------------------------*- C++ -*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
/// \file | ||
/// Xe2+ uArch definitions (PVC and BMG). | ||
/// | ||
// | ||
//===----------------------------------------------------------------------===// | ||
#ifndef MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H | ||
#define MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H | ||
|
||
#include "mlir/Dialect/XeGPU/uArch/uArch.h" | ||
#include "mlir/IR/BuiltinTypes.h" | ||
#include "mlir/IR/TypeUtilities.h" | ||
#include <map> | ||
#include <string> | ||
#include <vector> | ||
|
||
namespace mlir { | ||
namespace xegpu { | ||
namespace uArch { | ||
namespace Xe2Plus { | ||
struct XeCoreInfo { | ||
uint32_t num_threads; | ||
SharedMemory shared_memory; | ||
uint32_t num_vector_units; | ||
uint32_t num_matrix_units; | ||
|
||
// Constructor | ||
XeCoreInfo(uint32_t num_threads, const SharedMemory &shared_memory, | ||
uint32_t num_vector_units, uint32_t num_matrix_units) | ||
: num_threads(num_threads), shared_memory(shared_memory), | ||
num_vector_units(num_vector_units), num_matrix_units(num_matrix_units) { | ||
} | ||
}; | ||
|
||
struct Xe2Plus : public uArch { | ||
XeCoreInfo xe_core; | ||
Xe2Plus( | ||
const std::string &archName, const std::string &archDescription, | ||
const XeCoreInfo &xeCore, | ||
const std::vector<uArchHierarchyComponent> &hierarchy = {}, | ||
const std::map<std::string, RegisterFileInfo> ®Info = {}, | ||
const std::vector<CacheInfo> &cacheInfo = {}, | ||
const std::map<std::string, std::shared_ptr<Instruction>> &instrs = {}, | ||
const std::vector<Restriction<> *> &restrs = {}) | ||
: uArch(archName, archDescription, hierarchy, regInfo, cacheInfo, instrs, | ||
restrs), | ||
xe_core(xeCore) {} | ||
}; | ||
|
||
// struct to represent DPAS instruction | ||
struct DPASInstruction : public Instruction, public MatrixOpInterface { | ||
// Range systolic_depth; | ||
// Range repreat_count; | ||
// Range execution_size; | ||
// std::map<std::string, uint32_t> ops_per_channel; | ||
// std::vector<std::vector<std::string>> supported_types; | ||
// std::map<std::string, std::map<std::string, std::vector<std::string>>> | ||
// matrix_size; | ||
|
||
// bool checkSupportedDPASTypes(mlir::Type dstType, mlir::Type src0Type, | ||
// mlir::Type src1Type, mlir::Type src2Type); | ||
|
||
DPASInstruction() | ||
: Instruction("dpas", // name | ||
"Dot Product Accumulate", // description | ||
"0xABCD", // opcode | ||
FunctionalUnit::Matrix, // functional_unit | ||
InstructionType::SIMD, // type | ||
InstructionScope::Subgroup, // scope | ||
UnitOfComputation::Matrix) // unit_of_computation | ||
{} | ||
|
||
// Override all virtuals from MatrixOpInterface | ||
virtual bool checkSupportedMMATypes(mlir::Type AType, mlir::Type BType, | ||
mlir::Type CType, | ||
mlir::Type DType) override; | ||
virtual std::vector<uint32_t> getSupportedM(mlir::Type type) override; | ||
virtual std::vector<uint32_t> getSupportedK(mlir::Type type) override; | ||
virtual std::vector<uint32_t> getSupportedN(mlir::Type type) override; | ||
virtual std::vector<std::pair<unsigned, unsigned>> | ||
getSupportedMatrix(mlir::Type type, MatrixType matrixType) override; | ||
}; | ||
|
||
/// A `RangeTile` extended with a list of array lengths.
/// NOTE(review): presumably the array lengths supported by the 2D
/// load/store/prefetch instructions -- confirm against users of this type.
struct LoadStore2DTileInfo : RangeTile { // struct bases are public by default
  std::vector<uint32_t> array_len;
};
|
||
// struct to represent Load2D/Store2D/Prefetch instruction | ||
struct LoadStorePrefetch2DInstruction : public Instruction { | ||
MemoryType memory_type; | ||
MemoryAccessType memory_access_type; | ||
// std::vector<std::string> supported_types; | ||
std::vector<uint32_t> supported_types_bitwidth; | ||
std::map<std::string, uint32_t> alignment; | ||
LoadStore2DTileInfo supported_tile_sizes; | ||
uint32_t min_surface_pitch; | ||
|
||
// Validate Array length restriction on a given tile | ||
bool validateArrayLenRestriction(Tile tile, uint32_t array_len, | ||
mlir::Type dataType) { | ||
|
||
Restriction<Tile, uint32_t, mlir::Type> width_array_len_restriction( | ||
tile, array_len, dataType, | ||
[](Tile tile, uint32_t array_len, mlir::Type dataType) { | ||
assert(tile.no_of_dims == 2); | ||
return tile.dims[1] * array_len * | ||
(dataType.getIntOrFloatBitWidth() / 8) <= | ||
64; | ||
}); | ||
return width_array_len_restriction.validate(); | ||
} | ||
|
||
// Validate Surface Pitch restriction on a given tile | ||
bool validateSurfacePitchRestriction(Tile tile, | ||
uint32_t surfacePitch /*in bytes*/) { | ||
Restriction<Tile, uint32_t> surface_pitch_restriction( | ||
tile, surfacePitch, [](Tile tile, uint32_t surfacePitch) { | ||
assert(tile.no_of_dims == 2); | ||
return surfacePitch >= 64; | ||
}); | ||
return surface_pitch_restriction.validate(); | ||
} | ||
}; | ||
|
||
namespace PVCuArch { | ||
struct PVCuArch : public Xe2Plus { | ||
// Maintaines ownership of the instructions owned by PVUarch | ||
std::vector<std::shared_ptr<Instruction>> owned_instructions; | ||
PVCuArch() | ||
: Xe2Plus("pvc", // archName | ||
"Ponte Vecchio Architecture", // archDescription | ||
XeCoreInfo(8, SharedMemory(512 * 1024, 4), 8, 8), // xeCore | ||
{/* register_file_info */}, // Optional: empty | ||
{/* cache_info */}, // Optional: empty | ||
{/* instructions */}, // Optional: empty | ||
{/* restrictions */} // Optional: empty | ||
) { | ||
// Initialize uArchHierarchy | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("thread", 0)); | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeCore", 8)); | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeSlice", 16)); | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeStack", 4)); | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("gpu", 2)); | ||
// Intialize register file info | ||
// GRF | ||
this->register_file_info.emplace( | ||
"GRF", | ||
RegisterFileInfo(64 * 1024, // size in bits | ||
{"small", "large"}, // GRF modes | ||
{128, 256}, // registers per thread per mode | ||
0, // number of banks | ||
0 // bank size | ||
)); | ||
// Initialize cache info | ||
// L1 cache, XeCore level | ||
this->cache_info.push_back( | ||
CacheInfo(512 * 1024, 64, this->uArch_hierarchy[1])); | ||
// L3 cache, XeStack level | ||
this->cache_info.push_back( | ||
CacheInfo(512 * 1024, 64, this->uArch_hierarchy[3])); | ||
|
||
// Add the instructions | ||
auto dpas = std::make_shared<DPASInstruction>(); | ||
instructions.emplace(dpas->name, dpas); | ||
// instructions[dpas->name] = dpas.get(); | ||
owned_instructions.push_back(dpas); | ||
} | ||
}; | ||
} // namespace PVCuArch | ||
|
||
namespace BMGuArch { | ||
struct BMGuArch : public Xe2Plus { | ||
// Maintaines ownership of the instructions owned by PVUarch | ||
std::vector<std::shared_ptr<Instruction>> owned_instructions; | ||
BMGuArch() | ||
: Xe2Plus("bmg", // archName | ||
"Battlemage Architecture", // archDescription | ||
XeCoreInfo(8, SharedMemory(256 * 1024, 4), 8, 8), // xeCore | ||
{/* register_file_info */}, // Optional: empty | ||
{/* cache_info */}, // Optional: empty | ||
{/* instructions */}, // Optional: empty | ||
{/* restrictions */} // Optional: empty | ||
) { | ||
// Initialize uArchHierarchy | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("thread", 0)); | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeCore", 8)); | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeSlice", 4)); | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeStack", 5)); | ||
this->uArch_hierarchy.push_back(uArchHierarchyComponent("gpu", 1)); | ||
// Intialize register file info | ||
// GRF | ||
this->register_file_info["GRF"] = | ||
RegisterFileInfo(64 * 1024, // size in bits | ||
{"small", "large"}, // GRF modes | ||
{128, 256}, // registers per thread per mode | ||
0, // number of banks | ||
0 // bank size | ||
); | ||
// Initialize cache info | ||
// L1 cache, XeCore level | ||
this->cache_info.push_back( | ||
CacheInfo(256 * 1024, 64, this->uArch_hierarchy[1])); | ||
// L3 cache, XeStack level | ||
this->cache_info.push_back( | ||
CacheInfo(18 * 1024 * 1024, 256, this->uArch_hierarchy[3])); | ||
|
||
// Add the instructions | ||
auto dpas = std::make_shared<DPASInstruction>(); | ||
instructions.emplace(dpas->name, dpas); | ||
// instructions[dpas->name] = dpas.get(); | ||
owned_instructions.push_back(dpas); | ||
} | ||
}; | ||
} // namespace BMGuArch | ||
|
||
} // namespace Xe2Plus | ||
} // namespace uArch | ||
} // namespace xegpu | ||
} // namespace mlir | ||
|
||
#endif // MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to raise the following discussion points:
GPUModuleOp
, a finer granularity than ours. It may be a requirement of the GPU dialect. A potential benefit is that they may be able to support the case of a ModuleOp containing multiple GPUModuleOps, with each GPUModuleOp mapped to a different GPU from a different vendor.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sang Ik created this PR: llvm#147372. is it similar to the pass here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree, this appears similar to the xevm's attach target pass that is yet to be merged. After the merge, the
chip
string of the xevm target should be appropriate to query uarch info:
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do the XeVM targets have a sensible notion of an LLVM `triple` alongside that of `chip`? How about (LLVM) `features`? If so, it would probably make sense to have a `#xevm.target` attribute, à la the one proposed above, which implements the `LLVMTargetAttrInterface` from this WIP upstream PR: https://github.com/llvm/llvm-project/pull/145899/files#diff-6c2503d165a7390c955c3c4fa4a76fd1991633b5ec597a1b1fd92731e1f3684dR572
I am in the process of making the `#nvvm.target` and `#rocdl.target` attributes implement this interface (in addition to the generic `#llvm.target` attr in that PR). If sensible, the XeVM dialect should probably (try to) mirror this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given this in https://github.com/llvm/llvm-project/pull/147372/files#diff-3da7b8747032bb581f119fd49037d06706443c3c5db56d4d28cc74053a9b754dR284 in the Add xevm-attach-target transform pass PR, I take it there's a sensible triple.
I will have a look at making `xevm.target` also implement my interface.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks Chao, Rolf, Artem.
You are right: once the XeVM target attribute gets upstreamed, we should use it. This pass is here just to show the use case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍
On that note, I found that the `#xevm.target` attr itself already got merged: https://github.com/llvm/llvm-project/blob/b9d7513bf134febe72c05a04ff20f87191d7213a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td#L521
We can get cracking!