Skip to content

Commit f884993

Browse files
authored
[SYCL] Support per-object file compilation (#7595)
This change adds per-object file compilation support for SYCL, also called non-relocatable device code mode. This is already supported in clang for HIP and CUDA. It adds a new option, -f[no-]sycl-rdc. The default is -fsycl-rdc, which compiles code as it does today. Passing -fno-sycl-rdc activates the new mode. The new option is just an alias for the existing flag used by AMD/CUDA, -f[no-]gpu-rdc. The main implication is that we no longer link all device code together into one big module before post-link. Instead, we execute all jobs after device linking on a per-object-file basis. This means sycl-post-link and the later jobs execute multiple times, since we no longer have one big module. This can result in large improvements in compiler run time and memory usage: we see a maximum memory usage reduction for QUDA with -g from over 250GB to 4GB, and a large compiler run-time improvement as well. Error cases: 1) Cross-object dependencies. Since we don't link device code together, each object file must be independent. I added a Sema diagnostic that errors out if the user passes this flag and has cross-object dependencies. 2) Invalid architecture in fat object. We currently warn gracefully about this; in per-object-file mode llvm-foreach throws an error customers won't understand, so we error out in that case instead of warning. Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
1 parent 33306d4 commit f884993

File tree

22 files changed

+607
-184
lines changed

22 files changed

+607
-184
lines changed

clang/include/clang/Basic/DiagnosticDriverKinds.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,8 @@ def warn_drv_sycl_offload_target_duplicate : Warning<
352352
def warn_drv_sycl_target_missing : Warning<
353353
"linked binaries do not contain expected '%0' target; found targets: '%1'">,
354354
InGroup<SyclTarget>;
355+
def err_drv_no_rdc_sycl_target_missing : Error<
356+
"linked binaries do not contain expected '%0' target; found targets: '%1', this is not supported with '-fno-sycl-rdc'">;
355357
def err_drv_multiple_target_with_forced_target : Error<
356358
"multiple target usage with '%0' is not supported with '%1'">;
357359
def err_drv_failed_to_deduce_target_from_arch : Error<

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11790,6 +11790,9 @@ def err_sycl_restrict : Error<
1179011790
def err_sycl_external_global : Error<
1179111791
"invalid reference to 'device_global' variable; external 'device_global'"
1179211792
" variable must be marked with SYCL_EXTERNAL macro">;
11793+
def err_sycl_external_no_rdc : Error<
11794+
"invalid %select{declaration|definition}0 of SYCL_EXTERNAL function in non-relocatable "
11795+
"device code mode">;
1179311796
def warn_sycl_kernel_too_big_args : Warning<
1179411797
"size of kernel arguments (%0 bytes) may exceed the supported maximum "
1179511798
"of %1 bytes on some devices">, InGroup<SyclStrict>, ShowInSystemHeader;

clang/include/clang/Driver/Action.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "clang/Driver/Util.h"
1515
#include "llvm/ADT/ArrayRef.h"
1616
#include "llvm/ADT/STLExtras.h"
17+
#include "llvm/ADT/SetVector.h"
1718
#include "llvm/ADT/SmallVector.h"
1819
#include "llvm/ADT/StringRef.h"
1920
#include "llvm/ADT/iterator_range.h"
@@ -817,7 +818,8 @@ class FileTableTformJobAction : public JobAction {
817818
REPLACE,
818819
REPLACE_CELL,
819820
RENAME,
820-
COPY_SINGLE_FILE
821+
COPY_SINGLE_FILE,
822+
MERGE
821823
};
822824

823825
Tform() = default;
@@ -855,6 +857,10 @@ class FileTableTformJobAction : public JobAction {
855857
// output file.
856858
void addCopySingleFileTform(StringRef ColumnName, int Row);
857859

860+
// Merges all tables from the files listed in column <ColumnName> into a
861+
// single output table.
862+
void addMergeTform(StringRef ColumnName);
863+
858864
static bool classof(const Action *A) {
859865
return A->getKind() == FileTableTformJobClass;
860866
}
@@ -937,6 +943,14 @@ class ForEachWrappingAction : public Action {
937943
static bool classof(const Action *A) {
938944
return A->getKind() == ForEachWrappingClass;
939945
}
946+
947+
void addSerialAction(const Action *A) { SerialActions.insert(A); }
948+
const llvm::SmallSetVector<const Action *, 2> &getSerialActions() const {
949+
return SerialActions;
950+
}
951+
952+
private:
953+
llvm::SmallSetVector<const Action *, 2> SerialActions;
940954
};
941955

942956
} // namespace driver

clang/include/clang/Driver/Options.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,8 +1005,9 @@ def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-
10051005
Alias<fno_gpu_flush_denormals_to_zero>;
10061006
defm gpu_rdc : BoolFOption<"gpu-rdc",
10071007
LangOpts<"GPURelocatableDeviceCode">, DefaultFalse,
1008-
PosFlag<SetTrue, [CC1Option], "Generate relocatable device code, also known as separate compilation mode">,
1009-
NegFlag<SetFalse>>;
1008+
PosFlag<SetTrue, [], "Generate relocatable device code, also known as separate compilation mode">,
1009+
NegFlag<SetFalse, []>,
1010+
BothFlags<[CC1Option]>>;
10101011
def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
10111012
def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
10121013
defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
@@ -2997,6 +2998,8 @@ def fsycl_max_parallel_jobs_EQ : Joined<["-"], "fsycl-max-parallel-link-jobs=">,
29972998
HelpText<"Experimental feature: Controls the maximum parallelism of actions performed "
29982999
"on SYCL device code post-link, i.e. the generation of SPIR-V device images "
29993000
"or AOT compilation of each device image.">;
3001+
def : Flag<["-"], "fsycl-rdc">, Alias<fgpu_rdc>;
3002+
def : Flag<["-"], "fno-sycl-rdc">, Alias<fno_gpu_rdc>;
30003003
def fsyntax_only : Flag<["-"], "fsyntax-only">,
30013004
Flags<[NoXarchOption,CoreOption,CC1Option,FC1Option,FlangOption]>, Group<Action_Group>,
30023005
HelpText<"Run the preprocessor, parser and semantic analysis stages">;

clang/lib/Driver/Action.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,10 @@ void FileTableTformJobAction::addCopySingleFileTform(StringRef ColumnName,
567567
Tform(Tform::COPY_SINGLE_FILE, {ColumnName, std::to_string(Row)}));
568568
}
569569

570+
void FileTableTformJobAction::addMergeTform(StringRef ColumnName) {
571+
Tforms.emplace_back(Tform(Tform::MERGE, {ColumnName}));
572+
}
573+
570574
void AppendFooterJobAction::anchor() {}
571575

572576
AppendFooterJobAction::AppendFooterJobAction(Action *Input, types::ID Type)

clang/lib/Driver/Driver.cpp

Lines changed: 290 additions & 146 deletions
Large diffs are not rendered by default.

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4782,7 +4782,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
47824782
options::OPT_no_offload_new_driver, false));
47834783

47844784
bool IsRDCMode =
4785-
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4785+
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, IsSYCL);
47864786
bool IsUsingLTO = D.isUsingLTO(IsDeviceOffloadAction);
47874787
auto LTOMode = D.getLTOMode(IsDeviceOffloadAction);
47884788
bool IsFPGASYCLOffloadDevice =
@@ -6920,9 +6920,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
69206920
options::OPT_fno_hip_kernel_arg_name);
69216921
}
69226922

6923-
if (IsCuda || IsHIP) {
6923+
if (IsCuda || IsHIP || IsSYCL) {
69246924
if (IsRDCMode)
69256925
CmdArgs.push_back("-fgpu-rdc");
6926+
else
6927+
CmdArgs.push_back("-fno-gpu-rdc");
6928+
}
6929+
if (IsCuda || IsHIP) {
69266930
if (Args.hasFlag(options::OPT_fgpu_defer_diag,
69276931
options::OPT_fno_gpu_defer_diag, false))
69286932
CmdArgs.push_back("-fgpu-defer-diag");
@@ -9770,6 +9774,13 @@ void FileTableTform::ConstructJob(Compilation &C, const JobAction &JA,
97709774
addArgs(CmdArgs, TCArgs, {Arg});
97719775
break;
97729776
}
9777+
case FileTableTformJobAction::Tform::MERGE: {
9778+
assert(Tf.TheArgs.size() == 1 && "column name expected");
9779+
SmallString<128> Arg("-merge=");
9780+
Arg += Tf.TheArgs[0];
9781+
addArgs(CmdArgs, TCArgs, {Arg});
9782+
break;
9783+
}
97739784
}
97749785
}
97759786

clang/lib/Driver/ToolChains/SYCL.cpp

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,11 @@ void SYCL::constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
127127
C.addCommand(std::move(Cmd));
128128
}
129129

130+
bool SYCL::shouldDoPerObjectFileLinking(const Compilation &C) {
131+
return !C.getArgs().hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
132+
/*default=*/true);
133+
}
134+
130135
// The list should match pre-built SYCL device library files located in
131136
// compiler package. Once we add or remove any SYCL device library files,
132137
// the list should be updated accordingly.
@@ -163,12 +168,25 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
163168
// an actual object/archive. Take that list and pass those to the linker
164169
// instead of the original object.
165170
if (JA.isDeviceOffloading(Action::OFK_SYCL)) {
166-
auto isSYCLDeviceLib = [&C, this](const InputInfo &II) {
171+
bool IsRDC = !shouldDoPerObjectFileLinking(C);
172+
auto isNoRDCDeviceCodeLink = [&](const InputInfo &II) {
173+
if (IsRDC)
174+
return false;
175+
if (II.getType() != clang::driver::types::TY_LLVM_BC)
176+
return false;
177+
if (InputFiles.size() != 2)
178+
return false;
179+
return &II == &InputFiles[1];
180+
};
181+
auto isSYCLDeviceLib = [&](const InputInfo &II) {
167182
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
168183
StringRef LibPostfix = ".o";
169184
if (HostTC->getTriple().isWindowsMSVCEnvironment() &&
170185
C.getDriver().IsCLMode())
171186
LibPostfix = ".obj";
187+
else if (isNoRDCDeviceCodeLink(II))
188+
LibPostfix = ".bc";
189+
172190
std::string FileName = this->getToolChain().getInputFilename(II);
173191
StringRef InputFilename = llvm::sys::path::filename(FileName);
174192
if (this->getToolChain().getTriple().isNVPTX()) {
@@ -183,9 +201,21 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
183201
!InputFilename.endswith(LibPostfix) || (InputFilename.count('-') < 2))
184202
return false;
185203
// Skip the prefix "libsycl-"
186-
StringRef PureLibName = InputFilename.substr(LibSyclPrefix.size());
204+
std::string PureLibName =
205+
InputFilename.substr(LibSyclPrefix.size()).str();
206+
if (isNoRDCDeviceCodeLink(II)) {
207+
// Skip the final - until the . because we linked all device libs into a
208+
// single BC in a previous action so we have a temp file name.
209+
auto FinalDashPos = PureLibName.find_last_of('-');
210+
auto DotPos = PureLibName.find_last_of('.');
211+
assert((FinalDashPos != std::string::npos &&
212+
DotPos != std::string::npos) &&
213+
"Unexpected filename");
214+
PureLibName =
215+
PureLibName.substr(0, FinalDashPos) + PureLibName.substr(DotPos);
216+
}
187217
for (const auto &L : SYCLDeviceLibList) {
188-
if (PureLibName.startswith(L))
218+
if (StringRef(PureLibName).startswith(L))
189219
return true;
190220
}
191221
return false;
@@ -203,8 +233,17 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
203233
for (const auto &II : InputFiles) {
204234
std::string FileName = getToolChain().getInputFilename(II);
205235
if (II.getType() == types::TY_Tempfilelist) {
206-
// Pass the unbundled list with '@' to be processed.
207-
Libs.push_back(C.getArgs().MakeArgString("@" + FileName));
236+
if (IsRDC) {
237+
// Pass the unbundled list with '@' to be processed.
238+
Libs.push_back(C.getArgs().MakeArgString("@" + FileName));
239+
} else {
240+
assert(InputFiles.size() == 2 &&
241+
"Unexpected inputs for no-RDC with temp file list");
242+
// If we're in no-RDC mode and the input is a temp file list,
243+
// we want to link multiple object files each against device libs,
244+
// so we should consider this input as an object and not pass '@'.
245+
Objs.push_back(C.getArgs().MakeArgString(FileName));
246+
}
208247
} else if (II.getType() == types::TY_Archive && !LinkSYCLDeviceLibs) {
209248
Libs.push_back(C.getArgs().MakeArgString(FileName));
210249
} else

clang/lib/Driver/ToolChains/SYCL.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ void constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
3838
const InputInfo &Output, const Tool *T,
3939
StringRef Increment, StringRef Ext = "out",
4040
StringRef ParallelJobs = "");
41-
41+
bool shouldDoPerObjectFileLinking(const Compilation &C);
4242
// Runs llvm-spirv to convert spirv to bc, llvm-link, which links multiple LLVM
4343
// bitcode. Converts generated bc back to spirv using llvm-spirv, wraps with
4444
// offloading information. Finally compiles to object using llc

clang/lib/Frontend/CompilerInvocation.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3613,6 +3613,9 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,
36133613

36143614
if (!Opts.RandstructSeed.empty())
36153615
GenerateArg(Args, OPT_frandomize_layout_seed_EQ, Opts.RandstructSeed, SA);
3616+
3617+
if (!Opts.GPURelocatableDeviceCode)
3618+
GenerateArg(Args, OPT_fno_gpu_rdc, SA);
36163619
}
36173620

36183621
bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
@@ -4208,6 +4211,11 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
42084211
Diags.Report(diag::err_drv_hlsl_unsupported_target) << T.str();
42094212
}
42104213

4214+
// GPURelocatableDeviceCode should be true for SYCL if not specified.
4215+
if (Args.hasArg(OPT_fsycl_is_device) || Args.hasArg(OPT_fsycl_is_host))
4216+
Opts.GPURelocatableDeviceCode = Args.hasFlag(
4217+
options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, /*default=*/true);
4218+
42114219
return Diags.getNumErrors() == NumErrorsBefore;
42124220
}
42134221

0 commit comments

Comments
 (0)